From 3d5b36642b32e4d1060115cab800d763db1bdeab Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Sun, 12 Feb 2023 18:48:28 +0000 Subject: [PATCH 001/454] Add empty SYCL variant --- CMakeLists.txt | 5 ++++- src/common/Executor.cpp | 4 ++++ src/common/KernelBase.cpp | 24 +++++++++++++++++++++++- src/common/KernelBase.hpp | 19 +++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 26 ++++++++++++++++++++++++++ src/common/RAJAPerfSuite.hpp | 4 ++++ 6 files changed, 80 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c7233236..88416a097 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if (PERFSUITE_ENABLE_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") endif() -if (ENABLE_KOKKOS) +if (ENABLE_KOKKOS OR ENABLE_SYCL) set(CMAKE_CXX_STANDARD 17) set(BLT_CXX_STD c++17) else() @@ -110,6 +110,9 @@ endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() +if (ENABLE_SYCL) + list(APPEND RAJA_PERFSUITE_DEPENDS sycl) +endif() # Kokkos requires hipcc as the CMAKE_CXX_COMPILER for HIP AMD/VEGA GPU # platforms, whereas RAJAPerf Suite uses blt/CMake FindHIP to set HIP compiler. diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 85af7dbcf..18911fbf2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -125,6 +125,10 @@ void Executor::setupSuite() getCout() << "\nSetting up suite based on input..." << endl; + #if defined(RAJA_ENABLE_SYCL) + KernelBase::qu = KernelBase::sycl_res.get().get_queue(); + #endif + using Slist = list; using Svector = vector; using COvector = vector; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 764770ca7..0168de5e8 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -10,6 +10,7 @@ #include "RunParams.hpp" +#include "RAJA/RAJA.hpp" #include #include @@ -135,6 +136,15 @@ void KernelBase::setVariantDefined(VariantID vid) #if defined(RUN_KOKKOS) setKokkosTuningDefinitions(vid); #endif + case Base_SYCL: + case Range_SYCL: + case RAJA_SYCL: + { +#if defined(RAJA_ENABLE_SYCL) + setSyclTuningDefinitions(vid); +#endif + break; + } break; } @@ -158,6 +168,10 @@ void KernelBase::execute(VariantID vid, size_t tune_idx) running_variant = vid; running_tuning = tune_idx; +#if defined(RAJA_ENABLE_SYCL) + ::RAJA::sycl::detail::setQueue(&sycl_res); +#endif + resetTimer(); resetDataInitCount(); @@ -252,7 +266,15 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx) runKokkosVariant(vid, tune_idx); #endif } - + case Base_SYCL: + case Range_SYCL: + case RAJA_SYCL: + { +#if defined(RAJA_ENABLE_SYCL) + runSyclVariant(vid, tune_idx); +#endif + break; + } default : { #if 0 getCout() << "\n " << getName() diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 885650336..6474a4c92 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -25,6 +25,11 @@ #if defined(RAJA_ENABLE_HIP) #include "RAJA/policy/hip/raja_hiperrchk.hpp" #endif +#if defined(RAJA_ENABLE_SYCL) +#include +#include "camp/resource.hpp" +#endif + #include #include @@ -94,6 +99,11 @@ class KernelBase virtual void setKokkosTuningDefinitions(VariantID vid) { addVariantTuningName(vid, getDefaultTuningName()); } #endif +#if defined(RAJA_ENABLE_SYCL) + virtual void setSyclTuningDefinitions(VariantID vid) + { addVariantTuningName(vid, getDefaultTuningName()); } +#endif + // // Getter methods used to generate kernel execution summary @@ -240,6 +250,15 @@ class KernelBase getCout() << "\n KernelBase: Unimplemented Kokkos variant id = " << 
vid << std::endl; } #endif +#if defined(RAJA_ENABLE_SYCL) + virtual void runSyclVariant(VariantID vid, size_t tune_idx) + { + getCout() << "\n KernelBase: Unimplemented Sycl variant id = " << vid << std::endl; + } + static cl::sycl::queue* qu; + static camp::resources::Resource sycl_res; +#endif + protected: const RunParams& run_params; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 27650cf56..52b68e2b3 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -280,6 +280,10 @@ static const std::string VariantNames [] = std::string("Kokkos_Lambda"), + std::string("Base_SYCL"), + std::string("Range_SYCL"), + std::string("RAJA_SYCL"), + std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -430,6 +434,14 @@ bool isVariantAvailable(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == Range_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -491,6 +503,14 @@ bool isVariantGPU(VariantID vid) } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( vid == Base_SYCL || + vid == Range_SYCL || + vid == RAJA_SYCL ) { + ret_val = true; + } +#endif + return ret_val; } @@ -813,6 +833,12 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } +#if defined(RAJA_ENABLE_SYCL) +sycl::context ctx; +camp::resources::Resource KernelBase::sycl_res{camp::resources::Sycl(ctx)}; +sycl::queue* KernelBase::qu; +#endif + // subclass of streambuf that ignores overflow // never printing anything to the underlying stream struct NullStream : std::streambuf, std::ostream diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e73bd9888..b60d9af3f 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -196,6 +196,10 @@ enum VariantID { Kokkos_Lambda, + Base_SYCL, + Range_SYCL, + RAJA_SYCL, + NumVariants // Keep this one last and NEVER comment out (!!) }; From 2819324379693d7dbad1583ac39f372c6cee4ed0 Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Mon, 13 Feb 2023 01:35:40 +0000 Subject: [PATCH 002/454] Add INIT3-Sycl Kernels --- scripts/alcf-builds/sycl.sh | 39 ++++++++++ src/basic/CMakeLists.txt | 1 + src/basic/INIT3-Sycl.cpp | 137 +++++++++++++++++++++++++++++++++++ src/basic/INIT3.cpp | 4 + src/basic/INIT3.hpp | 4 + src/common/KernelBase.hpp | 2 +- src/common/SyclDataUtils.hpp | 87 ++++++++++++++++++++++ 7 files changed, 273 insertions(+), 1 deletion(-) create mode 100755 scripts/alcf-builds/sycl.sh create mode 100644 src/basic/INIT3-Sycl.cpp create mode 100644 src/common/SyclDataUtils.hpp diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh new file mode 100755 index 000000000..748182edc --- /dev/null +++ b/scripts/alcf-builds/sycl.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +## +## Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read RAJAPerf/LICENSE. 
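+##
+## [Note: a usage sketch for this script, assuming an oneAPI/icpx environment
+##  is already loaded; run it from the RAJAPerf source root:
+##
+##    ./scripts/alcf-builds/sycl.sh
+##
+##  It recreates build_sycl_${USER} (from BUILD_SUFFIX and USER below), then
+##  configures against the ALCF host-config and builds.]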
+## + + +BUILD_SUFFIX=sycl +: ${BUILD_TYPE:=RelWithDebInfo} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/alcf-builds/sycl.cmake + +rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null +mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ + -DENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DENABLE_SYCL=On \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_LINKER=icpx \ + "$@" \ + .. + +make -j 18 diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 3be6e0c3c..e2b712f50 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -44,6 +44,7 @@ blt_add_library( INIT3-Cuda.cpp INIT3-OMP.cpp INIT3-OMPTarget.cpp + INIT3-Sycl.cpp INIT_VIEW1D.cpp INIT_VIEW1D-Seq.cpp INIT_VIEW1D-Hip.cpp diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp new file mode 100644 index 000000000..587040d71 --- /dev/null +++ b/src/basic/INIT3-Sycl.cpp @@ -0,0 +1,137 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INIT3_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(out1, m_out1, iend, qu); \ + allocAndInitSyclDeviceData(out2, m_out2, iend, qu); \ + allocAndInitSyclDeviceData(out3, m_out3, iend, qu); \ + allocAndInitSyclDeviceData(in1, m_in1, iend, qu); \ + allocAndInitSyclDeviceData(in2, m_in2, iend, qu); + +#define INIT3_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_out1, out1, iend, qu); \ + getSyclDeviceData(m_out2, out2, iend, qu); \ + getSyclDeviceData(m_out3, out3, iend, qu); \ + deallocSyclDeviceData(out1, qu); \ + deallocSyclDeviceData(out2, qu); \ + deallocSyclDeviceData(out3, qu); \ + deallocSyclDeviceData(in1, qu); \ + deallocSyclDeviceData(in2, qu); + +template +void INIT3::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT3_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + INIT3_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT3_BODY + } + + }); + }); + + } + qu->wait(); + + stopTimer(); + + INIT3_DATA_TEARDOWN_SYCL; + + } else if ( vid == Range_SYCL ) { + + INIT3_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<1>(iend), + [=] (sycl::item<1> item ) { + + Index_type i = item.get_id(0); + INIT3_BODY + + }); + }); + + } + qu->wait(); + + stopTimer(); + + INIT3_DATA_TEARDOWN_SYCL; + + } else if ( vid 
== RAJA_SYCL ) {
+
+    INIT3_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+       RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+         RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+         INIT3_BODY;
+       });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    INIT3_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  INIT3 : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Sycl)
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp
index 130fbc3b4..4252bfba6 100644
--- a/src/basic/INIT3.cpp
+++ b/src/basic/INIT3.cpp
@@ -53,6 +53,10 @@ INIT3::INIT3(const RunParams& params)
   setVariantDefined( RAJA_HIP );
 
   setVariantDefined( Kokkos_Lambda );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( Range_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 INIT3::~INIT3()
diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp
index aed67bfeb..73a53524e 100644
--- a/src/basic/INIT3.hpp
+++ b/src/basic/INIT3.hpp
@@ -56,13 +56,17 @@ class INIT3 : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
   void runKokkosVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 6474a4c92..37a5d9ff0 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -255,7 +255,7 @@ class KernelBase
   {
     getCout() << "\n KernelBase: Unimplemented Sycl variant id = " << vid << std::endl;
   }
-  static cl::sycl::queue* qu;
+  static sycl::queue* qu;
   static camp::resources::Resource sycl_res;
 #endif
 
 protected:
   const RunParams& run_params;
diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp
new file mode 100644
index 000000000..a444a8733
--- /dev/null
+++ b/src/common/SyclDataUtils.hpp
@@ -0,0 +1,87 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Methods for SYCL kernel data allocation, initialization, and deallocation.
+///
+
+
+#ifndef RAJAPerf_SyclDataUtils_HPP
+#define RAJAPerf_SyclDataUtils_HPP
+
+#include "RPTypes.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/GPUUtils.hpp"
+
+#include <CL/sycl.hpp>
+
+
+namespace rajaperf
+{
+
+/*!
+ * \brief Copy given hptr (host) data to SYCL device (dptr).
+ *
+ * Method assumes both host and device data arrays are allocated
+ * and of proper size for copy operation to succeed.
+ */
+template <typename T>
+void initSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue* qu)
+{
+  auto e = qu->memcpy( dptr, hptr,
+                       len * sizeof(typename std::remove_pointer<T>::type));
+  e.wait();
+
+  incDataInitCount();
+}
+
+/*!
+ * \brief Allocate SYCL device data array (dptr) and copy given hptr (host)
+ *        data to device array.
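+ *
+ * [Note: these helpers wrap a plain SYCL 2020 USM round trip. A condensed
+ *  sketch of the same pattern, assuming a SYCL 2020 compiler; the names
+ *  here are illustrative and not part of the patch:
+ *
+ *    sycl::queue q;                                     // target device
+ *    double* dev = sycl::malloc_device<double>(n, q);   // alloc...
+ *    q.memcpy(dev, host, n * sizeof(double)).wait();    // ...and init
+ *    q.memcpy(host, dev, n * sizeof(double)).wait();    // copy results back
+ *    sycl::free(dev, q);                                // dealloc
+ *  ]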
+ */ +template +void allocAndInitSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue *qu) +{ + dptr = sycl::malloc_device::type>(len, *qu); + + initSyclDeviceData(dptr, hptr, len, qu); +} + +/*! + * \brief Copy given dptr (SYCL device) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void getSyclDeviceData(T& hptr, const T dptr, int len, sycl::queue *qu) +{ + auto e = qu->memcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer::type)); + e.wait(); +} + +/*! + * \brief Free device data array. + */ +template +void deallocSyclDeviceData(T& dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + + +} // closing brace for rajaperf namespace + +#endif // RAJA_ENABLE_SYCL + +#endif // closing endif for header file include guard + From 4c28e107eb9131cf27d45cd4e74b745c6c035c0a Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Tue, 21 Feb 2023 22:05:26 +0000 Subject: [PATCH 003/454] Add auto block size for 0 value --- src/basic/INIT3-Sycl.cpp | 103 +++++++++++++++++++---------------- src/basic/INIT3.cpp | 1 - src/common/GPUUtils.hpp | 6 +- src/common/KernelBase.cpp | 2 - src/common/RAJAPerfSuite.cpp | 3 - src/common/RAJAPerfSuite.hpp | 1 - 6 files changed, 60 insertions(+), 56 deletions(-) diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 587040d71..aa6294c6b 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -55,58 +55,65 @@ void INIT3::runSyclVariantImpl(VariantID vid) INIT3_DATA_SETUP; if ( vid == Base_SYCL ) { - - INIT3_DATA_SETUP_SYCL; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - [=] (sycl::nd_item<1> item ) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - INIT3_BODY - } - + if (work_group_size > 0) { + + INIT3_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT3_BODY + } + + }); }); - }); - - } - qu->wait(); - - stopTimer(); - - INIT3_DATA_TEARDOWN_SYCL; - - } else if ( vid == Range_SYCL ) { - - INIT3_DATA_SETUP_SYCL; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item ) { - - Index_type i = item.get_id(0); - INIT3_BODY - + + } + qu->wait(); + + stopTimer(); + + INIT3_DATA_TEARDOWN_SYCL; + + } else { + + INIT3_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<1>(iend), + [=] (sycl::item<1> item ) { + + Index_type i = item.get_id(0); + INIT3_BODY + + }); }); - }); + + } + qu->wait(); + + stopTimer(); + + INIT3_DATA_TEARDOWN_SYCL; + + } + } else if ( vid == RAJA_SYCL ) { + if ( work_group_size == 0 ) { + std::cout << "\n INIT3 : RAJA_SYCL does not support auto work group size" << std::endl; + return; } - qu->wait(); - - stopTimer(); - - INIT3_DATA_TEARDOWN_SYCL; - - } else if ( vid == RAJA_SYCL 
) { INIT3_DATA_SETUP_SYCL; diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 4252bfba6..bda4e577d 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -55,7 +55,6 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Kokkos_Lambda ); setVariantDefined( Base_SYCL ); - setVariantDefined( Range_SYCL ); setVariantDefined( RAJA_SYCL ); } diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index eceabcfea..0c6b3059e 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -189,7 +189,11 @@ inline void seq_for(camp::int_seq const&, Func&& func) seq_for(gpu_block_sizes_type{}, [&](auto block_size) { \ if (run_params.numValidGPUBlockSize() == 0u || \ run_params.validGPUBlockSize(block_size)) { \ - addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + if (block_size == 0u) { \ + addVariantTuningName(vid, "block_auto"); \ + } else { \ + addVariantTuningName(vid, "block_"+std::to_string(block_size)); \ + } \ } \ }); \ } diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 0168de5e8..2b0a8571a 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -137,7 +137,6 @@ void KernelBase::setVariantDefined(VariantID vid) setKokkosTuningDefinitions(vid); #endif case Base_SYCL: - case Range_SYCL: case RAJA_SYCL: { #if defined(RAJA_ENABLE_SYCL) @@ -267,7 +266,6 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx) #endif } case Base_SYCL: - case Range_SYCL: case RAJA_SYCL: { #if defined(RAJA_ENABLE_SYCL) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 52b68e2b3..bd97f5739 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -281,7 +281,6 @@ static const std::string VariantNames [] = std::string("Kokkos_Lambda"), std::string("Base_SYCL"), - std::string("Range_SYCL"), std::string("RAJA_SYCL"), std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... @@ -436,7 +435,6 @@ bool isVariantAvailable(VariantID vid) #if defined(RAJA_ENABLE_SYCL) if ( vid == Base_SYCL || - vid == Range_SYCL || vid == RAJA_SYCL ) { ret_val = true; } @@ -505,7 +503,6 @@ bool isVariantGPU(VariantID vid) #if defined(RAJA_ENABLE_SYCL) if ( vid == Base_SYCL || - vid == Range_SYCL || vid == RAJA_SYCL ) { ret_val = true; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index b60d9af3f..c64935a77 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -197,7 +197,6 @@ enum VariantID { Kokkos_Lambda, Base_SYCL, - Range_SYCL, RAJA_SYCL, NumVariants // Keep this one last and NEVER comment out (!!) 
From 839b74dd58e05e4ce0f70e144a97d88eeae876bd Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Wed, 12 Apr 2023 13:08:19 +0000 Subject: [PATCH 004/454] Add existing basic SYCL kernels --- scripts/alcf-builds/sycl.sh | 2 +- src/basic/CMakeLists.txt | 8 ++ src/basic/DAXPY-Sycl.cpp | 134 +++++++++++++++++ src/basic/DAXPY.cpp | 4 + src/basic/DAXPY.hpp | 4 + src/basic/IF_QUAD-Sycl.cpp | 141 ++++++++++++++++++ src/basic/IF_QUAD.cpp | 3 + src/basic/IF_QUAD.hpp | 4 + src/basic/INIT3-Sycl.cpp | 15 +- src/basic/INIT_VIEW1D-Sycl.cpp | 130 +++++++++++++++++ src/basic/INIT_VIEW1D.cpp | 4 + src/basic/INIT_VIEW1D.hpp | 4 + src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 129 +++++++++++++++++ src/basic/INIT_VIEW1D_OFFSET.cpp | 3 + src/basic/INIT_VIEW1D_OFFSET.hpp | 4 + src/basic/MULADDSUB-Sycl.cpp | 139 ++++++++++++++++++ src/basic/MULADDSUB.cpp | 3 + src/basic/MULADDSUB.hpp | 4 + src/basic/NESTED_INIT-Sycl.cpp | 150 +++++++++++++++++++ src/basic/NESTED_INIT.cpp | 3 + src/basic/NESTED_INIT.hpp | 4 + src/basic/REDUCE3_INT-Sycl.cpp | 199 ++++++++++++++++++++++++++ src/basic/REDUCE3_INT.cpp | 3 + src/basic/REDUCE3_INT.hpp | 4 + src/basic/TRAP_INT-Sycl.cpp | 169 ++++++++++++++++++++++ src/basic/TRAP_INT.cpp | 3 + src/basic/TRAP_INT.hpp | 4 + 27 files changed, 1263 insertions(+), 11 deletions(-) create mode 100644 src/basic/DAXPY-Sycl.cpp create mode 100644 src/basic/IF_QUAD-Sycl.cpp create mode 100644 src/basic/INIT_VIEW1D-Sycl.cpp create mode 100644 src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp create mode 100644 src/basic/MULADDSUB-Sycl.cpp create mode 100644 src/basic/NESTED_INIT-Sycl.cpp create mode 100644 src/basic/REDUCE3_INT-Sycl.cpp create mode 100644 src/basic/TRAP_INT-Sycl.cpp diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index 748182edc..c4421b08f 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -27,7 +27,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=Off \ -DENABLE_CUDA=Off \ - -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=0,64,128,256,512,1024 \ -DENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ -DENABLE_SYCL=On \ diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index e2b712f50..54a45b506 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -14,6 +14,7 @@ blt_add_library( DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp + DAXPY-Sycl.cpp DAXPY_ATOMIC.cpp DAXPY_ATOMIC-Seq.cpp DAXPY_ATOMIC-Hip.cpp @@ -26,6 +27,7 @@ blt_add_library( IF_QUAD-Cuda.cpp IF_QUAD-OMP.cpp IF_QUAD-OMPTarget.cpp + IF_QUAD-Sycl.cpp INDEXLIST.cpp INDEXLIST-Seq.cpp INDEXLIST-Hip.cpp @@ -51,12 +53,14 @@ blt_add_library( INIT_VIEW1D-Cuda.cpp INIT_VIEW1D-OMP.cpp INIT_VIEW1D-OMPTarget.cpp + INIT_VIEW1D-Sycl.cpp INIT_VIEW1D_OFFSET.cpp INIT_VIEW1D_OFFSET-Seq.cpp INIT_VIEW1D_OFFSET-Hip.cpp INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + INIT_VIEW1D_OFFSET-Sycl.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp @@ -69,12 +73,14 @@ blt_add_library( MULADDSUB-Cuda.cpp MULADDSUB-OMP.cpp MULADDSUB-OMPTarget.cpp + MULADDSUB-Sycl.cpp NESTED_INIT.cpp NESTED_INIT-Seq.cpp NESTED_INIT-Hip.cpp NESTED_INIT-Cuda.cpp NESTED_INIT-OMP.cpp NESTED_INIT-OMPTarget.cpp + NESTED_INIT-Sycl.cpp PI_ATOMIC.cpp PI_ATOMIC-Seq.cpp PI_ATOMIC-Hip.cpp @@ -93,6 +99,7 @@ blt_add_library( REDUCE3_INT-Cuda.cpp REDUCE3_INT-OMP.cpp REDUCE3_INT-OMPTarget.cpp + REDUCE3_INT-Sycl.cpp REDUCE_STRUCT.cpp REDUCE_STRUCT-Seq.cpp REDUCE_STRUCT-Hip.cpp @@ -105,5 +112,6 @@ blt_add_library( 
TRAP_INT-Cuda.cpp
   TRAP_INT-OMPTarget.cpp
   TRAP_INT-OMP.cpp
+  TRAP_INT-Sycl.cpp
   DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS}
   )
diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp
new file mode 100644
index 000000000..f436d966e
--- /dev/null
+++ b/src/basic/DAXPY-Sycl.cpp
@@ -0,0 +1,134 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "DAXPY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+#define DAXPY_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, iend, qu); \
+  allocAndInitSyclDeviceData(y, m_y, iend, qu);
+
+#define DAXPY_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_y, y, iend, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu);
+
+template < size_t work_group_size >
+void DAXPY::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  DAXPY_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+    if (work_group_size > 0) {
+
+      DAXPY_DATA_SETUP_SYCL;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<1>{global_size, work_group_size},
+                         [=] (sycl::nd_item<1> item ) {
+
+            Index_type i = item.get_global_id(0);
+            if (i < iend) {
+              DAXPY_BODY
+            }
+
+          });
+        });
+      }
+      qu->wait(); // Wait for computation to finish before stopping timer
+
+      stopTimer();
+
+      DAXPY_DATA_TEARDOWN_SYCL;
+    } else {
+
+      DAXPY_DATA_SETUP_SYCL;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::range<1>(iend),
+                         [=] (sycl::item<1> item) {
+
+            Index_type i = item.get_id(0);
+            DAXPY_BODY
+
+          });
+        });
+      }
+      qu->wait(); // Wait for computation to finish before stopping timer
+
+      stopTimer();
+
+      DAXPY_DATA_TEARDOWN_SYCL;
+    }
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    if ( work_group_size == 0 ) {
+      std::cout << "\n  DAXPY : RAJA_SYCL does not support auto work group size" << std::endl;
+      return;
+    }
+
+    DAXPY_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        DAXPY_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    DAXPY_DATA_TEARDOWN_SYCL;
+
+  } else {
+    std::cout << "\n  DAXPY : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Sycl)
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp
index f9f3c9d2d..76637a5a6 100644
--- a/src/basic/DAXPY.cpp
+++ b/src/basic/DAXPY.cpp
@@ -53,6 +53,10 @@ DAXPY::DAXPY(const RunParams& params)
   setVariantDefined( RAJA_HIP );
 
   setVariantDefined( Kokkos_Lambda );
+
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 DAXPY::~DAXPY()
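[Note: DAXPY-Sycl.cpp above shows the launch pattern every Base_SYCL variant
in this series follows: with a fixed work-group size, the global range is
rounded up to a work-group multiple, so the kernel body must be guarded
against the padded tail. A minimal, self-contained sketch of that pattern,
assuming a SYCL 2020 compiler; all names here are illustrative, not part of
the patch:

  #include <sycl/sycl.hpp>

  int main()
  {
    sycl::queue q;
    const size_t n = 1000, wg = 256;
    double* x = sycl::malloc_device<double>(n, q);
    double* y = sycl::malloc_device<double>(n, q);
    q.fill(x, 1.0, n).wait();
    q.fill(y, 0.0, n).wait();

    // round n up to a multiple of wg: 1000 -> 1024 work-items
    const size_t global = wg * ((n + wg - 1) / wg);

    q.parallel_for(sycl::nd_range<1>{global, wg}, [=](sycl::nd_item<1> it) {
      const size_t i = it.get_global_id(0);
      if (i < n) {            // guard: items 1000..1023 fall off the array
        y[i] += 2.0 * x[i];   // the DAXPY body
      }
    }).wait();

    sycl::free(x, q);
    sycl::free(y, q);
    return 0;
  }
]
diff --git 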
a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index bcaca8054..b3b4f341a 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -53,13 +53,17 @@ class DAXPY : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp new file mode 100644 index 000000000..166a85bf9 --- /dev/null +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -0,0 +1,141 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + +#define IF_QUAD_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(a, m_a, iend, qu); \ + allocAndInitSyclDeviceData(b, m_b, iend, qu); \ + allocAndInitSyclDeviceData(c, m_c, iend, qu); \ + allocAndInitSyclDeviceData(x1, m_x1, iend, qu); \ + allocAndInitSyclDeviceData(x2, m_x2, iend, qu); + +#define IF_QUAD_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_x1, x1, iend, qu); \ + getSyclDeviceData(m_x2, x2, iend, qu); \ + deallocSyclDeviceData(a, qu); \ + deallocSyclDeviceData(b, qu); \ + deallocSyclDeviceData(c, qu); \ + deallocSyclDeviceData(x1, qu); \ + deallocSyclDeviceData(x2, qu); + +template +void IF_QUAD::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + IF_QUAD_DATA_SETUP; + + if ( vid == Base_SYCL ) { + if (work_group_size > 0) { + + IF_QUAD_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + + if (i < iend) { + IF_QUAD_BODY + } + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_SYCL; + + } else { + + IF_QUAD_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<1>(iend), + [=] (sycl::item<1> item) { + + Index_type i = item.get_id(0); + IF_QUAD_BODY + + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_SYCL; + + } + + } else if ( vid == RAJA_SYCL ) { + + if 
( work_group_size == 0 ) { + std::cout << "\n IF_QUAD : RAJA_SYCL does not support auto work group size" << std::endl; + return; + } + + IF_QUAD_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + IF_QUAD_BODY; + }); + + } + qu->wait(); + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n IF_QUAD : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 57ef34f7a..92b3160b2 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -57,6 +57,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } IF_QUAD::~IF_QUAD() diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index f1f3e12a8..dfdeb6de4 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -70,13 +70,17 @@ class IF_QUAD : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index aa6294c6b..ab2e59686 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -55,10 +55,11 @@ void INIT3::runSyclVariantImpl(VariantID vid) INIT3_DATA_SETUP; if ( vid == Base_SYCL ) { + + INIT3_DATA_SETUP_SYCL; + if (work_group_size > 0) { - INIT3_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -78,15 +79,10 @@ void INIT3::runSyclVariantImpl(VariantID vid) } qu->wait(); - stopTimer(); - INIT3_DATA_TEARDOWN_SYCL; - } else { - INIT3_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -102,12 +98,11 @@ void INIT3::runSyclVariantImpl(VariantID vid) } qu->wait(); - stopTimer(); - INIT3_DATA_TEARDOWN_SYCL; - } + + INIT3_DATA_TEARDOWN_SYCL; } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp new file mode 100644 index 000000000..5ea1c5399 --- /dev/null +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -0,0 +1,130 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INIT_VIEW1D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace basic
+{
+
+#define INIT_VIEW1D_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(a, m_a, iend, qu);
+
+#define INIT_VIEW1D_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_a, a, iend, qu); \
+  deallocSyclDeviceData(a, qu);
+
+template < size_t work_group_size >
+void INIT_VIEW1D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  INIT_VIEW1D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    INIT_VIEW1D_DATA_SETUP_SYCL;
+
+    if (work_group_size > 0) {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+        qu->submit([&] (sycl::handler& h) {
+
+          h.parallel_for(sycl::nd_range<1>{global_size, work_group_size},
+                         [=] (sycl::nd_item<1> item ) {
+
+            Index_type i = item.get_global_id(0);
+            if (i < iend) {
+              INIT_VIEW1D_BODY
+            }
+          });
+        });
+      }
+      qu->wait();
+      stopTimer();
+
+    } else {
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::range<1>(iend),
+                         [=] (sycl::item<1> item) {
+
+            Index_type i = item.get_id(0);
+            INIT_VIEW1D_BODY
+
+          });
+        });
+      }
+      qu->wait();
+      stopTimer();
+
+    }
+
+    INIT_VIEW1D_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    if ( work_group_size == 0 ) {
+      std::cout << "\n  INIT_VIEW1D : RAJA_SYCL does not support auto work group size" << std::endl;
+      return;
+    }
+
+    INIT_VIEW1D_DATA_SETUP_SYCL;
+
+    INIT_VIEW1D_VIEW_RAJA;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        INIT_VIEW1D_BODY_RAJA;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    INIT_VIEW1D_DATA_TEARDOWN_SYCL;
+
+  } else {
+    std::cout << "\n  INIT_VIEW1D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Sycl)
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp
index dd52b057e..b4cbe96ae 100644
--- a/src/basic/INIT_VIEW1D.cpp
+++ b/src/basic/INIT_VIEW1D.cpp
@@ -54,6 +54,10 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params)
   setVariantDefined( RAJA_HIP );
 
   setVariantDefined( Kokkos_Lambda );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
 }
 
 INIT_VIEW1D::~INIT_VIEW1D()
diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp
index f3770f69a..dd94811b3 100644
--- a/src/basic/INIT_VIEW1D.hpp
+++ b/src/basic/INIT_VIEW1D.hpp
@@ -67,13 +67,17 @@ class INIT_VIEW1D : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
   void runKokkosVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runSyclVariantImpl(VariantID vid);
private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp new file mode 100644 index 000000000..e832ceb48 --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -0,0 +1,129 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(a, m_a, iend, qu); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_a, a, iend, qu); \ + deallocSyclDeviceData(a, qu); + +template +void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL; + + if (work_group_size > 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + INIT_VIEW1D_OFFSET_BODY + } + + }); + }); + + } + qu->wait(); + stopTimer(); + + } else { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<1>(iend), + [=] (sycl::item<1> item ) { + + Index_type i = item.get_id(0); + INIT_VIEW1D_OFFSET_BODY + + }); + }); + + } + qu->wait(); + stopTimer(); + + } + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL; + } else if ( vid == RAJA_SYCL ) { + + if ( work_group_size == 0 ) { + std::cout << "\n INIT_VIEW1D_OFFSET : RAJA_SYCL does not support auto work group size" << std::endl; + return; + } + + INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }); + + } + qu->wait(); + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 2dcda4f4a..0a8e73f8a 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -54,6 +54,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } 
INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index d32f59c7b..510f00654 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -66,13 +66,17 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp new file mode 100644 index 000000000..fdce13fb3 --- /dev/null +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -0,0 +1,139 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define MULADDSUB_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(out1, m_out1, iend, qu); \ + allocAndInitSyclDeviceData(out2, m_out2, iend, qu); \ + allocAndInitSyclDeviceData(out3, m_out3, iend, qu); \ + allocAndInitSyclDeviceData(in1, m_in1, iend, qu); \ + allocAndInitSyclDeviceData(in2, m_in2, iend, qu); + +#define MULADDSUB_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_out1, out1, iend, qu); \ + getSyclDeviceData(m_out2, out2, iend, qu); \ + getSyclDeviceData(m_out3, out3, iend, qu); \ + deallocSyclDeviceData(out1, qu); \ + deallocSyclDeviceData(out2, qu); \ + deallocSyclDeviceData(out3, qu); \ + deallocSyclDeviceData(in1, qu); \ + deallocSyclDeviceData(in2, qu); + +template +void MULADDSUB::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + MULADDSUB_DATA_SETUP_SYCL; + + if (work_group_size > 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + MULADDSUB_BODY + } + + }); + }); + + } + qu->wait(); + stopTimer(); + + } else { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<1>(iend), + [=] (sycl::item<1> item ) { + + Index_type i = item.get_id(0); + MULADDSUB_BODY + + }); + }); + + } + qu->wait(); + stopTimer(); + + } + + 
MULADDSUB_DATA_TEARDOWN_SYCL; + } else if ( vid == RAJA_SYCL ) { + + if ( work_group_size == 0 ) { + std::cout << "\n MULADDSUB : RAJA_SYCL does not support auto work group size" << std::endl; + return; + } + + MULADDSUB_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + MULADDSUB_BODY; + }); + + } + qu->wait(); + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n MULADDSUB : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 1d5f1bfcc..b6ae7a706 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -53,6 +53,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } MULADDSUB::~MULADDSUB() diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index e604a34c8..b52a7388d 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -59,13 +59,17 @@ class MULADDSUB : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp new file mode 100644 index 000000000..c999d76ea --- /dev/null +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
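+//
+// [Note: NESTED_INIT is the first 3-D launch in this series. A condensed
+//  sketch of the index mapping the kernels below rely on, assuming SYCL's
+//  convention that the last dimension varies fastest:
+//
+//    sycl::range<3> r(nk, nj, ni);            // dims ordered k, j, i
+//    q.parallel_for(r, [=](sycl::item<3> it) {
+//      auto k = it.get_id(0);                 // slowest-varying
+//      auto j = it.get_id(1);
+//      auto i = it.get_id(2);                 // fastest, unit-stride axis
+//    });
+//  ]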
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define NESTED_INIT_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(array, m_array, m_array_length, qu); + +#define NESTED_INIT_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_array, array, m_array_length, qu); \ + deallocSyclDeviceData(array, qu); + +template +void NESTED_INIT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + NESTED_INIT_DATA_SETUP_SYCL; + + if (work_group_size > 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(ni, work_group_size); + + qu->submit([&] (cl::sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( + sycl::range<3> (nk, nj, global_size), + sycl::range<3> (1, 1, work_group_size)), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(2); + Index_type j = item.get_global_id(1); + Index_type k = item.get_global_id(0); + + if (i < ni) { + NESTED_INIT_BODY + } + }); + }); + + } + qu->wait(); + stopTimer(); + + } else { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::range<3> (nk, nj, ni), + [=] (sycl::item<3> item) { + + Index_type i = item.get_id(2); + Index_type j = item.get_id(1); + Index_type k = item.get_id(0); + + NESTED_INIT_BODY + + }); + }); + + } + qu->wait(); + stopTimer(); + + } + + NESTED_INIT_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + if ( work_group_size == 0 ) { + std::cout << "\n NESTED_INIT : RAJA_SYCL does not support auto work group size" << std::endl; + return; + } + + NESTED_INIT_DATA_SETUP_SYCL; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<2, RAJA::sycl_global_2<1>, // k + RAJA::statement::For<1, RAJA::sycl_global_1<1>, // j + RAJA::statement::For<0, RAJA::sycl_global_0, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=] (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + qu->wait(); + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n NESTED_INIT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 4b9183245..6fb40a6af 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -64,6 +64,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } NESTED_INIT::~NESTED_INIT() diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index ccaf7079e..31fe95bf6 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -59,13 +59,17 @@ class NESTED_INIT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void 
runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp new file mode 100644 index 000000000..bc3fa55c6 --- /dev/null +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -0,0 +1,199 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define REDUCE3_INT_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(vec, m_vec, iend, qu); \ + Int_ptr hsum; \ + allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu); \ + Int_ptr hmin; \ + allocAndInitSyclDeviceData(hmin, &m_vmin_init, 1, qu); \ + Int_ptr hmax; \ + allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu); + +#define REDUCE3_INT_DATA_TEARDOWN_SYCL \ + deallocSyclDeviceData(vec, qu); + +template +void REDUCE3_INT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE3_INT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + REDUCE3_INT_DATA_SETUP_SYCL; + + + if (work_group_size > 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(hsum, &m_vsum_init, 1, qu); + initSyclDeviceData(hmin, &m_vmin_init, 1, qu); + initSyclDeviceData(hmax, &m_vmax_init, 1, qu); + + qu->submit([&] (sycl::handler& h) { + + auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); + auto min_reduction = sycl::reduction(hmin, sycl::minimum<>()); + auto max_reduction = sycl::reduction(hmax, sycl::maximum<>()); + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, min_reduction, max_reduction, + [=] (sycl::nd_item<1> item, auto& vsum, auto& vmin, auto& vmax) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + // REDUCE3_INT_BODY + vsum += vec[i]; + vmin.combine(vec[i]); + vmax.combine(vec[i]); + } + + }); + }); + + Int_type lsum; + Int_ptr plsum = &lsum; + getSyclDeviceData(plsum, hsum, 1, qu); + m_vsum += lsum; + + Int_type lmin; + Int_ptr plmin = &lmin; + getSyclDeviceData(plmin, hmin, 1, qu); + m_vmin = RAJA_MIN(m_vmin, lmin); + + Int_type lmax; + Int_ptr plmax = &lmax; + getSyclDeviceData(plmax, hmax, 1, qu); + m_vmax = RAJA_MAX(m_vmax, lmax); + + } + qu->wait(); + stopTimer(); + + } else { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + 
initSyclDeviceData(hsum, &m_vsum_init, 1, qu); + initSyclDeviceData(hmin, &m_vmin_init, 1, qu); + initSyclDeviceData(hmax, &m_vmax_init, 1, qu); + + + qu->submit([&] (sycl::handler& h) { + + auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); + auto min_reduction = sycl::reduction(hmin, sycl::minimum<>()); + auto max_reduction = sycl::reduction(hmax, sycl::maximum<>()); + + h.parallel_for(sycl::range<1>(iend), + sum_reduction, min_reduction, max_reduction, + [=] (sycl::item<1> item, auto& vsum, auto& vmin, auto& vmax ) { + + Index_type i = item.get_id(0); + vsum += vec[i]; + vmin.combine(vec[i]); + vmax.combine(vec[i]); + + }); + }); + + Int_type lsum; + Int_ptr plsum = &lsum; + getSyclDeviceData(plsum, hsum, 1, qu); + m_vsum += lsum; + + Int_type lmin; + Int_ptr plmin = &lmin; + getSyclDeviceData(plmin, hmin, 1, qu); + m_vmin = RAJA_MIN(m_vmin, lmin); + + Int_type lmax; + Int_ptr plmax = &lmax; + getSyclDeviceData(plmax, hmax, 1, qu); + m_vmax = RAJA_MAX(m_vmax, lmax); + + } + qu->wait(); + stopTimer(); + + } + + REDUCE3_INT_DATA_TEARDOWN_SYCL; + } else if ( vid == RAJA_SYCL ) { + + if ( work_group_size == 0 ) { + std::cout << "\n REDUCE3_INT : RAJA_SYCL does not support auto work group size" << std::endl; + return; + } + + REDUCE3_INT_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + qu->wait(); + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n REDUCE3_INT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index d5cf9f4c2..4ef28bcf6 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -58,6 +58,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } REDUCE3_INT::~REDUCE3_INT() diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index e82c2cf05..ba1ecb2fc 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -71,13 +71,17 @@ class REDUCE3_INT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp new file mode 100644 index 000000000..8e401b419 --- /dev/null +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -0,0 +1,169 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRAP_INT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+//
+// Function used in TRAP_INT loop.
+//
+RAJA_INLINE
+RAJA_DEVICE
+Real_type trap_int_func(Real_type x,
+                        Real_type y,
+                        Real_type xp,
+                        Real_type yp)
+{
+   Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
+   denom = 1.0/sqrt(denom);
+   return denom;
+}
+
+#define TRAP_INT_DATA_SETUP_SYCL // nothing to do here...
+
+#define TRAP_INT_DATA_TEARDOWN_SYCL // nothing to do here...
+
+template < size_t work_group_size >
+void TRAP_INT::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  TRAP_INT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    TRAP_INT_DATA_SETUP_SYCL;
+
+    if (work_group_size > 0) {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        Real_ptr sumx;
+        allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu);
+
+        const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+        qu->submit([&] (sycl::handler& hdl) {
+
+          auto sum_reduction = sycl::reduction(sumx, sycl::plus<>());
+
+          hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                           sum_reduction,
+                           [=] (sycl::nd_item<1> item, auto& sumx) {
+
+            Index_type i = item.get_global_id(0);
+            if (i < iend) {
+              TRAP_INT_BODY
+            }
+
+          });
+        });
+
+        Real_type lsumx;
+        Real_ptr plsumx = &lsumx;
+        getSyclDeviceData(plsumx, sumx, 1, qu);
+        m_sumx += lsumx * h;
+
+        deallocSyclDeviceData(sumx, qu);
+
+      }
+      qu->wait();
+      stopTimer();
+
+    } else {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        Real_ptr sumx;
+        allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu);
+
+        qu->submit([&] (sycl::handler& hdl) {
+
+          auto sum_reduction = sycl::reduction(sumx, sycl::plus<>());
+
+          hdl.parallel_for(sycl::range<1>(iend),
+                           sum_reduction,
+                           [=] (sycl::item<1> item, auto& sumx ) {
+
+            Index_type i = item.get_id(0);
+            TRAP_INT_BODY
+
+          });
+        });
+
+        Real_type lsumx;
+        Real_ptr plsumx = &lsumx;
+        getSyclDeviceData(plsumx, sumx, 1, qu);
+        m_sumx += lsumx * h;
+
+        deallocSyclDeviceData(sumx, qu);
+
+      }
+      qu->wait();
+      stopTimer();
+
+    }
+
+    TRAP_INT_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    if ( work_group_size == 0 ) {
+      std::cout << "\n  TRAP_INT : RAJA_SYCL does not support auto work group size" << std::endl;
+      return;
+    }
+
+    TRAP_INT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::ReduceSum<RAJA::sycl_reduce, Real_type> sumx(m_sumx_init);
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, false> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRAP_INT_BODY;
+      });
+
+      m_sumx += static_cast<Real_type>(sumx.get()) * h;
+
+    }
+    qu->wait();
+    stopTimer();
+
+    TRAP_INT_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  TRAP_INT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Sycl)
+
+}  // end namespace basic
+}  // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git
a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index eaac3ffda..cf83ba82f 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -53,6 +53,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } TRAP_INT::~TRAP_INT() diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index e64932dbe..8a3a29024 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -68,13 +68,17 @@ class TRAP_INT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From f7db9dc791d22440836bcd912a39d98d0bd602d1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 31 May 2023 14:03:50 -0700 Subject: [PATCH 005/454] Move HALOEXCHANGE shared code into base --- src/apps/CMakeLists.txt | 1 + src/apps/HALOEXCHANGE.cpp | 403 +-------------------- src/apps/HALOEXCHANGE.hpp | 69 +--- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 10 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 10 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 14 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 10 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 10 +- src/apps/HALOEXCHANGE_FUSED.cpp | 403 +-------------------- src/apps/HALOEXCHANGE_FUSED.hpp | 76 +--- src/apps/HALOEXCHANGE_base.cpp | 410 ++++++++++++++++++++++ src/apps/HALOEXCHANGE_base.hpp | 152 ++++++++ 12 files changed, 600 insertions(+), 968 deletions(-) create mode 100644 src/apps/HALOEXCHANGE_base.cpp create mode 100644 src/apps/HALOEXCHANGE_base.hpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 7c1973810..76fea9cfc 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -39,6 +39,7 @@ blt_add_library( FIR-Cuda.cpp FIR-OMP.cpp FIR-OMPTarget.cpp + HALOEXCHANGE_base.cpp HALOEXCHANGE.cpp HALOEXCHANGE-Seq.cpp HALOEXCHANGE-Hip.cpp diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 58534da21..2b3bfd7ec 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -10,8 +10,6 @@ #include "RAJA/RAJA.hpp" -#include "common/DataUtils.hpp" - #include namespace rajaperf @@ -20,44 +18,8 @@ namespace apps { HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE, params) + : HALOEXCHANGE_base(rajaperf::Apps_HALOEXCHANGE, params) { - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - 
m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - setUsesFeature(Forall); setVariantDefined( Base_Seq ); @@ -82,368 +44,5 @@ HALOEXCHANGE::~HALOEXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. 
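The construction being removed here (and re-added unchanged in HALOEXCHANGE_base.cpp at the end of this patch) enumerates, for each of the 26 neighbors (6 faces, 12 edges, 8 corners), a box of cells given by an Extent and stores each cell's flattened offset ii + jj*j_stride + kk*k_stride. A standalone sketch of that enumeration for a single face extent (grid and halo values are illustrative):

  #include <cstdio>
  #include <vector>

  struct Extent { int i_min, i_max, j_min, j_max, k_min, k_max; };

  int main() {
    const int grid[3] = {4, 4, 4}, halo = 1;
    const int j_stride = grid[0] + 2 * halo;              // row stride
    const int k_stride = j_stride * (grid[1] + 2 * halo); // plane stride

    // Face 0 pack box: first interior plane in i, full interior in j and k.
    Extent e{halo, halo + halo, halo, grid[1] + halo, halo, grid[2] + halo};

    std::vector<int> list;
    for (int kk = e.k_min; kk < e.k_max; ++kk)
      for (int jj = e.j_min; jj < e.j_max; ++jj)
        for (int ii = e.i_min; ii < e.i_max; ++ii)
          list.push_back(ii + jj * j_stride + kk * k_stride);

    // length = (i_max-i_min)*(j_max-j_min)*(k_max-k_min) = 1*4*4 = 16
    std::printf("%zu indices, first = %d\n", list.size(), list.front());
  }

The length product in the comment is exactly what the real code stores in pack_index_list_lengths[l] before filling the list.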
-// -void HALOEXCHANGE::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], 
grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. 
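The unpack extents built below mirror the pack extents shifted into the halo shell: face 0 packs the interior plane i in [halo, 2*halo) but unpacks into i in [0, halo), so each pack/unpack pair covers boxes of identical shape and the list lengths match pairwise. As a concrete check with the default 100^3 grid and halo width 1: each face list holds 100*100 = 10,000 entries, each edge list 100, each corner list 1, giving 6*10,000 + 12*100 + 8*1 = 61,208 = 102^3 - 100^3 halo cells per variable; with 3 variables that is the m_num_vars * (m_var_size - getActualProblemSize()) count the constructor passes to setItsPerRep.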
-// -void HALOEXCHANGE::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - 
unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. 
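Every variant of this kernel ultimately specializes one gather/scatter pair: packing gathers variable values through an index list into a contiguous message buffer, and unpacking scatters a received buffer back into halo cells. A sketch with concrete types (the suite uses the Real_ptr/Int_ptr typedefs and wraps the loop bodies in the HALOEXCHANGE_PACK_BODY and HALOEXCHANGE_UNPACK_BODY macros kept in the headers below):

  #include <cstddef>

  // Gather: fill a contiguous buffer from scattered grid cells.
  void pack(double* buffer, const int* list, const double* var,
            std::size_t len) {
    for (std::size_t i = 0; i < len; ++i)
      buffer[i] = var[list[i]];        // HALOEXCHANGE_PACK_BODY
  }

  // Scatter: write a received buffer back into this side's halo cells.
  void unpack(const double* buffer, const int* list, double* var,
              std::size_t len) {
    for (std::size_t i = 0; i < len; ++i)
      var[list[i]] = buffer[i];        // HALOEXCHANGE_UNPACK_BODY
  }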
-// -void HALOEXCHANGE::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 1f21d9616..1d5be416f 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -45,38 +45,16 @@ #ifndef RAJAPerf_Apps_HALOEXCHANGE_HPP #define RAJAPerf_Apps_HALOEXCHANGE_HPP -#define HALOEXCHANGE_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ - Index_type num_vars = m_num_vars; \ - std::vector pack_index_lists = m_pack_index_lists; \ - std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector unpack_index_lists = m_unpack_index_lists; \ - std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; - -#define HALOEXCHANGE_PACK_BODY \ - buffer[i] = var[list[i]]; - -#define HALOEXCHANGE_UNPACK_BODY \ - var[list[i]] = buffer[i]; - - -#include "common/KernelBase.hpp" +#include "HALOEXCHANGE_base.hpp" #include "RAJA/RAJA.hpp" -#include - namespace rajaperf { -class RunParams; - namespace apps { -class HALOEXCHANGE : public KernelBase +class HALOEXCHANGE : public HALOEXCHANGE_base { public: @@ -84,10 +62,6 @@ class HALOEXCHANGE : public KernelBase ~HALOEXCHANGE(); - void setUp(VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); @@ -104,45 +78,6 @@ class HALOEXCHANGE : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::make_list_type; - - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; - Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; - Index_type m_var_size; - Index_type m_var_halo_size; - - std::vector m_vars; - std::vector m_buffers; - - std::vector m_pack_index_lists; - std::vector m_pack_index_list_lengths; - std::vector m_unpack_index_lists; - std::vector m_unpack_index_list_lengths; - - void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); }; } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 464cc114d..ac0fb7b62 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < 
len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } @@ -83,7 +83,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } @@ -93,7 +93,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -201,7 +201,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -220,7 +220,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 4fb4d5533..badc7457c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } @@ -83,7 +83,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } @@ -93,7 +93,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_HIP ) { @@ -205,7 +205,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -224,7 +224,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index dc44e2aae..caf8fb67d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -24,7 +24,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; switch ( vid ) { @@ -60,7 +60,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } } @@ -72,7 
+72,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } #endif @@ -102,7 +102,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } } @@ -114,7 +114,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } #endif @@ -261,7 +261,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -279,7 +279,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 4dd2dad31..79ac438ec 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -67,7 +67,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -107,7 +107,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = pack_len_ptrs[j]; for (Index_type i = ii; i < len; i += pack_len_ave) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } } @@ -143,7 +143,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = unpack_len_ptrs[j]; for (Index_type i = ii; i < len; i += unpack_len_ave) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } } @@ -196,7 +196,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -214,7 +214,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index e7baa12de..2c84f35a1 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -22,7 +22,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG { const Index_type run_reps = getRunReps(); 
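The sequential variant below shows the fusion idea most plainly: instead of launching 26 * num_vars small pack loops and as many unpack loops per rep, every (neighbor, variable) pair is flattened into an array of (buffer, list, var, len) tuples that one outer loop walks. A sketch of that manual fusion under illustrative types (the suite's HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP macro allocates the equivalent ptr_holder and length arrays):

  #include <cstddef>

  struct ptr_holder { double* buffer; int* list; double* var; };

  // One fused pass over all pack work; j indexes (neighbor, variable) pairs.
  void fused_pack(const ptr_holder* holders, const std::size_t* lens,
                  std::size_t count) {
    for (std::size_t j = 0; j < count; ++j) {
      double*       buffer = holders[j].buffer;
      const int*    list   = holders[j].list;
      const double* var    = holders[j].var;
      for (std::size_t i = 0; i < lens[j]; ++i)
        buffer[i] = var[list[i]];      // HALOEXCHANGE_PACK_BODY
    }
  }

The OMPTarget variant above distributes the same flattened work differently, striding i by pack_len_ave so teams stay balanced across lists of very different lengths; the GPU variants reach the same effect through RAJA workgroups.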
- HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; switch ( vid ) { @@ -53,7 +53,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; } } @@ -77,7 +77,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; } } @@ -192,7 +192,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; + HALOEXCHANGE_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -210,7 +210,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; + HALOEXCHANGE_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 74dd5b0d5..63788f6d3 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -10,8 +10,6 @@ #include "RAJA/RAJA.hpp" -#include "common/DataUtils.hpp" - #include namespace rajaperf @@ -20,44 +18,8 @@ namespace apps { HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : KernelBase(rajaperf::Apps_HALOEXCHANGE_FUSED, params) + : HALOEXCHANGE_base(rajaperf::Apps_HALOEXCHANGE_FUSED, params) { - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); - setDefaultReps(50); - - double cbrt_run_size = std::cbrt(getTargetProblemSize()); - - m_grid_dims[0] = cbrt_run_size; - m_grid_dims[1] = cbrt_run_size; - m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; - - m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; - m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; - m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; - - setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); - setUsesFeature(Workgroup); setVariantDefined( Base_Seq ); @@ -82,368 +44,5 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = 
m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - - m_pack_index_lists.resize(s_num_neighbors, nullptr); - m_pack_index_list_lengths.resize(s_num_neighbors, 0); - create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_unpack_index_lists.resize(s_num_neighbors, nullptr); - m_unpack_index_list_lengths.resize(s_num_neighbors, 0); - create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } -} - -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) -{ - for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); - m_unpack_index_list_lengths.clear(); - m_unpack_index_lists.clear(); - - destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); - m_pack_index_list_lengths.clear(); - m_pack_index_lists.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); -} - -namespace { - -struct Extent -{ - Index_type i_min; - Index_type i_max; - Index_type j_min; - Index_type j_max; - Index_type k_min; - Index_type k_max; -}; - -} - -// -// Function to generate index lists for packing. -// -void HALOEXCHANGE_FUSED::create_pack_lists( - std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector pack_index_list_extents(num_neighbors); - - // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - 
pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = pack_index_list_extents[l]; - - pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); - - Int_ptr pack_list = pack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < 
extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type pack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - pack_list[list_idx] = pack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy packing index lists. -// -void HALOEXCHANGE_FUSED::destroy_pack_lists( - std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(pack_index_lists[l], vid); - } -} - -// -// Function to generate index lists for unpacking. -// -void HALOEXCHANGE_FUSED::create_unpack_lists( - std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid) -{ - std::vector unpack_index_list_extents(num_neighbors); - - // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, 
grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - - const Index_type grid_i_stride = 1; - const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; - const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); - - for (Index_type l = 0; l < num_neighbors; ++l) { - - Extent extent = unpack_index_list_extents[l]; - - unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * - (extent.j_max - extent.j_min) * - (extent.k_max - extent.k_min) ; - - allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); - - Int_ptr unpack_list = unpack_index_lists[l]; - - Index_type list_idx = 0; - for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { - for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { - for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { - - Index_type unpack_idx = ii * grid_i_stride + - jj * grid_j_stride + - kk * grid_k_stride ; - - unpack_list[list_idx] = unpack_idx; - - list_idx += 1; - } - } - } - } -} - -// -// Function to destroy unpacking index lists. 
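Alongside the pointer-tuple fuser, the HALOEXCHANGE_FUSED header further below keeps a lambda-based flavor: a factory captures (buffer, list, var) by value and returns the per-index body, so the fused runner only iterates an array of identically typed callables. A standalone sketch of the idea (std::function stands in for the macro's decltype-derived lambda type):

  #include <cstddef>
  #include <functional>
  #include <vector>

  int main() {
    auto make_pack_lambda = [](double* buffer, int* list, double* var) {
      return [=](int i) { buffer[i] = var[list[i]]; };  // HALOEXCHANGE_PACK_BODY
    };

    double var[6] = {0, 10, 20, 30, 40, 50};
    int    list[3] = {5, 3, 1};
    double buffer[3] = {};

    std::vector<std::function<void(int)>> pack_lambdas;
    std::vector<int> pack_lens;
    pack_lambdas.push_back(make_pack_lambda(buffer, list, var));
    pack_lens.push_back(3);

    // Fused run: one loop over enqueued lambdas, one over their index ranges.
    for (std::size_t j = 0; j < pack_lambdas.size(); ++j)
      for (int i = 0; i < pack_lens[j]; ++i)
        pack_lambdas[j](i);
    // buffer now holds {50, 30, 10}
  }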
-// -void HALOEXCHANGE_FUSED::destroy_unpack_lists( - std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid) -{ - (void) vid; - - for (Index_type l = 0; l < num_neighbors; ++l) { - deallocData(unpack_index_lists[l], vid); - } -} - } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index b0af7e60e..71749b6e3 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -18,7 +18,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_PACK_BODY; +/// HALOEXCHANGE_PACK_BODY; /// } /// buffer += len; /// } @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_FUSED_UNPACK_BODY; +/// HALOEXCHANGE_UNPACK_BODY; /// } /// buffer += len; /// } @@ -45,17 +45,6 @@ #ifndef RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP #define RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP -#define HALOEXCHANGE_FUSED_DATA_SETUP \ - std::vector vars = m_vars; \ - std::vector buffers = m_buffers; \ -\ - Index_type num_neighbors = s_num_neighbors; \ - Index_type num_vars = m_num_vars; \ - std::vector pack_index_lists = m_pack_index_lists; \ - std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ - std::vector unpack_index_lists = m_unpack_index_lists; \ - std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; - #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ @@ -73,17 +62,11 @@ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOEXCHANGE_FUSED_PACK_BODY \ - buffer[i] = var[list[i]]; - -#define HALOEXCHANGE_FUSED_UNPACK_BODY \ - var[list[i]] = buffer[i]; - #define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_PACK_BODY; \ + HALOEXCHANGE_PACK_BODY; \ }; \ }; \ using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -92,7 +75,7 @@ Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_FUSED_UNPACK_BODY; \ + HALOEXCHANGE_UNPACK_BODY; \ }; \ }; \ using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -107,20 +90,16 @@ delete[] unpack_lens; -#include "common/KernelBase.hpp" +#include "HALOEXCHANGE_base.hpp" #include "RAJA/RAJA.hpp" -#include - namespace rajaperf { -class RunParams; - namespace apps { -class HALOEXCHANGE_FUSED : public KernelBase +class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base { public: @@ -128,10 +107,6 @@ class HALOEXCHANGE_FUSED : public KernelBase ~HALOEXCHANGE_FUSED(); - void setUp(VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid, size_t tune_idx); - void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); @@ -148,45 +123,6 @@ class HALOEXCHANGE_FUSED : public KernelBase private: static const size_t default_gpu_block_size = 1024; using gpu_block_sizes_type = gpu_block_size::make_list_type; - - static const int s_num_neighbors = 26; - - Index_type m_grid_dims[3]; - Index_type m_halo_width; - Index_type m_num_vars; - - 
Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; - - Index_type m_grid_plus_halo_dims[3]; - Index_type m_var_size; - Index_type m_var_halo_size; - - std::vector m_vars; - std::vector m_buffers; - - std::vector m_pack_index_lists; - std::vector m_pack_index_list_lengths; - std::vector m_unpack_index_lists; - std::vector m_unpack_index_list_lengths; - - void create_pack_lists(std::vector& pack_index_lists, - std::vector& pack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_pack_lists(std::vector& pack_index_lists, - const Index_type num_neighbors, - VariantID vid); - void create_unpack_lists(std::vector& unpack_index_lists, - std::vector& unpack_index_list_lengths, - const Index_type halo_width, const Index_type* grid_dims, - const Index_type num_neighbors, - VariantID vid); - void destroy_unpack_lists(std::vector& unpack_index_lists, - const Index_type num_neighbors, - VariantID vid); }; } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp new file mode 100644 index 000000000..f518a15c4 --- /dev/null +++ b/src/apps/HALOEXCHANGE_base.cpp @@ -0,0 +1,410 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALOEXCHANGE_base.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) + : KernelBase(kid, params) +{ + m_grid_dims_default[0] = 100; + m_grid_dims_default[1] = 100; + m_grid_dims_default[2] = 100; + m_halo_width_default = 1; + m_num_vars_default = 3; + + setDefaultProblemSize( m_grid_dims_default[0] * + m_grid_dims_default[1] * + m_grid_dims_default[2] ); + setDefaultReps(50); + + double cbrt_run_size = std::cbrt(getTargetProblemSize()); + + m_grid_dims[0] = cbrt_run_size; + m_grid_dims[1] = cbrt_run_size; + m_grid_dims[2] = cbrt_run_size; + m_halo_width = m_halo_width_default; + m_num_vars = m_num_vars_default; + + m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; + m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; + m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; + m_var_size = m_grid_plus_halo_dims[0] * + m_grid_plus_halo_dims[1] * + m_grid_plus_halo_dims[2] ; + + setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); + setFLOPsPerRep(0); +} + +HALOEXCHANGE_base::~HALOEXCHANGE_base() +{ +} + +void HALOEXCHANGE_base::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = 
m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + + m_pack_index_lists.resize(s_num_neighbors, nullptr); + m_pack_index_list_lengths.resize(s_num_neighbors, 0); + create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); + + m_unpack_index_lists.resize(s_num_neighbors, nullptr); + m_unpack_index_list_lengths.resize(s_num_neighbors, 0); + create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); + + m_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + allocAndInitData(m_buffers[l], buffer_len, vid); + } +} + +void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + +void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + for (int l = 0; l < s_num_neighbors; ++l) { + deallocData(m_buffers[l], vid); + } + m_buffers.clear(); + + destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); + m_unpack_index_list_lengths.clear(); + m_unpack_index_lists.clear(); + + destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid); + m_pack_index_list_lengths.clear(); + m_pack_index_lists.clear(); + + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); +} + +// +// Function to generate index lists for packing. +// +void HALOEXCHANGE_base::create_pack_lists( + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) +{ + std::vector pack_index_list_extents(num_neighbors); + + // faces + pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // edges + pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , grid_dims[2] + halo_width}; + pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + 
pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + // corners + pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, + halo_width , halo_width + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + halo_width , halo_width + halo_width}; + pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, + grid_dims[1], grid_dims[1] + halo_width, + grid_dims[2], grid_dims[2] + halo_width}; + + const Index_type grid_i_stride = 1; + const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; + const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (Index_type l = 0; l < num_neighbors; ++l) { + + Extent extent = pack_index_list_extents[l]; + + pack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid); + + Int_ptr pack_list = pack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type pack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * 
grid_k_stride ; + + pack_list[list_idx] = pack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy packing index lists. +// +void HALOEXCHANGE_base::destroy_pack_lists( + std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid) +{ + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(pack_index_lists[l], vid); + } +} + +// +// Function to generate index lists for unpacking. +// +void HALOEXCHANGE_base::create_unpack_lists( + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid) +{ + std::vector unpack_index_list_extents(num_neighbors); + + // faces + unpack_index_list_extents[0] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // edges + unpack_index_list_extents[6] = Extent{0 , halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[7] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[9] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + halo_width , grid_dims[2] + halo_width}; + unpack_index_list_extents[10] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[11] = Extent{0 , halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + 0 , halo_width}; + unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + halo_width , grid_dims[1] + halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 
2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + // corners + unpack_index_list_extents[18] = Extent{0 , halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[19] = Extent{0 , halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[20] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[21] = Extent{0 , halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + 0 , halo_width}; + unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + 0 , halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + 0 , halo_width}; + unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, + grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, + grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + + const Index_type grid_i_stride = 1; + const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; + const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width); + + for (Index_type l = 0; l < num_neighbors; ++l) { + + Extent extent = unpack_index_list_extents[l]; + + unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) * + (extent.j_max - extent.j_min) * + (extent.k_max - extent.k_min) ; + + allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid); + + Int_ptr unpack_list = unpack_index_lists[l]; + + Index_type list_idx = 0; + for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) { + for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) { + for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) { + + Index_type unpack_idx = ii * grid_i_stride + + jj * grid_j_stride + + kk * grid_k_stride ; + + unpack_list[list_idx] = unpack_idx; + + list_idx += 1; + } + } + } + } +} + +// +// Function to destroy unpacking index lists. +// +void HALOEXCHANGE_base::destroy_unpack_lists( + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid) +{ + for (Index_type l = 0; l < num_neighbors; ++l) { + deallocData(unpack_index_lists[l], vid); + } +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp new file mode 100644 index 000000000..18c91aece --- /dev/null +++ b/src/apps/HALOEXCHANGE_base.hpp @@ -0,0 +1,152 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HALOEXCHANGE kernel reference implementation: +/// +/// // pack message for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// HALOEXCHANGE_PACK_BODY; +/// } +/// buffer += len; +/// } +/// // send message to neighbor +/// } +/// +/// // unpack messages for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// // receive message from neighbor +/// Real_ptr buffer = buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// HALOEXCHANGE_UNPACK_BODY; +/// } +/// buffer += len; +/// } +/// } +/// + +#ifndef RAJAPerf_Apps_HALOEXCHANGE_base_HPP +#define RAJAPerf_Apps_HALOEXCHANGE_base_HPP + +#define HALOEXCHANGE_DATA_SETUP \ + std::vector vars = m_vars; \ + std::vector buffers = m_buffers; \ +\ + Index_type num_neighbors = s_num_neighbors; \ + Index_type num_vars = m_num_vars; \ + std::vector pack_index_lists = m_pack_index_lists; \ + std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ + std::vector unpack_index_lists = m_unpack_index_lists; \ + std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; + +#define HALOEXCHANGE_PACK_BODY \ + buffer[i] = var[list[i]]; + +#define HALOEXCHANGE_UNPACK_BODY \ + var[list[i]] = buffer[i]; + + +#include "common/KernelBase.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ + +class HALOEXCHANGE_base : public KernelBase +{ +public: + + HALOEXCHANGE_base(KernelID kid, const RunParams& params); + + ~HALOEXCHANGE_base(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + +protected: + struct Extent + { + Index_type i_min; + Index_type i_max; + Index_type j_min; + Index_type j_max; + Index_type k_min; + Index_type k_max; + }; + + static const int s_num_neighbors = 26; + + Index_type m_grid_dims[3]; + Index_type m_halo_width; + Index_type m_num_vars; + + Index_type m_grid_dims_default[3]; + Index_type m_halo_width_default; + Index_type m_num_vars_default; + + Index_type m_grid_plus_halo_dims[3]; + Index_type m_var_size; + Index_type m_var_halo_size; + + std::vector m_vars; + std::vector m_buffers; + + std::vector m_pack_index_lists; + std::vector m_pack_index_list_lengths; + std::vector m_unpack_index_lists; + std::vector m_unpack_index_list_lengths; + + void create_pack_lists( + std::vector& pack_index_lists, + std::vector& pack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + + void destroy_pack_lists( + std::vector& pack_index_lists, + const Index_type num_neighbors, + VariantID vid); + + void create_unpack_lists( + std::vector& unpack_index_lists, + std::vector& unpack_index_list_lengths, + const Index_type halo_width, const Index_type* grid_dims, + const Index_type num_neighbors, + VariantID vid); + + void 
destroy_unpack_lists( + std::vector& unpack_index_lists, + const Index_type num_neighbors, + VariantID vid); +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard From 8b209d3c8fe78251b2ae46e5daa3ca331d39a85a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 31 May 2023 16:41:02 -0700 Subject: [PATCH 006/454] Use periodic boundaries in HALOEXCHANGE Change corresponding pack and unpack extents to be on opposite sides of the domain so they make sense with periodic boundary conditions. Also refactor extent generation to be easier to understand. --- src/apps/HALOEXCHANGE_base.cpp | 370 +++++++++++++++++++-------------- src/apps/HALOEXCHANGE_base.hpp | 13 ++ 2 files changed, 227 insertions(+), 156 deletions(-) diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp index f518a15c4..f1cb179f0 100644 --- a/src/apps/HALOEXCHANGE_base.cpp +++ b/src/apps/HALOEXCHANGE_base.cpp @@ -10,6 +10,7 @@ #include "RAJA/RAJA.hpp" +#include #include namespace rajaperf @@ -118,6 +119,59 @@ void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ m_vars.clear(); } + +enum struct location : int +{ + low_phony, + low_interior, + all_interior, + high_interior, + high_phony +}; + +HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_extent( + HALOEXCHANGE_base::location x_extent, + HALOEXCHANGE_base::location y_extent, + HALOEXCHANGE_base::location z_extent, + const Index_type halo_width, const Index_type* grid_dims) +{ + auto get_bounds = [&](location loc, Index_type dim_size) { + std::pair bounds; + switch (loc) { + case location::low_phony: + bounds.first = 0; + bounds.second = halo_width; + break; + case location::low_interior: + bounds.first = halo_width; + bounds.second = halo_width + halo_width; + break; + case location::all_interior: + bounds.first = halo_width; + bounds.second = halo_width + dim_size; + break; + case location::high_interior: + bounds.first = halo_width + dim_size - halo_width; + bounds.second = halo_width + dim_size; + break; + case location::high_phony: + bounds.first = halo_width + dim_size; + bounds.second = halo_width + dim_size + halo_width; + break; + default: + throw std::runtime_error("make_extent: Invalid location"); + } + return bounds; + }; + auto x_bounds = get_bounds(x_extent, grid_dims[0]); + auto y_bounds = get_bounds(y_extent, grid_dims[1]); + auto z_bounds = get_bounds(z_extent, grid_dims[2]); + return {x_bounds.first, x_bounds.second, + y_bounds.first, y_bounds.second, + z_bounds.first, z_bounds.second}; +} + + // // Function to generate index lists for packing. // @@ -130,89 +184,91 @@ void HALOEXCHANGE_base::create_pack_lists( { std::vector pack_index_list_extents(num_neighbors); + // The pack extents have high and low flipped compared to the unpack extents. 
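+ // For example, pack extent [0] below covers the low-i interior layer
+ // (low_interior, all_interior, all_interior), while unpack extent [0]
+ // covers the high-i phony layer (high_phony, all_interior, all_interior):
+ // with periodic boundaries, data packed on the low side of the domain is
+ // unpacked into the halo on the high side.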
+ // faces - pack_index_list_extents[0] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[1] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[0] = make_extent(location::low_interior, + location::all_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[1] = make_extent(location::high_interior, + location::all_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[2] = make_extent(location::all_interior, + location::low_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[3] = make_extent(location::all_interior, + location::high_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[4] = make_extent(location::all_interior, + location::all_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[5] = make_extent(location::all_interior, + location::all_interior, + location::high_interior, halo_width, grid_dims); // edges - pack_index_list_extents[6] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[7] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[8] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[9] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - pack_index_list_extents[10] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[11] = Extent{halo_width , halo_width + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[12] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[13] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + 
halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[6] = make_extent(location::low_interior, + location::low_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[7] = make_extent(location::low_interior, + location::high_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[8] = make_extent(location::high_interior, + location::low_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[9] = make_extent(location::high_interior, + location::high_interior, + location::all_interior, halo_width, grid_dims); + pack_index_list_extents[10] = make_extent(location::low_interior, + location::all_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[11] = make_extent(location::low_interior, + location::all_interior, + location::high_interior, halo_width, grid_dims); + pack_index_list_extents[12] = make_extent(location::high_interior, + location::all_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[13] = make_extent(location::high_interior, + location::all_interior, + location::high_interior, halo_width, grid_dims); + pack_index_list_extents[14] = make_extent(location::all_interior, + location::low_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[15] = make_extent(location::all_interior, + location::low_interior, + location::high_interior, halo_width, grid_dims); + pack_index_list_extents[16] = make_extent(location::all_interior, + location::high_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[17] = make_extent(location::all_interior, + location::high_interior, + location::high_interior, halo_width, grid_dims); // corners - pack_index_list_extents[18] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[19] = Extent{halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[20] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[21] = Extent{halo_width , halo_width + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[22] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[23] = Extent{grid_dims[0], grid_dims[0] + halo_width, - halo_width , halo_width + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; - pack_index_list_extents[24] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - halo_width , halo_width + halo_width}; - pack_index_list_extents[25] = Extent{grid_dims[0], grid_dims[0] + halo_width, - grid_dims[1], grid_dims[1] + halo_width, - grid_dims[2], grid_dims[2] + halo_width}; + pack_index_list_extents[18] = make_extent(location::low_interior, + location::low_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[19] = make_extent(location::low_interior, + location::low_interior, + 
location::high_interior, halo_width, grid_dims); + pack_index_list_extents[20] = make_extent(location::low_interior, + location::high_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[21] = make_extent(location::low_interior, + location::high_interior, + location::high_interior, halo_width, grid_dims); + pack_index_list_extents[22] = make_extent(location::high_interior, + location::low_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[23] = make_extent(location::high_interior, + location::low_interior, + location::high_interior, halo_width, grid_dims); + pack_index_list_extents[24] = make_extent(location::high_interior, + location::high_interior, + location::low_interior, halo_width, grid_dims); + pack_index_list_extents[25] = make_extent(location::high_interior, + location::high_interior, + location::high_interior, halo_width, grid_dims); const Index_type grid_i_stride = 1; const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; @@ -274,89 +330,91 @@ void HALOEXCHANGE_base::create_unpack_lists( { std::vector unpack_index_list_extents(num_neighbors); + // The pack extents have high and low flipped compared to the unpack extents. + // faces - unpack_index_list_extents[0] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[1] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[2] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[3] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[4] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[5] = Extent{halo_width , grid_dims[0] + halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[0] = make_extent(location::high_phony, + location::all_interior, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[1] = make_extent(location::low_phony, + location::all_interior, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[2] = make_extent(location::all_interior, + location::high_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[3] = make_extent(location::all_interior, + location::low_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[4] = make_extent(location::all_interior, + location::all_interior, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[5] = make_extent(location::all_interior, + location::all_interior, + location::low_phony, halo_width, grid_dims); // edges - unpack_index_list_extents[6] = Extent{0 , halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[7] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[8] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[9] = Extent{grid_dims[0] + 
halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - halo_width , grid_dims[2] + halo_width}; - unpack_index_list_extents[10] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[11] = Extent{0 , halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[12] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - 0 , halo_width}; - unpack_index_list_extents[13] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - halo_width , grid_dims[1] + halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[14] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[15] = Extent{halo_width , grid_dims[0] + halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[16] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[17] = Extent{halo_width , grid_dims[0] + halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[6] = make_extent(location::high_phony, + location::high_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[7] = make_extent(location::high_phony, + location::low_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[8] = make_extent(location::low_phony, + location::high_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[9] = make_extent(location::low_phony, + location::low_phony, + location::all_interior, halo_width, grid_dims); + unpack_index_list_extents[10] = make_extent(location::high_phony, + location::all_interior, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[11] = make_extent(location::high_phony, + location::all_interior, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[12] = make_extent(location::low_phony, + location::all_interior, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[13] = make_extent(location::low_phony, + location::all_interior, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[14] = make_extent(location::all_interior, + location::high_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[15] = make_extent(location::all_interior, + location::high_phony, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[16] = make_extent(location::all_interior, + location::low_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[17] = make_extent(location::all_interior, + location::low_phony, + location::low_phony, halo_width, grid_dims); // corners - unpack_index_list_extents[18] = Extent{0 , halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[19] = Extent{0 , halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[20] = Extent{0 , halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[21] = Extent{0 , halo_width, - grid_dims[1] + halo_width, 
grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[22] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - 0 , halo_width}; - unpack_index_list_extents[23] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - 0 , halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; - unpack_index_list_extents[24] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - 0 , halo_width}; - unpack_index_list_extents[25] = Extent{grid_dims[0] + halo_width, grid_dims[0] + 2*halo_width, - grid_dims[1] + halo_width, grid_dims[1] + 2*halo_width, - grid_dims[2] + halo_width, grid_dims[2] + 2*halo_width}; + unpack_index_list_extents[18] = make_extent(location::high_phony, + location::high_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[19] = make_extent(location::high_phony, + location::high_phony, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[20] = make_extent(location::high_phony, + location::low_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[21] = make_extent(location::high_phony, + location::low_phony, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[22] = make_extent(location::low_phony, + location::high_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[23] = make_extent(location::low_phony, + location::high_phony, + location::low_phony, halo_width, grid_dims); + unpack_index_list_extents[24] = make_extent(location::low_phony, + location::low_phony, + location::high_phony, halo_width, grid_dims); + unpack_index_list_extents[25] = make_extent(location::low_phony, + location::low_phony, + location::low_phony, halo_width, grid_dims); const Index_type grid_i_stride = 1; const Index_type grid_j_stride = grid_dims[0] + 2*halo_width; diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp index 18c91aece..85f67ca50 100644 --- a/src/apps/HALOEXCHANGE_base.hpp +++ b/src/apps/HALOEXCHANGE_base.hpp @@ -89,6 +89,15 @@ class HALOEXCHANGE_base : public KernelBase void tearDown(VariantID vid, size_t tune_idx); protected: + enum struct location : int + { + low_phony, + low_interior, + all_interior, + high_interior, + high_phony + }; + struct Extent { Index_type i_min; @@ -121,6 +130,10 @@ class HALOEXCHANGE_base : public KernelBase std::vector m_unpack_index_lists; std::vector m_unpack_index_list_lengths; + Extent make_extent( + location x_extent, location y_extent, location z_extent, + const Index_type halo_width, const Index_type* grid_dims); + void create_pack_lists( std::vector& pack_index_lists, std::vector& pack_index_list_lengths, From aa2226dfd9d7cd7944c559b96cda4fb2a54cca9b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 31 May 2023 16:41:40 -0700 Subject: [PATCH 007/454] Add basic MPI_HALOEXCHANGE Only Base_Seq is implemented --- src/CMakeLists.txt | 3 + src/apps/CMakeLists.txt | 6 + src/apps/MPI_HALOEXCHANGE-Cuda.cpp | 150 +++++++++++++++++++ src/apps/MPI_HALOEXCHANGE-Hip.cpp | 152 ++++++++++++++++++++ src/apps/MPI_HALOEXCHANGE-OMP.cpp | 176 +++++++++++++++++++++++ src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp | 126 ++++++++++++++++ src/apps/MPI_HALOEXCHANGE-Seq.cpp | 184 ++++++++++++++++++++++++ src/apps/MPI_HALOEXCHANGE.cpp | 73 ++++++++++ src/apps/MPI_HALOEXCHANGE.hpp | 107 ++++++++++++++ src/common/RAJAPerfSuite.cpp | 12 ++ 
src/common/RAJAPerfSuite.hpp | 5 + src/common/RPTypes.hpp | 4 + 12 files changed, 998 insertions(+) create mode 100644 src/apps/MPI_HALOEXCHANGE-Cuda.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE-Hip.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE-OMP.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE-Seq.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 52d54cb67..b5b8b3599 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -63,6 +63,9 @@ blt_add_executable( apps/HALOEXCHANGE_FUSED.cpp apps/HALOEXCHANGE_FUSED-Seq.cpp apps/HALOEXCHANGE_FUSED-OMPTarget.cpp + apps/MPI_HALOEXCHANGE.cpp + apps/MPI_HALOEXCHANGE-Seq.cpp + apps/MPI_HALOEXCHANGE-OMPTarget.cpp apps/LTIMES.cpp apps/LTIMES-Seq.cpp apps/LTIMES-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 76fea9cfc..eef675ac1 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -52,6 +52,12 @@ blt_add_library( HALOEXCHANGE_FUSED-Cuda.cpp HALOEXCHANGE_FUSED-OMP.cpp HALOEXCHANGE_FUSED-OMPTarget.cpp + MPI_HALOEXCHANGE.cpp + MPI_HALOEXCHANGE-Seq.cpp + MPI_HALOEXCHANGE-Hip.cpp + MPI_HALOEXCHANGE-Cuda.cpp + MPI_HALOEXCHANGE-OMP.cpp + MPI_HALOEXCHANGE-OMPTarget.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp diff --git a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp new file mode 100644 index 000000000..431b5e5a7 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALOEXCHANGE_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALOEXCHANGE_UNPACK_BODY; + } +} + + +template < size_t block_size > +void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +{ +#if 0 + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + haloexchange_pack<<>>(buffer, list, var, len); + cudaErrchk( cudaGetLastError() ); + buffer += len; + } + } + synchronize(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + haloexchange_unpack<<>>(buffer, list, var, len); + cudaErrchk( cudaGetLastError() ); + buffer += len; + } + } + synchronize(); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + using EXEC_POL = RAJA::cuda_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + synchronize(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + synchronize(); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + } +#endif +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git 
a/src/apps/MPI_HALOEXCHANGE-Hip.cpp b/src/apps/MPI_HALOEXCHANGE-Hip.cpp new file mode 100644 index 000000000..78cc7903b --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE-Hip.cpp @@ -0,0 +1,152 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALOEXCHANGE_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, + Index_type len) +{ + Index_type i = threadIdx.x + blockIdx.x * block_size; + + if (i < len) { + HALOEXCHANGE_UNPACK_BODY; + } +} + + +template < size_t block_size > +void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) +{ +#if 0 + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, + buffer, list, var, len); + hipErrchk( hipGetLastError() ); + buffer += len; + } + } + synchronize(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + dim3 nthreads_per_block(block_size); + dim3 nblocks((len + block_size-1) / block_size); + hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, + buffer, list, var, len); + hipErrchk( hipGetLastError() ); + buffer += len; + } + } + synchronize(); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + using EXEC_POL = RAJA::hip_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + synchronize(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto 
haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + synchronize(); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + } +#endif +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MPI_HALOEXCHANGE-OMP.cpp b/src/apps/MPI_HALOEXCHANGE-OMP.cpp new file mode 100644 index 000000000..82fa3b8cc --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE-OMP.cpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) && 0 + + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + buffer += len; + } + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + haloexchange_pack_base_lam(i); + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + #pragma omp parallel for + for (Index_type i = 0; i < len; i++) { + haloexchange_unpack_base_lam(i); + } + buffer += len; + } + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + using EXEC_POL = RAJA::omp_parallel_for_exec; + + 
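+ // Same pack/unpack structure as the Base_OpenMP case above, with each
+ // inner loop over [0, len) dispatched through RAJA::forall using the
+ // omp_parallel_for_exec policy instead of a raw omp parallel for pragma.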
startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MPI_HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp new file mode 100644 index 000000000..e93ddfe3c --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp @@ -0,0 +1,126 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + #if 0 + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + #pragma omp target is_device_ptr(buffer, list, var) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + buffer += len; + } + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + using EXEC_POL = RAJA::omp_target_parallel_for_exec; + + startTimer(); + 
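+ // Mirrors the Base_OpenMPTarget path, replacing each offload pragma with
+ // RAJA::forall on the omp_target_parallel_for_exec policy; note the whole
+ // body of this function is still compiled out by the `#if 0` above.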
for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + } +#endif +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp new file mode 100644 index 000000000..98bba26b3 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp @@ -0,0 +1,184 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + buffer += len; + } + int mpi_rank = mpi_ranks[l]; + MPI_Isend(buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + buffer += len; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) && 0 + case Lambda_Seq : { + 
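+ // Lambda_Seq and RAJA_Seq below still use the local (non-MPI) pack and
+ // unpack loops and are disabled by the `&& 0` in the guard above; only
+ // Base_Seq currently issues the MPI_Irecv/MPI_Isend/MPI_Wait* calls.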
+ startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + haloexchange_pack_base_lam(i); + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + haloexchange_unpack_base_lam(i); + } + buffer += len; + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = RAJA::loop_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n MPI_HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp new file mode 100644 index 000000000..96f7c95a4 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -0,0 +1,73 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace apps +{ + +MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) + : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE, params) +{ + setUsesFeature(Forall); + setUsesFeature(MPI); + + setVariantDefined( Base_Seq ); + // setVariantDefined( Lambda_Seq ); + // setVariantDefined( RAJA_Seq ); + + // setVariantDefined( Base_OpenMP ); + // setVariantDefined( Lambda_OpenMP ); + // setVariantDefined( RAJA_OpenMP ); + + // setVariantDefined( Base_OpenMPTarget ); + // setVariantDefined( RAJA_OpenMPTarget ); + + // setVariantDefined( Base_CUDA ); + // setVariantDefined( RAJA_CUDA ); + + // setVariantDefined( Base_HIP ); + // setVariantDefined( RAJA_HIP ); +} + +MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE() +{ +} + +void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) +{ + HALOEXCHANGE_base::setUp(vid, tune_idx); + + m_mpi_ranks.resize(s_num_neighbors, -1); + + MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + m_mpi_ranks[l] = m_my_mpi_rank; // send and recv to own rank + } +} + +void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) +{ + m_mpi_ranks.clear(); + + HALOEXCHANGE_base::tearDown(vid, tune_idx); +} + +} // end namespace apps +} // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp new file mode 100644 index 000000000..cf46a6cdf --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE.hpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MPI_HALOEXCHANGE kernel reference implementation: +/// +/// // pack message for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// HALOEXCHANGE_PACK_BODY; +/// } +/// buffer += len; +/// } +/// // send message to neighbor +/// } +/// +/// // unpack messages for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// // receive message from neighbor +/// Real_ptr buffer = buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// HALOEXCHANGE_UNPACK_BODY; +/// } +/// buffer += len; +/// } +/// } +/// + +#ifndef RAJAPerf_Apps_MPI_HALOEXCHANGE_HPP +#define RAJAPerf_Apps_MPI_HALOEXCHANGE_HPP + +#define MPI_HALOEXCHANGE_DATA_SETUP \ + HALOEXCHANGE_DATA_SETUP \ + \ + const int my_mpi_rank = m_my_mpi_rank; \ + std::vector<int> mpi_ranks = m_mpi_ranks; \ + \ + std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \ + std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); + + +#include "HALOEXCHANGE_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include <mpi.h> + +namespace rajaperf +{ +namespace apps +{ + +class MPI_HALOEXCHANGE : public HALOEXCHANGE_base +{ +public: + + MPI_HALOEXCHANGE(const RunParams& params); + + ~MPI_HALOEXCHANGE(); + + void setUp(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + + int m_my_mpi_rank = -1; + std::vector<int> m_mpi_ranks; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif +#endif // closing endif for header file include guard
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 63a777bd8..e85427c7d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -85,6 +85,9 @@ #include "apps/FIR.hpp" #include "apps/HALOEXCHANGE.hpp" #include "apps/HALOEXCHANGE_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "apps/MPI_HALOEXCHANGE.hpp" +#endif #include "apps/LTIMES.hpp" #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DPA.hpp" @@ -220,6 +223,9 @@ static const std::string KernelNames [] = std::string("Apps_FIR"), std::string("Apps_HALOEXCHANGE"), std::string("Apps_HALOEXCHANGE_FUSED"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("Apps_MPI_HALOEXCHANGE"), +#endif std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"),
std::string("Apps_MASS3DPA"), @@ -857,6 +863,12 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::HALOEXCHANGE_FUSED(run_params); break; } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + case Apps_MPI_HALOEXCHANGE : { + kernel = new apps::MPI_HALOEXCHANGE(run_params); + break; + } +#endif case Apps_LTIMES : { kernel = new apps::LTIMES(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index ce82a8ab7..bf627d1c4 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -138,6 +138,9 @@ enum KernelID { Apps_FIR, Apps_HALOEXCHANGE, Apps_HALOEXCHANGE_FUSED, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + Apps_MPI_HALOEXCHANGE, +#endif Apps_LTIMES, Apps_LTIMES_NOVIEW, Apps_MASS3DPA, @@ -227,6 +230,8 @@ enum FeatureID { View, + MPI, + NumFeatures // Keep this one last and NEVER comment out (!!) }; diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index b86f6b7b6..0743a9b0d 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -95,10 +95,14 @@ using Checksum_type = long double; #if defined(RP_USE_DOUBLE) /// using Real_type = double; +/// +#define Real_MPI_type MPI_DOUBLE #elif defined(RP_USE_FLOAT) /// using Real_type = float; +/// +#define Real_MPI_type MPI_FLOAT #else #error Real_type is undefined! From 0c4f53d111b21dfe8ef9d5df7650133033c8edb9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 1 Jun 2023 07:54:02 -0700 Subject: [PATCH 008/454] Use separate pack and unpack buffers with MPI_HALOEXCHANGE --- src/apps/HALOEXCHANGE.cpp | 21 +++++++++++++++++++ src/apps/HALOEXCHANGE.hpp | 11 ++++++++++ src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 2 +- src/apps/HALOEXCHANGE_FUSED.cpp | 21 +++++++++++++++++++ src/apps/HALOEXCHANGE_FUSED.hpp | 10 +++++++++ src/apps/HALOEXCHANGE_base.cpp | 11 ---------- src/apps/HALOEXCHANGE_base.hpp | 6 ++---- src/apps/MPI_HALOEXCHANGE-Seq.cpp | 8 ++++---- src/apps/MPI_HALOEXCHANGE.cpp | 25 +++++++++++++++++++++-- src/apps/MPI_HALOEXCHANGE.hpp | 10 +++++++-- 14 files changed, 105 insertions(+), 28 deletions(-) diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index 2b3bfd7ec..08f1849a5 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -44,5 +44,26 @@ HALOEXCHANGE::~HALOEXCHANGE() { } +void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) +{ + HALOEXCHANGE_base::setUp(vid, tune_idx); + + m_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + allocAndInitData(m_buffers[l], buffer_len, vid); + } +} + +void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) +{ + for (int l = 0; l < s_num_neighbors; ++l) { + deallocData(m_buffers[l], vid); + } + m_buffers.clear(); + + HALOEXCHANGE_base::tearDown(vid, tune_idx); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 1d5be416f..602db0efd 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -45,6 +45,12 @@ #ifndef RAJAPerf_Apps_HALOEXCHANGE_HPP #define RAJAPerf_Apps_HALOEXCHANGE_HPP +#define HALOEXCHANGE_DATA_SETUP \ + HALOEXCHANGE_base_DATA_SETUP \ + \ + std::vector buffers = m_buffers; + + #include "HALOEXCHANGE_base.hpp" #include "RAJA/RAJA.hpp" @@ -62,6 +68,9 @@ class HALOEXCHANGE : public HALOEXCHANGE_base 
~HALOEXCHANGE(); + void setUp(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); @@ -78,6 +87,8 @@ class HALOEXCHANGE : public HALOEXCHANGE_base private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::make_list_type; + + std::vector m_buffers; }; } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index ac0fb7b62..986600282 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -93,7 +93,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index badc7457c..b130cf4b5 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -93,7 +93,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_HIP ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index caf8fb67d..864393d0f 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -24,7 +24,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 79ac438ec..285e283e2 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -67,7 +67,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 2c84f35a1..7114de6c8 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -22,7 +22,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp index 63788f6d3..34b4553c2 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/apps/HALOEXCHANGE_FUSED.cpp @@ -44,5 +44,26 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + HALOEXCHANGE_base::setUp(vid, tune_idx); + + m_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + allocAndInitData(m_buffers[l], buffer_len, vid); + } +} + +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + for (int l = 0; l < s_num_neighbors; ++l) { + deallocData(m_buffers[l], vid); + } + m_buffers.clear(); + + HALOEXCHANGE_base::tearDown(vid, tune_idx); +} + } // end namespace apps } // end namespace rajaperf diff --git 
a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index 71749b6e3..85c710ac8 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -45,6 +45,11 @@ #ifndef RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP #define RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP +#define HALOEXCHANGE_FUSED_DATA_SETUP \ + HALOEXCHANGE_base_DATA_SETUP \ + \ + std::vector buffers = m_buffers; + #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ @@ -107,6 +112,9 @@ class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base ~HALOEXCHANGE_FUSED(); + void setUp(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); void runCudaVariant(VariantID vid, size_t tune_idx); @@ -123,6 +131,8 @@ class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base private: static const size_t default_gpu_block_size = 1024; using gpu_block_sizes_type = gpu_block_size::make_list_type; + + std::vector m_buffers; }; } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp index f1cb179f0..f1d3ece7c 100644 --- a/src/apps/HALOEXCHANGE_base.cpp +++ b/src/apps/HALOEXCHANGE_base.cpp @@ -83,12 +83,6 @@ void HALOEXCHANGE_base::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx m_unpack_index_lists.resize(s_num_neighbors, nullptr); m_unpack_index_list_lengths.resize(s_num_neighbors, 0); create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); - - m_buffers.resize(s_num_neighbors, nullptr); - for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); - } } void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx) @@ -100,11 +94,6 @@ void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx) void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); - } - m_buffers.clear(); - destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid); m_unpack_index_list_lengths.clear(); m_unpack_index_lists.clear(); diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp index 85f67ca50..153387b5d 100644 --- a/src/apps/HALOEXCHANGE_base.hpp +++ b/src/apps/HALOEXCHANGE_base.hpp @@ -45,10 +45,9 @@ #ifndef RAJAPerf_Apps_HALOEXCHANGE_base_HPP #define RAJAPerf_Apps_HALOEXCHANGE_base_HPP -#define HALOEXCHANGE_DATA_SETUP \ +#define HALOEXCHANGE_base_DATA_SETUP \ std::vector vars = m_vars; \ - std::vector buffers = m_buffers; \ -\ + \ Index_type num_neighbors = s_num_neighbors; \ Index_type num_vars = m_num_vars; \ std::vector pack_index_lists = m_pack_index_lists; \ @@ -123,7 +122,6 @@ class HALOEXCHANGE_base : public KernelBase Index_type m_var_halo_size; std::vector m_vars; - std::vector m_buffers; std::vector m_pack_index_lists; std::vector m_pack_index_list_lengths; diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp index 98bba26b3..18b35f9ee 100644 --- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp @@ -36,12 +36,12 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; int mpi_rank = mpi_ranks[l]; - MPI_Irecv(buffers[l], 
len*num_vars, Real_MPI_type, + MPI_Irecv(unpack_buffers[l], len*num_vars, Real_MPI_type, mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -52,14 +52,14 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t buffer += len; } int mpi_rank = mpi_ranks[l]; - MPI_Isend(buffers[l], len*num_vars, Real_MPI_type, + MPI_Isend(pack_buffers[l], len*num_vars, Real_MPI_type, mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { int l = -1; MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); - Real_ptr buffer = buffers[l]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp index 96f7c95a4..40607a6a7 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -51,17 +51,38 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) { HALOEXCHANGE_base::setUp(vid, tune_idx); - m_mpi_ranks.resize(s_num_neighbors, -1); - MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank); + m_mpi_ranks.resize(s_num_neighbors, -1); for (Index_type l = 0; l < s_num_neighbors; ++l) { m_mpi_ranks[l] = m_my_mpi_rank; // send and recv to own rank } + + m_pack_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + allocAndInitData(m_pack_buffers[l], buffer_len, vid); + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + allocAndInitData(m_unpack_buffers[l], buffer_len, vid); + } } void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) { + for (int l = 0; l < s_num_neighbors; ++l) { + deallocData(m_unpack_buffers[l], vid); + } + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + deallocData(m_pack_buffers[l], vid); + } + m_pack_buffers.clear(); + m_mpi_ranks.clear(); HALOEXCHANGE_base::tearDown(vid, tune_idx); diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp index cf46a6cdf..2c08731cd 100644 --- a/src/apps/MPI_HALOEXCHANGE.hpp +++ b/src/apps/MPI_HALOEXCHANGE.hpp @@ -46,13 +46,16 @@ #define RAJAPerf_Apps_MPI_HALOEXCHANGE_HPP #define MPI_HALOEXCHANGE_DATA_SETUP \ - HALOEXCHANGE_DATA_SETUP \ + HALOEXCHANGE_base_DATA_SETUP \ \ const int my_mpi_rank = m_my_mpi_rank; \ std::vector mpi_ranks = m_mpi_ranks; \ \ std::vector pack_mpi_requests(num_neighbors); \ - std::vector unpack_mpi_requests(num_neighbors); + std::vector unpack_mpi_requests(num_neighbors); \ + \ + std::vector pack_buffers = m_pack_buffers; \ + std::vector unpack_buffers = m_unpack_buffers; #include "HALOEXCHANGE_base.hpp" @@ -98,6 +101,9 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base int m_my_mpi_rank = -1; std::vector m_mpi_ranks; + + std::vector m_pack_buffers; + std::vector m_unpack_buffers; }; } // end namespace apps From 67beebb99791688f33fa288043f61171146fa6d5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 1 Jun 2023 12:06:49 -0700 Subject: [PATCH 009/454] Add support for separate pack and send buffers Add support for separate buffers 
in which the kernel packs into one buffer, that buffer is then copied into a separate host MPI buffer, and MPI is called on the host buffer. This is in addition to the existing single-buffer mode, in which the kernel packs directly into the buffer that is handed to MPI. (A condensed sketch of the send path follows this patch.) --- src/apps/MPI_HALOEXCHANGE-Seq.cpp | 17 +++++- src/apps/MPI_HALOEXCHANGE.cpp | 41 +++++++++++-- src/apps/MPI_HALOEXCHANGE.hpp | 12 +++- src/common/KernelBase.cpp | 36 ++++++++++++ src/common/KernelBase.hpp | 8 +++ src/common/RAJAPerfSuite.cpp | 26 ++++++++- src/common/RAJAPerfSuite.hpp | 17 +++++- src/common/RunParams.cpp | 96 ++++++++++++++++++++++++++++--- src/common/RunParams.hpp | 14 +++++ 9 files changed, 248 insertions(+), 19 deletions(-)
diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp index 18b35f9ee..019512d43 100644 --- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp @@ -36,7 +36,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; int mpi_rank = mpi_ranks[l]; - MPI_Irecv(unpack_buffers[l], len*num_vars, Real_MPI_type, + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); } @@ -51,8 +51,15 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + int mpi_rank = mpi_ranks[l]; - MPI_Isend(pack_buffers[l], len*num_vars, Real_MPI_type, + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } @@ -63,6 +70,12 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) {
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp index 40607a6a7..d6c746120 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -58,29 +58,60 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) m_mpi_ranks[l] = m_my_mpi_rank; } + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_pack_buffers[l], buffer_len, vid); + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } } m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { - Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_unpack_buffers[l], buffer_len, vid); + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); +
allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } } + } void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) { + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_unpack_buffers[l], vid); + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } } + m_recv_buffers.clear(); m_unpack_buffers.clear(); for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_pack_buffers[l], vid); + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } } + m_send_buffers.clear(); m_pack_buffers.clear(); m_mpi_ranks.clear(); diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp index 2c08731cd..f1067ed7b 100644 --- a/src/apps/MPI_HALOEXCHANGE.hpp +++ b/src/apps/MPI_HALOEXCHANGE.hpp @@ -54,8 +54,15 @@ std::vector pack_mpi_requests(num_neighbors); \ std::vector unpack_mpi_requests(num_neighbors); \ \ + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ std::vector pack_buffers = m_pack_buffers; \ - std::vector unpack_buffers = m_unpack_buffers; + std::vector unpack_buffers = m_unpack_buffers; \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; #include "HALOEXCHANGE_base.hpp" @@ -104,6 +111,9 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base std::vector m_pack_buffers; std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; }; } // end namespace apps diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index cbc2083dd..81ce1eea4 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -199,6 +199,42 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const } } +DataSpace KernelBase::getMPIDataSpace(VariantID vid) const +{ + switch ( vid ) { + + case Base_Seq : + case Lambda_Seq : + case RAJA_Seq : + return run_params.getSeqMPIDataSpace(); + + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + return run_params.getOmpMPIDataSpace(); + + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + return run_params.getOmpTargetMPIDataSpace(); + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + return run_params.getCudaMPIDataSpace(); + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + return run_params.getHipMPIDataSpace(); + + case Kokkos_Lambda : + return run_params.getKokkosMPIDataSpace(); + + default: + throw std::invalid_argument("getDataSpace : Unknown variant id"); + } +} + DataSpace KernelBase::getHostAccessibleDataSpace(VariantID vid) const { return hostAccessibleDataSpace(getDataSpace(vid)); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 5b431f05f..c4bd1ce61 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -209,6 +209,7 @@ class KernelBase DataSpace getDataSpace(VariantID vid) const; DataSpace getHostAccessibleDataSpace(VariantID vid) const; + DataSpace getMPIDataSpace(VariantID vid) const; template void allocData(DataSpace dataSpace, T& ptr, int len) @@ -217,6 +218,13 @@ class 
KernelBase ptr, len, getDataAlignment()); } + template <typename T> + void allocAndInitData(DataSpace dataSpace, T*& ptr, int len) + { + rajaperf::allocAndInitData(dataSpace, + ptr, len, getDataAlignment()); + } + template <typename T> void copyData(DataSpace dst_dataSpace, T* dst_ptr, DataSpace src_dataSpace, const T* src_ptr,
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index e85427c7d..3108ca33f 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -357,6 +357,10 @@ static const std::string DataSpaceNames [] = std::string("HipDevice"), std::string("HipDeviceFine"), + std::string("Unknown Memory"), // Keep this at the end and DO NOT remove.... + + std::string("Copy"), + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -566,7 +570,7 @@ const std::string& getDataSpaceName(DataSpace ds) /*! ******************************************************************************* * - * Return true if the allocate associated with DataSpace enum value is available. + * Return true if the allocator associated with DataSpace enum value is available. * ******************************************************************************* */ @@ -620,6 +624,26 @@ bool isDataSpaceAvailable(DataSpace dataSpace) return ret_val; } +/*! + ******************************************************************************* + * + * Return true if the DataSpace enum value is a pseudo DataSpace. + * + ******************************************************************************* + */ +bool isPseudoDataSpace(DataSpace dataSpace) +{ + bool ret_val = false; + + switch (dataSpace) { + case DataSpace::Copy: + ret_val = true; break; + default: + ret_val = false; break; + } + + return ret_val; +} /* *******************************************************************************
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index bf627d1c4..f1b88cc75 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -273,7 +273,11 @@ enum struct DataSpace { HipDevice, HipDeviceFine, - NumSpaces // Keep this one last and NEVER comment out (!!) + NumSpaces, // Keep this one here and NEVER comment out (!!) + + Copy, + + EndPseudoSpaces // Keep this one last and NEVER comment out (!!) }; @@ -359,12 +363,21 @@ const std::string& getDataSpaceName(DataSpace cd); /*! ******************************************************************************* * - * Return true if the allocate associated with DataSpace enum value is available. + * Return true if the allocator associated with DataSpace enum value is available. * ******************************************************************************* */ bool isDataSpaceAvailable(DataSpace dataSpace); +/*! + ******************************************************************************* + * + * Return true if the DataSpace enum value is a pseudo DataSpace. + * + ******************************************************************************* + */ +bool isPseudoDataSpace(DataSpace dataSpace); + /*!
******************************************************************************* *
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index e85ee5c86..838e24eec 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -480,10 +480,23 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("--hip-data-space") || opt == std::string("-hds") || opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + opt == std::string("-kds") || + opt == std::string("--seq-mpi-data-space") || + opt == std::string("-smpids") || + opt == std::string("--omp-mpi-data-space") || + opt == std::string("-ompids") || + opt == std::string("--omptarget-mpi-data-space") || + opt == std::string("-otmpids") || + opt == std::string("--cuda-mpi-data-space") || + opt == std::string("-cmpids") || + opt == std::string("--hip-mpi-data-space") || + opt == std::string("-hmpids") || + opt == std::string("--kokkos-mpi-data-space") || + opt == std::string("-kmpids") ) { bool got_someting = false; bool got_something_available = false; + bool got_something_pseudo = false; i++; if ( i < argc ) { auto opt_name = std::move(opt); opt = std::string(argv[i]); if ( opt.at(0) == '-' ) { i--; } else { - for (int ids = 0; ids < static_cast<int>(DataSpace::NumSpaces); ++ids) { + for (int ids = 0; ids < static_cast<int>(DataSpace::EndPseudoSpaces); ++ids) { DataSpace ds = static_cast<DataSpace>(ids); if (getDataSpaceName(ds) == opt) { got_someting = true; got_something_available = isDataSpaceAvailable(ds); + got_something_pseudo = isPseudoDataSpace(ds); if ( opt_name == std::string("--seq-data-space") || opt_name == std::string("-sds") ) { seqDataSpace = ds; @@ -514,6 +528,30 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { kokkosDataSpace = ds; + } else if ( opt_name == std::string("--seq-mpi-data-space") || + opt_name == std::string("-smpids") ) { + seqMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omp-mpi-data-space") || + opt_name == std::string("-ompids") ) { + ompMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--omptarget-mpi-data-space") || + opt_name == std::string("-otmpids") ) { + ompTargetMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--cuda-mpi-data-space") || + opt_name == std::string("-cmpids") ) { + cudaMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--hip-mpi-data-space") || + opt_name == std::string("-hmpids") ) { + hipMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; + } else if ( opt_name == std::string("--kokkos-mpi-data-space") || + opt_name == std::string("-kmpids") ) { + kokkosMPIDataSpace = ds; + got_something_available = got_something_available || got_something_pseudo; } else { got_someting = false; } @@ -752,41 +790,77 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -ev Base_Seq RAJA_CUDA (exclude Base_Seq and RAJA_CUDA variants)\n\n"; str << "\t --seq-data-space, -sds [Default is Host]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with sequential
execution)\n"; str << "\t\t Examples...\n" << "\t\t --seq-data-space Host (run sequential variants with Host memory)\n" << "\t\t -sds CudaPinned (run sequential variants with Cuda Pinned memory)\n\n"; str << "\t --omp-data-space, -ods [Default is Omp]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with OpenMP execution)\n"; str << "\t\t Examples...\n" << "\t\t --omp-data-space Omp (run Omp variants with Omp memory)\n" << "\t\t -ods Host (run Omp variants with Host memory)\n\n"; str << "\t --omptarget-data-space, -otds [Default is OmpTarget]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with OpenMP target execution)\n"; str << "\t\t Examples...\n" << "\t\t --omptarget-data-space OmpTarget (run Omp Target variants with Omp Target memory)\n" << "\t\t -otds CudaPinned (run Omp Target variants with Cuda Pinned memory)\n\n"; str << "\t --cuda-data-space, -cds [Default is CudaDevice]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with cuda execution)\n"; str << "\t\t Examples...\n" << "\t\t --cuda-data-space CudaManaged (run CUDA variants with Cuda Managed memory)\n" << "\t\t -cds CudaPinned (run CUDA variants with Cuda Pinned memory)\n\n"; str << "\t --hip-data-space, -hds [Default is HipDevice]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with hip execution)\n"; str << "\t\t Examples...\n" << "\t\t --hip-data-space HipManaged (run HIP variants with Hip Managed memory)\n" << "\t\t -hds HipPinned (run HIP variants with Hip Pinned memory)\n\n"; str << "\t --kokkos-data-space, -kds [Default is Host]\n" - << "\t (names of data space to use)\n"; + << "\t (name of data space to use with kokkos execution)\n"; str << "\t\t Examples...\n" << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + str << "\t --seq-mpi-data-space, -smpids [Default is Host]\n" + << "\t (name of data space to use with MPI and sequential execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-mpi-data-space Host (run sequential variants with Host memory for MPI buffers)\n" + << "\t\t -smpids Copy (run sequential variants and copy to Host memory for MPI buffers)\n\n"; + + str << "\t --omp-mpi-data-space, -ompids [Default is Omp]\n" + << "\t (name of data space to use with MPI and OpenMP execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-mpi-data-space Omp (run Omp variants with Omp memory for MPI buffers)\n" + << "\t\t -ompids Host (run Omp variants with Host memory for MPI buffers)\n\n"; + + str << "\t --omptarget-mpi-data-space, -otmpids [Default is Copy]\n" + << "\t (name of data space to use with MPI and OpenMP target execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-mpi-data-space Copy (run Omp Target variants and copy to Host memory for MPI buffers)\n" + << "\t\t -otmpids OmpTarget (run Omp Target variants with OmpTarget memory for MPI buffers (assumes MPI can access OmpTarget memory))\n\n"; + + str << "\t --cuda-mpi-data-space, -cmpids [Default is CudaPinned]\n" + << "\t (name of data space to use with MPI and cuda execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-mpi-data-space CudaPinned (run CUDA variants with Cuda Pinned memory for MPI buffers)\n" + << "\t\t -cmpids CudaDevice (run CUDA variants with Cuda Device memory for MPI buffers (assumes MPI is cuda/gpu aware))\n\n"; + + str << "\t --hip-mpi-data-space, -hmpids [Default is 
HipPinned]\n" + << "\t (name of data space to use with MPI and hip execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-mpi-data-space Copy (run HIP variants and copy to Host memory for MPI buffers)\n" + << "\t\t -hmpids HipDevice (run HIP variants with Hip Device memory for MPI buffers (assumes MPI is hip/gpu aware))\n\n"; + + str << "\t --kokkos-mpi-data-space, -kmpids [Default is Copy]\n" + << "\t (name of data space to use with MPI and kokkos execution)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n" + << "\t\t -kmpids HipPinned (run KOKKOS variants with Hip Pinned memory for MPI buffers)\n\n"; + str << "\t --features, -f [Default is run all]\n" << "\t (names of features to run)\n"; str << "\t\t Examples...\n" @@ -881,6 +955,12 @@ void RunParams::printDataSpaceNames(std::ostream& str) const str << getDataSpaceName(ds) << std::endl; } } + str << "\nPseudo data spaces:"; + str << "\n-------------------\n"; + for (int ids = static_cast<int>(DataSpace::NumSpaces)+1; ids < static_cast<int>(DataSpace::EndPseudoSpaces); ++ids) { + DataSpace ds = static_cast<DataSpace>(ids); + str << getDataSpaceName(ds) << std::endl; + } str.flush(); }
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 2c81a4f70..4a07f4ddd 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -145,6 +145,13 @@ class RunParams { DataSpace getHipDataSpace() const { return hipDataSpace; } DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } + DataSpace getSeqMPIDataSpace() const { return seqMPIDataSpace; } + DataSpace getOmpMPIDataSpace() const { return ompMPIDataSpace; } + DataSpace getOmpTargetMPIDataSpace() const { return ompTargetMPIDataSpace; } + DataSpace getCudaMPIDataSpace() const { return cudaMPIDataSpace; } + DataSpace getHipMPIDataSpace() const { return hipMPIDataSpace; } + DataSpace getKokkosMPIDataSpace() const { return kokkosMPIDataSpace; } + double getPFTolerance() const { return pf_tol; } int getCheckRunReps() const { return checkrun_reps; } @@ -262,6 +269,13 @@ class RunParams { DataSpace hipDataSpace = DataSpace::HipDevice; DataSpace kokkosDataSpace = DataSpace::Host; + DataSpace seqMPIDataSpace = DataSpace::Host; + DataSpace ompMPIDataSpace = DataSpace::Omp; + DataSpace ompTargetMPIDataSpace = DataSpace::Copy; + DataSpace cudaMPIDataSpace = DataSpace::CudaPinned; + DataSpace hipMPIDataSpace = DataSpace::HipPinned; + DataSpace kokkosMPIDataSpace = DataSpace::Copy; + // // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args.
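[Editor's sketch, not part of the patch series] The two buffering modes described in the patch above can be condensed into a single per-neighbor send routine. The code below is an illustration only: the function name send_one_neighbor is invented, std::memcpy stands in for the suite's copyData helper (which also handles non-Host data spaces), and Real_type is assumed to be double so that Real_MPI_type is MPI_DOUBLE.

  #include <mpi.h>
  #include <cstring>

  // Selecting the pseudo data space "Copy" turns on separate_buffers: the
  // kernel packs into pack_buffer in its execution data space, the packed
  // values are staged into a Host send_buffer, and MPI reads the Host buffer.
  // Otherwise send_buffer aliases pack_buffer and MPI reads the packed data
  // directly (which, for device memory, assumes GPU-aware MPI).
  void send_one_neighbor(double* pack_buffer, double* send_buffer,
                         int len, int num_vars, bool separate_buffers,
                         int mpi_rank, int tag, MPI_Request* request)
  {
    // ... pack kernel has already written len*num_vars values to pack_buffer ...

    if (separate_buffers) {
      // Staged mode: copy the packed data into the Host buffer handed to MPI.
      std::memcpy(send_buffer, pack_buffer, len * num_vars * sizeof(double));
    }

    MPI_Isend(send_buffer, len * num_vars, MPI_DOUBLE,
              mpi_rank, tag, MPI_COMM_WORLD, request);
  }

The receive side mirrors this: MPI_Irecv fills a Host recv_buffer and, in staged mode, the received values are copied into unpack_buffer in the execution data space before the unpack kernel runs.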
From f5e44b18d27065a9628fb0e13def203ab33ca058 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 1 Jun 2023 12:20:58 -0700 Subject: [PATCH 010/454] Implement Lambda and RAJA Seq variants --- src/apps/MPI_HALOEXCHANGE-Seq.cpp | 76 +++++++++++++++++++++++++++---- src/apps/MPI_HALOEXCHANGE.cpp | 4 +- src/apps/MPI_HALOEXCHANGE.hpp | 1 - 3 files changed, 68 insertions(+), 13 deletions(-) diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp index 019512d43..3bd185d8b 100644 --- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp @@ -66,10 +66,10 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type ll = 0; ll < num_neighbors; ++ll) { int l = -1; MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; - if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], @@ -93,14 +93,21 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t break; } -#if defined(RUN_RAJA_SEQ) && 0 +#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -113,12 +120,31 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + int mpi_rank = mpi_ranks[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -131,6 +157,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } } + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + } stopTimer(); @@ -145,7 +173,14 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + 
Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -158,12 +193,31 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + int mpi_rank = mpi_ranks[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -176,6 +230,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } } + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + } stopTimer(); diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp index d6c746120..e4e1f2174 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -26,8 +26,8 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) setUsesFeature(MPI); setVariantDefined( Base_Seq ); - // setVariantDefined( Lambda_Seq ); - // setVariantDefined( RAJA_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); // setVariantDefined( Base_OpenMP ); // setVariantDefined( Lambda_OpenMP ); diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp index f1067ed7b..f6c607f59 100644 --- a/src/apps/MPI_HALOEXCHANGE.hpp +++ b/src/apps/MPI_HALOEXCHANGE.hpp @@ -48,7 +48,6 @@ #define MPI_HALOEXCHANGE_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ \ - const int my_mpi_rank = m_my_mpi_rank; \ std::vector mpi_ranks = m_mpi_ranks; \ \ std::vector pack_mpi_requests(num_neighbors); \ From a943f7412afc98d1826bbceb9ab4d1d7b04d7425 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 1 Jun 2023 12:28:15 -0700 Subject: [PATCH 011/454] Implement omp variants --- src/apps/MPI_HALOEXCHANGE-OMP.cpp | 110 ++++++++++++++++++++++++++---- src/apps/MPI_HALOEXCHANGE.cpp | 6 +- 2 files changed, 100 insertions(+), 16 deletions(-) diff --git a/src/apps/MPI_HALOEXCHANGE-OMP.cpp b/src/apps/MPI_HALOEXCHANGE-OMP.cpp index 82fa3b8cc..11a1b6ece 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMP.cpp +++ b/src/apps/MPI_HALOEXCHANGE-OMP.cpp @@ -22,7 +22,7 @@ namespace apps void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) && 0 +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); @@ -36,7 +36,14 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, 
Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -47,12 +54,31 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + int mpi_rank = mpi_ranks[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for @@ -63,6 +89,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } } + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + } stopTimer(); @@ -75,7 +103,14 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -89,12 +124,31 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + int mpi_rank = mpi_ranks[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -108,6 +162,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } } + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + } stopTimer(); @@ -122,7 +178,14 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -135,12 +198,31 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + + int mpi_rank = mpi_ranks[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); } - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + for (Index_type ll = 0; ll < num_neighbors; ++ll) { + int l = -1; + MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE); + + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -153,6 +235,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } } + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + } stopTimer(); diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp index e4e1f2174..2f242852e 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -29,9 +29,9 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); - // setVariantDefined( Base_OpenMP ); - // setVariantDefined( Lambda_OpenMP ); - // setVariantDefined( RAJA_OpenMP ); + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); // setVariantDefined( Base_OpenMPTarget ); // setVariantDefined( RAJA_OpenMPTarget ); From 5dbab534d1e165511d6813fb2b039f74769bf367 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 1 Jun 2023 12:28:43 -0700 Subject: [PATCH 012/454] Implement ompt variants --- src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp | 74 +++++++++++++++++++++---- src/apps/MPI_HALOEXCHANGE.cpp | 4 +- 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp index e93ddfe3c..f30d0ae59 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp @@ -29,7 +29,6 @@ namespace apps void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - #if 0 const Index_type run_reps = getRunReps(); MPI_HALOEXCHANGE_DATA_SETUP; @@ -40,7 +39,14 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Index_type len = unpack_index_list_lengths[l]; + int mpi_rank = mpi_ranks[l]; + MPI_Irecv(recv_buffers[l], 
len*num_vars, Real_MPI_type,
+                mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -52,12 +58,31 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU
           }
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           #pragma omp target is_device_ptr(buffer, list, var) device( did )
@@ -69,6 +94,8 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU
         }
       }
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
@@ -80,7 +107,14 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Index_type len = unpack_index_list_lengths[l];
+        int mpi_rank = mpi_ranks[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -93,12 +127,31 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU
             haloexchange_pack_base_lam );
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           auto haloexchange_unpack_base_lam = [=](Index_type i) {
@@ -111,13 +164,14 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU
         }
       }
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
   } else {
     getCout() << "\n  MPI_HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl;
   }
-#endif
 }
 
 } // end namespace apps
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index 2f242852e..a32a25954 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -33,8 +33,8 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params)
   setVariantDefined( Lambda_OpenMP );
   setVariantDefined( RAJA_OpenMP );
 
-  // setVariantDefined( Base_OpenMPTarget );
-  // setVariantDefined( RAJA_OpenMPTarget );
+  setVariantDefined( Base_OpenMPTarget );
+  setVariantDefined( RAJA_OpenMPTarget );
 
   // setVariantDefined( Base_CUDA );
   // setVariantDefined( RAJA_CUDA );

From a69407822e066d986f7a2b1e8a491e0febaae60f Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 1 Jun 2023 12:33:44 -0700
Subject: [PATCH 013/454] Implement cuda variants

---
 src/apps/MPI_HALOEXCHANGE-Cuda.cpp | 78 +++++++++++++++++++++++++-----
 src/apps/MPI_HALOEXCHANGE.cpp | 4 +-
 2 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp
index 431b5e5a7..23a5ff045 100644
--- a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp
+++ b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp
@@ -49,7 +49,6 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var,
 template < size_t block_size >
 void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
 {
-#if 0
   const Index_type run_reps = getRunReps();
 
   MPI_HALOEXCHANGE_DATA_SETUP;
@@ -60,7 +59,14 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Index_type len = unpack_index_list_lengths[l];
+        int mpi_rank = mpi_ranks[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -71,13 +77,32 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
           cudaErrchk( cudaGetLastError() );
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        synchronize();
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
-      synchronize();
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           dim3 nthreads_per_block(block_size);
@@ -89,6 +114,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
       }
       synchronize();
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
@@ -100,7 +127,14 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Index_type len = unpack_index_list_lengths[l];
+        int mpi_rank = mpi_ranks[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -113,13 +147,32 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
             haloexchange_pack_base_lam );
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        synchronize();
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
-      synchronize();
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) {
@@ -133,13 +186,14 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
       }
       synchronize();
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
   } else {
     getCout() << "\n  MPI_HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl;
   }
-#endif
 }
 
 RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE, Cuda)
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index a32a25954..16cc4bcdc 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -36,8 +36,8 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params)
   setVariantDefined( Base_OpenMPTarget );
   setVariantDefined( RAJA_OpenMPTarget );
 
-  // setVariantDefined( Base_CUDA );
-  // setVariantDefined( RAJA_CUDA );
+  setVariantDefined( Base_CUDA );
+  setVariantDefined( RAJA_CUDA );
 
   // setVariantDefined( Base_HIP );
   // setVariantDefined( RAJA_HIP );

From e9f26dc43f67a562f556cb245c27fe16c46612d3 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 1 Jun 2023 12:34:59 -0700
Subject: [PATCH 014/454] Implement hip variants

---
 src/apps/MPI_HALOEXCHANGE-Hip.cpp | 78 ++++++++++++++++++++++++++-----
 src/apps/MPI_HALOEXCHANGE.cpp | 4 +-
 2 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/apps/MPI_HALOEXCHANGE-Hip.cpp b/src/apps/MPI_HALOEXCHANGE-Hip.cpp
index 78cc7903b..e43a4c772 100644
--- a/src/apps/MPI_HALOEXCHANGE-Hip.cpp
+++ b/src/apps/MPI_HALOEXCHANGE-Hip.cpp
@@ -49,7 +49,6 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var,
 template < size_t block_size >
 void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
 {
-#if 0
   const Index_type run_reps = getRunReps();
 
   MPI_HALOEXCHANGE_DATA_SETUP;
@@ -60,7 +59,14 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Index_type len = unpack_index_list_lengths[l];
+        int mpi_rank = mpi_ranks[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -72,13 +78,32 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
           hipErrchk( hipGetLastError() );
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        synchronize();
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
-      synchronize();
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           dim3 nthreads_per_block(block_size);
@@ -91,6 +116,8 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
       }
       synchronize();
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
@@ -102,7 +129,14 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Index_type len = unpack_index_list_lengths[l];
+        int mpi_rank = mpi_ranks[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -115,13 +149,32 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
             haloexchange_pack_base_lam );
           buffer += len;
         }
+
+        if (separate_buffers) {
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+
+        synchronize();
+        int mpi_rank = mpi_ranks[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+                  mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]);
       }
-      synchronize();
 
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+      for (Index_type ll = 0; ll < num_neighbors; ++ll) {
+        int l = -1;
+        MPI_Waitany(num_neighbors, unpack_mpi_requests.data(), &l, MPI_STATUS_IGNORE);
+
+        Real_ptr buffer = unpack_buffers[l];
         Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
           auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) {
@@ -135,13 +188,14 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid)
       }
       synchronize();
 
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
     }
     stopTimer();
 
   } else {
     getCout() << "\n  MPI_HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl;
   }
-#endif
 }
 
 RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE, Hip)
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index 16cc4bcdc..17ac58709 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -39,8 +39,8 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params)
   setVariantDefined( Base_CUDA );
   setVariantDefined( RAJA_CUDA );
 
-  // setVariantDefined( Base_HIP );
-  // setVariantDefined( RAJA_HIP );
+  setVariantDefined( Base_HIP );
+  setVariantDefined( RAJA_HIP );
 }
 
 MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE()

From a82c2fc697b2077aa5eec57aa419f7f0e4b761d0 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 1 Jun 2023 13:24:28 -0700
Subject: [PATCH 015/454] Move mpi ranks setup into HALOEXCHANGE base

---
 src/apps/HALOEXCHANGE_base.cpp | 26 ++++++++++++++++++++++++++
 src/apps/HALOEXCHANGE_base.hpp | 12 ++++++++++++
 src/apps/MPI_HALOEXCHANGE.cpp | 6 +++---
 src/apps/MPI_HALOEXCHANGE.hpp | 1 +
 4 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp
index f1d3ece7c..2e98e5337 100644
--- a/src/apps/HALOEXCHANGE_base.cpp
+++ b/src/apps/HALOEXCHANGE_base.cpp
@@ -453,5 +453,31 @@ void HALOEXCHANGE_base::destroy_unpack_lists(
   }
 }
 
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+
+void HALOEXCHANGE_base::create_rank_list(
+    int my_mpi_rank, int mpi_size,
+    std::vector& mpi_ranks,
+    const Index_type num_neighbors,
+    VariantID RAJAPERF_UNUSED_ARG(vid))
+{
+  for (Index_type l = 0; l < num_neighbors; ++l) {
+    mpi_ranks[l] = my_mpi_rank; // send and recv to own rank
+  }
+}
+
+//
+// Function to destroy the rank list.
+//
+void HALOEXCHANGE_base::destroy_rank_list(
+    const Index_type RAJAPERF_UNUSED_ARG(num_neighbors),
+    VariantID RAJAPERF_UNUSED_ARG(vid))
+{
+
+}
+
+#endif
+
 } // end namespace apps
 } // end namespace rajaperf
diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp
index 153387b5d..991fda85b 100644
--- a/src/apps/HALOEXCHANGE_base.hpp
+++ b/src/apps/HALOEXCHANGE_base.hpp
@@ -155,6 +155,18 @@ class HALOEXCHANGE_base : public KernelBase
       std::vector& unpack_index_lists,
       const Index_type num_neighbors,
       VariantID vid);
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  void create_rank_list(
+      int my_mpi_rank, int mpi_size,
+      std::vector& mpi_ranks,
+      const Index_type num_neighbors,
+      VariantID vid);
+
+  void destroy_rank_list(
+      const Index_type num_neighbors,
+      VariantID vid);
+#endif
 };
 
 } // end namespace apps
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index 17ac58709..a75a14e77 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -51,12 +51,11 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
 {
   HALOEXCHANGE_base::setUp(vid, tune_idx);
 
+  MPI_Comm_size(MPI_COMM_WORLD, &m_mpi_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank);
 
   m_mpi_ranks.resize(s_num_neighbors, -1);
-  for (Index_type l = 0; l < s_num_neighbors; ++l) {
-    m_mpi_ranks[l] = m_my_mpi_rank; // send and recv to own rank
-  }
+  HALOEXCHANGE_base::create_rank_list(m_my_mpi_rank, m_mpi_size, m_mpi_ranks, s_num_neighbors, vid);
 
   const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy);
 
@@ -114,6 +113,7 @@ void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx)
   m_send_buffers.clear();
   m_pack_buffers.clear();
 
+  HALOEXCHANGE_base::destroy_rank_list(s_num_neighbors, vid);
   m_mpi_ranks.clear();
 
   HALOEXCHANGE_base::tearDown(vid, tune_idx);
diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp
index f6c607f59..5a91ab243 100644
--- a/src/apps/MPI_HALOEXCHANGE.hpp
+++ b/src/apps/MPI_HALOEXCHANGE.hpp
@@ -105,6 +105,7 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base
   static const size_t default_gpu_block_size = 256;
   using gpu_block_sizes_type = gpu_block_size::make_list_type;
 
+  int m_mpi_size = -1;
   int m_my_mpi_rank = -1;
   std::vector m_mpi_ranks;
 

From cf7bd1560265409fe3ff8f39e8c896685733b70b Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 2 Jun 2023 09:12:04 -0700
Subject: [PATCH 016/454] Add initial multi rank communication

Note that this is incorrect for periodic boundaries

---
 src/apps/HALOEXCHANGE_base.cpp | 311 ++++++++++-----------------------
 src/apps/HALOEXCHANGE_base.hpp | 17 +-
 src/apps/MPI_HALOEXCHANGE.cpp | 8 +-
 3 files changed, 111 insertions(+), 225 deletions(-)

diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp
index 2e98e5337..34c01a10b 100644
--- a/src/apps/HALOEXCHANGE_base.cpp
+++ b/src/apps/HALOEXCHANGE_base.cpp
@@ -109,52 +109,84 @@ void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
 }
 
 
-enum struct location : int
-{
-  low_phony,
-  low_interior,
-  all_interior,
-  high_interior,
-  high_phony
+static constexpr int neighbor_offsets[26][3]{
+
+  // faces
+  {-1,  0,  0},
+  { 1,  0,  0},
+  { 0, -1,  0},
+  { 0,  1,  0},
+  { 0,  0, -1},
+  { 0,  0,  1},
+
+  // edges
+  {-1, -1,  0},
+  {-1,  1,  0},
+  { 1, -1,  0},
+  { 1,  1,  0},
+  {-1,  0, -1},
+  {-1,  0,  1},
+  { 1,  0, -1},
+  { 1,  0,  1},
+  { 0, -1, -1},
+  { 0, -1,  1},
+  { 0,  1, -1},
+  { 0,  1,  1},
+
+  // corners
+  {-1, -1, -1},
+  {-1, -1,  1},
+  {-1,  1, -1},
+  {-1,  1,  1},
+  { 1, -1, -1},
+  { 1, -1,  1},
+  { 1,  1, -1},
+  { 1,  1,  1}
+
 };
 
-HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_extent(
-    HALOEXCHANGE_base::location x_extent,
-    HALOEXCHANGE_base::location y_extent,
-    HALOEXCHANGE_base::location z_extent,
+HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
+    const HALOEXCHANGE_base::message_type msg_type,
+    const int (&neighbor_offset)[3],
     const Index_type halo_width, const Index_type* grid_dims)
 {
-  auto get_bounds = [&](location loc, Index_type dim_size) {
+  if (msg_type != message_type::send &&
+      msg_type != message_type::recv) {
+    throw std::runtime_error("make_boundary_extent: Invalid message type");
+  }
+  auto get_bounds = [&](int offset, Index_type dim_size) {
     std::pair bounds;
-    switch (loc) {
-      case location::low_phony:
-        bounds.first  = 0;
-        bounds.second = halo_width;
-        break;
-      case location::low_interior:
-        bounds.first  = halo_width;
-        bounds.second = halo_width + halo_width;
+    switch (offset) {
+      case -1:
+        if (msg_type == message_type::send) {
+          bounds.first  = halo_width;
+          bounds.second = halo_width + halo_width;
+        } else { // (msg_type == message_type::recv)
+          bounds.first  = 0;
+          bounds.second = halo_width;
+        }
         break;
-      case location::all_interior:
+      case 0:
         bounds.first  = halo_width;
         bounds.second = halo_width + dim_size;
         break;
-      case location::high_interior:
-        bounds.first  = halo_width + dim_size - halo_width;
-        bounds.second = halo_width + dim_size;
-        break;
-      case location::high_phony:
-        bounds.first  = halo_width + dim_size;
-        bounds.second = halo_width + dim_size + halo_width;
+      case 1:
+        if (msg_type == message_type::send) {
+          bounds.first  = halo_width + dim_size - halo_width;
+          bounds.second = halo_width + dim_size;
+        } else { // (msg_type == message_type::recv)
+          bounds.first  = halo_width + dim_size;
+          bounds.second = halo_width + dim_size + halo_width;
+        }
         break;
      default:
        throw std::runtime_error("make_extent: Invalid location");
    }
    return bounds;
  };
-  auto x_bounds = get_bounds(x_extent, grid_dims[0]);
-  auto y_bounds = get_bounds(y_extent, grid_dims[1]);
-  auto z_bounds = get_bounds(z_extent, grid_dims[2]);
+  auto x_bounds = get_bounds(neighbor_offset[0], grid_dims[0]);
+  auto y_bounds = get_bounds(neighbor_offset[1], grid_dims[1]);
+  auto z_bounds = get_bounds(neighbor_offset[2], grid_dims[2]);
 
   return {x_bounds.first, x_bounds.second,
           y_bounds.first, y_bounds.second,
          z_bounds.first, z_bounds.second};
@@ -171,101 +203,14 @@ void HALOEXCHANGE_base::create_pack_lists(
     const Index_type num_neighbors,
     VariantID vid)
 {
-  std::vector pack_index_list_extents(num_neighbors);
-
-  // The pack extents have high and low flipped compared to the unpack extents.
-
-  // faces
-  pack_index_list_extents[0]  = make_extent(location::low_interior,
-                                            location::all_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[1]  = make_extent(location::high_interior,
-                                            location::all_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[2]  = make_extent(location::all_interior,
-                                            location::low_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[3]  = make_extent(location::all_interior,
-                                            location::high_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[4]  = make_extent(location::all_interior,
-                                            location::all_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[5]  = make_extent(location::all_interior,
-                                            location::all_interior,
-                                            location::high_interior, halo_width, grid_dims);
-
-  // edges
-  pack_index_list_extents[6]  = make_extent(location::low_interior,
-                                            location::low_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[7]  = make_extent(location::low_interior,
-                                            location::high_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[8]  = make_extent(location::high_interior,
-                                            location::low_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[9]  = make_extent(location::high_interior,
-                                            location::high_interior,
-                                            location::all_interior, halo_width, grid_dims);
-  pack_index_list_extents[10] = make_extent(location::low_interior,
-                                            location::all_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[11] = make_extent(location::low_interior,
-                                            location::all_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[12] = make_extent(location::high_interior,
-                                            location::all_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[13] = make_extent(location::high_interior,
-                                            location::all_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[14] = make_extent(location::all_interior,
-                                            location::low_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[15] = make_extent(location::all_interior,
-                                            location::low_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[16] = make_extent(location::all_interior,
-                                            location::high_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[17] = make_extent(location::all_interior,
-                                            location::high_interior,
-                                            location::high_interior, halo_width, grid_dims);
-
-  // corners
-  pack_index_list_extents[18] = make_extent(location::low_interior,
-                                            location::low_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[19] = make_extent(location::low_interior,
-                                            location::low_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[20] = make_extent(location::low_interior,
-                                            location::high_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[21] = make_extent(location::low_interior,
-                                            location::high_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[22] = make_extent(location::high_interior,
-                                            location::low_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[23] = make_extent(location::high_interior,
-                                            location::low_interior,
-                                            location::high_interior, halo_width, grid_dims);
-  pack_index_list_extents[24] = make_extent(location::high_interior,
-                                            location::high_interior,
-                                            location::low_interior, halo_width, grid_dims);
-  pack_index_list_extents[25] = make_extent(location::high_interior,
-                                            location::high_interior,
-                                            location::high_interior, halo_width, grid_dims);
-
   const Index_type grid_i_stride = 1;
   const Index_type grid_j_stride = grid_dims[0] + 2*halo_width;
   const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width);
 
   for (Index_type l = 0; l < num_neighbors; ++l) {
 
-    Extent extent = pack_index_list_extents[l];
+    Extent extent = make_boundary_extent(message_type::send, neighbor_offsets[l],
+                                         halo_width, grid_dims);
 
     pack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
                                  (extent.j_max - extent.j_min) *
                                  (extent.k_max - extent.k_min) ;
@@ -282,8 +227,8 @@ void HALOEXCHANGE_base::create_pack_lists(
         for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
 
           Index_type pack_idx = ii * grid_i_stride +
-                                 jj * grid_j_stride +
-                                 kk * grid_k_stride ;
+                                jj * grid_j_stride +
+                                kk * grid_k_stride ;
 
           pack_list[list_idx] = pack_idx;
 
@@ -317,101 +262,14 @@ void HALOEXCHANGE_base::create_unpack_lists(
     const Index_type num_neighbors,
     VariantID vid)
 {
-  std::vector unpack_index_list_extents(num_neighbors);
-
-  // The pack extents have high and low flipped compared to the unpack extents.
-
-  // faces
-  unpack_index_list_extents[0]  = make_extent(location::high_phony,
-                                              location::all_interior,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[1]  = make_extent(location::low_phony,
-                                              location::all_interior,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[2]  = make_extent(location::all_interior,
-                                              location::high_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[3]  = make_extent(location::all_interior,
-                                              location::low_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[4]  = make_extent(location::all_interior,
-                                              location::all_interior,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[5]  = make_extent(location::all_interior,
-                                              location::all_interior,
-                                              location::low_phony, halo_width, grid_dims);
-
-  // edges
-  unpack_index_list_extents[6]  = make_extent(location::high_phony,
-                                              location::high_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[7]  = make_extent(location::high_phony,
-                                              location::low_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[8]  = make_extent(location::low_phony,
-                                              location::high_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[9]  = make_extent(location::low_phony,
-                                              location::low_phony,
-                                              location::all_interior, halo_width, grid_dims);
-  unpack_index_list_extents[10] = make_extent(location::high_phony,
-                                              location::all_interior,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[11] = make_extent(location::high_phony,
-                                              location::all_interior,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[12] = make_extent(location::low_phony,
-                                              location::all_interior,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[13] = make_extent(location::low_phony,
-                                              location::all_interior,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[14] = make_extent(location::all_interior,
-                                              location::high_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[15] = make_extent(location::all_interior,
-                                              location::high_phony,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[16] = make_extent(location::all_interior,
-                                              location::low_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[17] = make_extent(location::all_interior,
-                                              location::low_phony,
-                                              location::low_phony, halo_width, grid_dims);
-
-  // corners
-  unpack_index_list_extents[18] = make_extent(location::high_phony,
-                                              location::high_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[19] = make_extent(location::high_phony,
-                                              location::high_phony,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[20] = make_extent(location::high_phony,
-                                              location::low_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[21] = make_extent(location::high_phony,
-                                              location::low_phony,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[22] = make_extent(location::low_phony,
-                                              location::high_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[23] = make_extent(location::low_phony,
-                                              location::high_phony,
-                                              location::low_phony, halo_width, grid_dims);
-  unpack_index_list_extents[24] = make_extent(location::low_phony,
-                                              location::low_phony,
-                                              location::high_phony, halo_width, grid_dims);
-  unpack_index_list_extents[25] = make_extent(location::low_phony,
-                                              location::low_phony,
-                                              location::low_phony, halo_width, grid_dims);
-
   const Index_type grid_i_stride = 1;
   const Index_type grid_j_stride = grid_dims[0] + 2*halo_width;
   const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width);
 
   for (Index_type l = 0; l < num_neighbors; ++l) {
 
-    Extent extent = unpack_index_list_extents[l];
+    Extent extent = make_boundary_extent(message_type::recv, neighbor_offsets[l],
+                                         halo_width, grid_dims);
 
     unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
                                    (extent.j_max - extent.j_min) *
                                    (extent.k_max - extent.k_min) ;
@@ -428,8 +286,8 @@ void HALOEXCHANGE_base::create_unpack_lists(
         for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
 
           Index_type unpack_idx = ii * grid_i_stride +
-                                   jj * grid_j_stride +
-                                   kk * grid_k_stride ;
+                                  jj * grid_j_stride +
+                                  kk * grid_k_stride ;
 
           unpack_list[list_idx] = unpack_idx;
 
@@ -457,13 +315,36 @@ void HALOEXCHANGE_base::destroy_unpack_lists(
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
 
 void HALOEXCHANGE_base::create_rank_list(
-    int my_mpi_rank, int mpi_size,
+    int my_mpi_rank,
+    const int (&mpi_dims)[3],
     std::vector& mpi_ranks,
     const Index_type num_neighbors,
     VariantID RAJAPERF_UNUSED_ARG(vid))
 {
+  int my_mpi_idx[3]{-1,-1,-1};
+  my_mpi_idx[2] = my_mpi_rank / (mpi_dims[0]*mpi_dims[1]);
+  my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0];
+  my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0];
+
   for (Index_type l = 0; l < num_neighbors; ++l) {
-    mpi_ranks[l] = my_mpi_rank; // send and recv to own rank
+
+    const int (&mpi_offset)[3] = neighbor_offsets[l];
+
+    int neighbor_mpi_idx[3] = {my_mpi_idx[0]+mpi_offset[0],
+                               my_mpi_idx[1]+mpi_offset[1],
+                               my_mpi_idx[2]+mpi_offset[2]};
+
+    // fix neighbor indices on periodic boundaries
+    // this assumes that the offsets are at most 1 and at least -1
+    for (int dim = 0; dim < 3; ++dim) {
+      if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) {
+        neighbor_mpi_idx[dim] = 0;
+      } else if (neighbor_mpi_idx[dim] < 0) {
+        neighbor_mpi_idx[dim] = mpi_dims[dim]-1;
+      }
+    }
+
+    mpi_ranks[l] = neighbor_mpi_idx[0] + mpi_dims[0]*(neighbor_mpi_idx[1] + mpi_dims[1]*neighbor_mpi_idx[2]);
   }
 }
 
diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp
index 991fda85b..028a62bf3 100644
--- a/src/apps/HALOEXCHANGE_base.hpp
+++ b/src/apps/HALOEXCHANGE_base.hpp
@@ -88,13 +88,10 @@ class HALOEXCHANGE_base : public KernelBase
   void tearDown(VariantID vid, size_t tune_idx);
 
 protected:
-  enum struct location : int
+  enum struct message_type : int
   {
-    low_phony,
-    low_interior,
-    all_interior,
-    high_interior,
-    high_phony
+    send,
+    recv
   };
 
   struct Extent
@@ -128,8 +125,9 @@ class HALOEXCHANGE_base : public KernelBase
   std::vector m_unpack_index_lists;
   std::vector m_unpack_index_list_lengths;
 
-  Extent make_extent(
-      location x_extent, location y_extent, location z_extent,
+  Extent make_boundary_extent(
+      const message_type msg_type,
+      const int (&neighbor_offset)[3],
       const Index_type halo_width, const Index_type* grid_dims);
 
   void create_pack_lists(
@@ -155,7 +156,8 @@ class HALOEXCHANGE_base : public KernelBase
 
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
   void create_rank_list(
-      int my_mpi_rank, int mpi_size,
+      int my_mpi_rank,
+      const int (&mpi_dims)[3],
       std::vector& mpi_ranks,
       const Index_type num_neighbors,
       VariantID vid);
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index a75a14e77..cf6d5b27a 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -54,8 +54,14 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
   MPI_Comm_size(MPI_COMM_WORLD, &m_mpi_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank);
 
+  const int mpi_dim = std::cbrt(m_mpi_size);
+  const int mpi_dims[3] = {mpi_dim, mpi_dim, mpi_dim};
+  if (mpi_dims[0] * mpi_dims[1] * mpi_dims[2] != m_mpi_size) {
+    throw std::runtime_error("mpi dims do not match mpi size");
+  }
+
   m_mpi_ranks.resize(s_num_neighbors, -1);
-  HALOEXCHANGE_base::create_rank_list(m_my_mpi_rank, m_mpi_size, m_mpi_ranks, s_num_neighbors, vid);
+  HALOEXCHANGE_base::create_rank_list(m_my_mpi_rank, mpi_dims, m_mpi_ranks, s_num_neighbors, vid);
 
   const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy);
 

From c10b272dddfff1f2a2a8edd787d77dbb7a5b3d33 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 2 Jun 2023 10:24:49 -0700
Subject: [PATCH 017/454] Only run MPI_HALOEXCHANGE if ranks can form a perfect
 cube

Remove all variants of MPI_HALOEXCHANGE when the number of ranks is not a
perfect cube of an integer.
Restore periodic boundaries.

---
 src/apps/HALOEXCHANGE.cpp | 6 +-
 src/apps/HALOEXCHANGE_FUSED.cpp | 6 +-
 src/apps/HALOEXCHANGE_base.cpp | 244 +++++++++++++++-----------------
 src/apps/HALOEXCHANGE_base.hpp | 41 ++----
 src/apps/MPI_HALOEXCHANGE.cpp | 54 ++++---
 src/apps/MPI_HALOEXCHANGE.hpp | 2 +-
 6 files changed, 158 insertions(+), 195 deletions(-)

diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp
index 08f1849a5..28f7952ef 100644
--- a/src/apps/HALOEXCHANGE.cpp
+++ b/src/apps/HALOEXCHANGE.cpp
@@ -46,7 +46,9 @@ HALOEXCHANGE::~HALOEXCHANGE()
 
 void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
 {
-  HALOEXCHANGE_base::setUp(vid, tune_idx);
+  int my_mpi_rank = 0;
+  const int mpi_dims[3] = {1,1,1};
+  setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx);
 
   m_buffers.resize(s_num_neighbors, nullptr);
   for (Index_type l = 0; l < s_num_neighbors; ++l) {
@@ -62,7 +64,7 @@ void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx)
   }
   m_buffers.clear();
 
-  HALOEXCHANGE_base::tearDown(vid, tune_idx);
+  tearDown_base(vid, tune_idx);
 }
 
 } // end namespace apps
diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/apps/HALOEXCHANGE_FUSED.cpp
index 34b4553c2..2ecf98d73 100644
--- a/src/apps/HALOEXCHANGE_FUSED.cpp
+++ b/src/apps/HALOEXCHANGE_FUSED.cpp
@@ -46,7 +46,9 @@ HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED()
 
 void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx)
 {
-  HALOEXCHANGE_base::setUp(vid, tune_idx);
+  int my_mpi_rank = 0;
+  const int mpi_dims[3] = {1,1,1};
+  setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx);
 
   m_buffers.resize(s_num_neighbors, nullptr);
   for (Index_type l = 0; l < s_num_neighbors; ++l) {
@@ -62,7 +64,7 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx)
   }
   m_buffers.clear();
 
-  HALOEXCHANGE_base::tearDown(vid, tune_idx);
+  tearDown_base(vid, tune_idx);
 }
 
 } // end namespace apps
diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp
index 34c01a10b..cdd0b5b7a 100644
--- a/src/apps/HALOEXCHANGE_base.cpp
+++ b/src/apps/HALOEXCHANGE_base.cpp
@@ -62,7 +62,8 @@ HALOEXCHANGE_base::~HALOEXCHANGE_base()
 {
 }
 
-void HALOEXCHANGE_base::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int (&mpi_dims)[3],
+                                   VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   m_vars.resize(m_num_vars, nullptr);
   for (Index_type v = 0; v < m_num_vars; ++v) {
@@ -76,13 +77,16 @@ void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int (&mpi_dims)[
     }
   }
 
+  m_mpi_ranks.resize(s_num_neighbors, -1);
   m_pack_index_lists.resize(s_num_neighbors, nullptr);
   m_pack_index_list_lengths.resize(s_num_neighbors, 0);
-  create_pack_lists(m_pack_index_lists, m_pack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid);
-
   m_unpack_index_lists.resize(s_num_neighbors, nullptr);
   m_unpack_index_list_lengths.resize(s_num_neighbors, 0);
-  create_unpack_lists(m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid);
+  create_lists(my_mpi_rank, mpi_dims, m_mpi_ranks,
+               m_pack_index_lists, m_pack_index_list_lengths,
+               m_unpack_index_lists, m_unpack_index_list_lengths,
+               m_halo_width, m_grid_dims,
+               s_num_neighbors, vid);
 }
 
 void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx)
@@ -92,15 +96,14 @@ void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx)
   }
 }
 
-void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
-  destroy_unpack_lists(m_unpack_index_lists, s_num_neighbors, vid);
+  destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid);
   m_unpack_index_list_lengths.clear();
   m_unpack_index_lists.clear();
-
-  destroy_pack_lists(m_pack_index_lists, s_num_neighbors, vid);
   m_pack_index_list_lengths.clear();
   m_pack_index_lists.clear();
+  m_mpi_ranks.clear();
 
   for (int v = 0; v < m_num_vars; ++v) {
     deallocData(m_vars[v], vid);
@@ -109,7 +112,7 @@ void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
 }
 
 
-static constexpr int neighbor_offsets[26][3]{
+const int HALOEXCHANGE_base::neighbor_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{
 
   // faces
   {-1,  0,  0},
@@ -148,13 +151,14 @@ void HALOEXCHANGE_base::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
 HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
     const HALOEXCHANGE_base::message_type msg_type,
     const int (&neighbor_offset)[3],
+    const bool (&crossing_periodic_boundary)[3],
     const Index_type halo_width, const Index_type* grid_dims)
 {
   if (msg_type != message_type::send &&
      msg_type != message_type::recv) {
     throw std::runtime_error("make_boundary_extent: Invalid message type");
   }
-  auto get_bounds = [&](int offset, Index_type dim_size) {
+  auto get_bounds = [&](int offset, Index_type dim_size, bool periodic) {
     std::pair bounds;
     switch (offset) {
       case -1:
@@ -162,8 +166,13 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
           bounds.first  = halo_width;
           bounds.second = halo_width + halo_width;
         } else { // (msg_type == message_type::recv)
-          bounds.first  = 0;
-          bounds.second = halo_width;
+          if (periodic) {
+            bounds.first  = halo_width + dim_size;
+            bounds.second = halo_width + dim_size + halo_width;
+          } else {
+            bounds.first  = 0;
+            bounds.second = halo_width;
+          }
         }
         break;
       case 0:
@@ -175,8 +184,13 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
           bounds.first  = halo_width + dim_size - halo_width;
           bounds.second = halo_width + dim_size;
         } else { // (msg_type == message_type::recv)
-          bounds.first  = halo_width + dim_size;
-          bounds.second = halo_width + dim_size + halo_width;
+          if (periodic) {
+            bounds.first  = 0;
+            bounds.second = halo_width;
+          } else {
+            bounds.first  = halo_width + dim_size;
+            bounds.second = halo_width + dim_size + halo_width;
+          }
         }
         break;
       default:
@@ -184,9 +198,9 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
     }
     return bounds;
   };
-  auto x_bounds = get_bounds(neighbor_offset[0], grid_dims[0]);
-  auto y_bounds = get_bounds(neighbor_offset[1], grid_dims[1]);
-  auto z_bounds = get_bounds(neighbor_offset[2], grid_dims[2]);
+  auto x_bounds = get_bounds(neighbor_offset[0], grid_dims[0], crossing_periodic_boundary[0]);
+  auto y_bounds = get_bounds(neighbor_offset[1], grid_dims[1], crossing_periodic_boundary[1]);
+  auto z_bounds = get_bounds(neighbor_offset[2], grid_dims[2], crossing_periodic_boundary[2]);
 
   return {x_bounds.first, x_bounds.second,
           y_bounds.first, y_bounds.second,
          z_bounds.first, z_bounds.second};
@@ -194,171 +208,137 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
 
 
 //
-// Function to generate index lists for packing.
+// Function to generate mpi decomposition and index lists for packing and unpacking.
 //
-void HALOEXCHANGE_base::create_pack_lists(
+
+void HALOEXCHANGE_base::create_lists(
+    int my_mpi_rank,
+    const int (&mpi_dims)[3],
+    std::vector& mpi_ranks,
     std::vector& pack_index_lists,
     std::vector& pack_index_list_lengths,
+    std::vector& unpack_index_lists,
+    std::vector& unpack_index_list_lengths,
     const Index_type halo_width, const Index_type* grid_dims,
     const Index_type num_neighbors,
     VariantID vid)
 {
+  int my_mpi_idx[3]{-1,-1,-1};
+  my_mpi_idx[2] = my_mpi_rank / (mpi_dims[0]*mpi_dims[1]);
+  my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0];
+  my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0];
+
   const Index_type grid_i_stride = 1;
   const Index_type grid_j_stride = grid_dims[0] + 2*halo_width;
   const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width);
 
   for (Index_type l = 0; l < num_neighbors; ++l) {
 
-    Extent extent = make_boundary_extent(message_type::send, neighbor_offsets[l],
-                                         halo_width, grid_dims);
-
-    pack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
-                                 (extent.j_max - extent.j_min) *
-                                 (extent.k_max - extent.k_min) ;
-
-    allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid);
-    auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid);
-
-    Int_ptr pack_list = pack_index_lists[l];
-
-    Index_type list_idx = 0;
-    for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) {
-      for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) {
-        for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
-
-          Index_type pack_idx = ii * grid_i_stride +
-                                jj * grid_j_stride +
-                                kk * grid_k_stride ;
+    const int (&mpi_offset)[3] = neighbor_offsets[l];
 
-          pack_list[list_idx] = pack_idx;
+    int neighbor_mpi_idx[3] = {my_mpi_idx[0]+mpi_offset[0],
+                               my_mpi_idx[1]+mpi_offset[1],
+                               my_mpi_idx[2]+mpi_offset[2]};
+    bool crossing_periodic_boundary[3] = {false, false, false};
 
-          list_idx += 1;
-        }
+    // fix neighbor indices on periodic boundaries
+    // this assumes that the offsets are at most 1 and at least -1
+    for (int dim = 0; dim < 3; ++dim) {
+      if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) {
+        neighbor_mpi_idx[dim] = 0;
+        crossing_periodic_boundary[dim] = true;
+      } else if (neighbor_mpi_idx[dim] < 0) {
+        neighbor_mpi_idx[dim] = mpi_dims[dim]-1;
+        crossing_periodic_boundary[dim] = true;
       }
     }
-  }
-}
 
-//
-// Function to destroy packing index lists.
-//
-void HALOEXCHANGE_base::destroy_pack_lists(
-    std::vector& pack_index_lists,
-    const Index_type num_neighbors,
-    VariantID vid)
-{
-  for (Index_type l = 0; l < num_neighbors; ++l) {
-    deallocData(pack_index_lists[l], vid);
-  }
-}
-
-//
-// Function to generate index lists for unpacking.
-//
-void HALOEXCHANGE_base::create_unpack_lists(
-    std::vector& unpack_index_lists,
-    std::vector& unpack_index_list_lengths,
-    const Index_type halo_width, const Index_type* grid_dims,
-    const Index_type num_neighbors,
-    VariantID vid)
-{
-  const Index_type grid_i_stride = 1;
-  const Index_type grid_j_stride = grid_dims[0] + 2*halo_width;
-  const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width);
-
-  for (Index_type l = 0; l < num_neighbors; ++l) {
+    mpi_ranks[l] = neighbor_mpi_idx[0] + mpi_dims[0]*(neighbor_mpi_idx[1] + mpi_dims[1]*neighbor_mpi_idx[2]);
 
-    Extent extent = make_boundary_extent(message_type::recv, neighbor_offsets[l],
-                                         halo_width, grid_dims);
+    {
+      // pack and send
+      Extent extent = make_boundary_extent(message_type::send,
+                                           neighbor_offsets[l],
+                                           crossing_periodic_boundary,
+                                           halo_width, grid_dims);
 
-    unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
+      pack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
                                    (extent.j_max - extent.j_min) *
                                    (extent.k_max - extent.k_min) ;
 
-    allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid);
-    auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid);
+      allocAndInitData(pack_index_lists[l], pack_index_list_lengths[l], vid);
+      auto reset_list = scopedMoveData(pack_index_lists[l], pack_index_list_lengths[l], vid);
 
-    Int_ptr unpack_list = unpack_index_lists[l];
+      Int_ptr pack_list = pack_index_lists[l];
 
-    Index_type list_idx = 0;
-    for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) {
-      for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) {
-        for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
+      Index_type list_idx = 0;
+      for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) {
+        for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) {
+          for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
 
-          Index_type unpack_idx = ii * grid_i_stride +
+            Index_type pack_idx = ii * grid_i_stride +
                                   jj * grid_j_stride +
                                   kk * grid_k_stride ;
 
-          unpack_list[list_idx] = unpack_idx;
+            pack_list[list_idx] = pack_idx;
 
-          list_idx += 1;
+            list_idx += 1;
+          }
         }
       }
    }
-  }
-}
 
-//
-// Function to destroy unpacking index lists.
-//
-void HALOEXCHANGE_base::destroy_unpack_lists(
-    std::vector& unpack_index_lists,
-    const Index_type num_neighbors,
-    VariantID vid)
-{
-  for (Index_type l = 0; l < num_neighbors; ++l) {
-    deallocData(unpack_index_lists[l], vid);
-  }
-}
+    {
+      // receive and unpack
+      Extent extent = make_boundary_extent(message_type::recv,
+                                           neighbor_offsets[l],
+                                           crossing_periodic_boundary,
+                                           halo_width, grid_dims);
 
+      unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
+                                     (extent.j_max - extent.j_min) *
+                                     (extent.k_max - extent.k_min) ;
 
-#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+      allocAndInitData(unpack_index_lists[l], unpack_index_list_lengths[l], vid);
+      auto reset_list = scopedMoveData(unpack_index_lists[l], unpack_index_list_lengths[l], vid);
 
-void HALOEXCHANGE_base::create_rank_list(
-    int my_mpi_rank,
-    const int (&mpi_dims)[3],
-    std::vector& mpi_ranks,
-    const Index_type num_neighbors,
-    VariantID RAJAPERF_UNUSED_ARG(vid))
-{
-  int my_mpi_idx[3]{-1,-1,-1};
-  my_mpi_idx[2] = my_mpi_rank / (mpi_dims[0]*mpi_dims[1]);
-  my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0];
-  my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0];
+      Int_ptr unpack_list = unpack_index_lists[l];
 
-  for (Index_type l = 0; l < num_neighbors; ++l) {
+      Index_type list_idx = 0;
+      for (Index_type kk = extent.k_min; kk < extent.k_max; ++kk) {
+        for (Index_type jj = extent.j_min; jj < extent.j_max; ++jj) {
+          for (Index_type ii = extent.i_min; ii < extent.i_max; ++ii) {
 
-    const int (&mpi_offset)[3] = neighbor_offsets[l];
+            Index_type unpack_idx = ii * grid_i_stride +
+                                    jj * grid_j_stride +
+                                    kk * grid_k_stride ;
 
-    int neighbor_mpi_idx[3] = {my_mpi_idx[0]+mpi_offset[0],
-                               my_mpi_idx[1]+mpi_offset[1],
-                               my_mpi_idx[2]+mpi_offset[2]};
+            unpack_list[list_idx] = unpack_idx;
 
-    // fix neighbor indices on periodic boundaries
-    // this assumes that the offsets are at most 1 and at least -1
-    for (int dim = 0; dim < 3; ++dim) {
-      if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) {
-        neighbor_mpi_idx[dim] = 0;
-      } else if (neighbor_mpi_idx[dim] < 0) {
-        neighbor_mpi_idx[dim] = mpi_dims[dim]-1;
+            list_idx += 1;
+          }
+        }
       }
     }
-
-    mpi_ranks[l] = neighbor_mpi_idx[0] + mpi_dims[0]*(neighbor_mpi_idx[1] + mpi_dims[1]*neighbor_mpi_idx[2]);
   }
 }
 
 //
-// Function to destroy the rank list.
+// Function to destroy packing and unpacking index lists.
 //
-void HALOEXCHANGE_base::destroy_rank_list(
-    const Index_type RAJAPERF_UNUSED_ARG(num_neighbors),
-    VariantID RAJAPERF_UNUSED_ARG(vid))
+void HALOEXCHANGE_base::destroy_lists(
+    std::vector& pack_index_lists,
+    std::vector& unpack_index_lists,
+    const Index_type num_neighbors,
+    VariantID vid)
 {
-
+  for (Index_type l = 0; l < num_neighbors; ++l) {
+    deallocData(pack_index_lists[l], vid);
+  }
+  for (Index_type l = 0; l < num_neighbors; ++l) {
+    deallocData(unpack_index_lists[l], vid);
+  }
 }
 
-#endif
-
 } // end namespace apps
 } // end namespace rajaperf
diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp
index 028a62bf3..1cb86fc1a 100644
--- a/src/apps/HALOEXCHANGE_base.hpp
+++ b/src/apps/HALOEXCHANGE_base.hpp
@@ -83,9 +83,10 @@ class HALOEXCHANGE_base : public KernelBase
 
   ~HALOEXCHANGE_base();
 
-  void setUp(VariantID vid, size_t tune_idx);
+  void setUp_base(const int my_mpi_rank, const int (&mpi_dims)[3],
+                  VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
-  void tearDown(VariantID vid, size_t tune_idx);
+  void tearDown_base(VariantID vid, size_t tune_idx);
 
 protected:
   enum struct message_type : int
@@ -105,6 +106,7 @@ class HALOEXCHANGE_base : public KernelBase
   };
 
   static const int s_num_neighbors = 26;
+  static const int neighbor_offsets[s_num_neighbors][3];
 
   Index_type m_grid_dims[3];
   Index_type m_halo_width;
@@ -120,6 +122,8 @@ class HALOEXCHANGE_base : public KernelBase
 
   std::vector m_vars;
 
+  std::vector m_mpi_ranks;
+
   std::vector m_pack_index_lists;
   std::vector m_pack_index_list_lengths;
   std::vector m_unpack_index_lists;
@@ -127,45 +131,26 @@ class HALOEXCHANGE_base : public KernelBase
 
   Extent make_boundary_extent(
       const message_type msg_type,
-      const int (&neighbor_offset)[3],
+      const int (&neighbor_offset)[3], const bool (&crossing_periodic_boundary)[3],
       const Index_type halo_width, const Index_type* grid_dims);
 
-  void create_pack_lists(
+  void create_lists(
+      int my_mpi_rank,
+      const int (&mpi_dims)[3],
+      std::vector& mpi_ranks,
       std::vector& pack_index_lists,
       std::vector& pack_index_list_lengths,
-      const Index_type halo_width, const Index_type* grid_dims,
-      const Index_type num_neighbors,
-      VariantID vid);
-
-  void destroy_pack_lists(
-      std::vector& pack_index_lists,
-      const Index_type num_neighbors,
-      VariantID vid);
-
-  void create_unpack_lists(
       std::vector& unpack_index_lists,
       std::vector& unpack_index_list_lengths,
       const Index_type halo_width, const Index_type* grid_dims,
       const Index_type num_neighbors,
       VariantID vid);
 
-  void destroy_unpack_lists(
+  void destroy_lists(
+      std::vector& pack_index_lists,
       std::vector& unpack_index_lists,
       const Index_type num_neighbors,
       VariantID vid);
-
-#if defined(RAJA_PERFSUITE_ENABLE_MPI)
-  void create_rank_list(
-      int my_mpi_rank,
-      const int (&mpi_dims)[3],
-      std::vector& mpi_ranks,
-      const Index_type num_neighbors,
-      VariantID vid);
-
-  void destroy_rank_list(
-      const Index_type num_neighbors,
-      VariantID vid);
-#endif
 };
 
 } // end namespace apps
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index cf6d5b27a..e4a822796 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -22,25 +22,35 @@ namespace apps
 MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params)
   : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE, params)
 {
+  MPI_Comm_size(MPI_COMM_WORLD, &m_mpi_size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank);
+
+  const int mpi_dim = std::cbrt(m_mpi_size);
+  m_mpi_dims[0] = mpi_dim;
+  m_mpi_dims[1] = mpi_dim;
+  m_mpi_dims[2] = mpi_dim;
+
   setUsesFeature(Forall);
   setUsesFeature(MPI);
 
-  setVariantDefined( Base_Seq );
-  setVariantDefined( Lambda_Seq );
-  setVariantDefined( RAJA_Seq );
+  if (m_mpi_dims[0] * m_mpi_dims[1] * m_mpi_dims[2] == m_mpi_size) {
+    setVariantDefined( Base_Seq );
+    setVariantDefined( Lambda_Seq );
+    setVariantDefined( RAJA_Seq );
 
-  setVariantDefined( Base_OpenMP );
-  setVariantDefined( Lambda_OpenMP );
-  setVariantDefined( RAJA_OpenMP );
+    setVariantDefined( Base_OpenMP );
+    setVariantDefined( Lambda_OpenMP );
+    setVariantDefined( RAJA_OpenMP );
 
-  setVariantDefined( Base_OpenMPTarget );
-  setVariantDefined( RAJA_OpenMPTarget );
+    setVariantDefined( Base_OpenMPTarget );
+    setVariantDefined( RAJA_OpenMPTarget );
 
-  setVariantDefined( Base_CUDA );
-  setVariantDefined( RAJA_CUDA );
+    setVariantDefined( Base_CUDA );
+    setVariantDefined( RAJA_CUDA );
 
-  setVariantDefined( Base_HIP );
-  setVariantDefined( RAJA_HIP );
+    setVariantDefined( Base_HIP );
+    setVariantDefined( RAJA_HIP );
+  }
 }
 
 MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE()
@@ -49,19 +59,7 @@ MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE()
 
 void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
 {
-  HALOEXCHANGE_base::setUp(vid, tune_idx);
-
-  MPI_Comm_size(MPI_COMM_WORLD, &m_mpi_size);
-  MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank);
-
-  const int mpi_dim = std::cbrt(m_mpi_size);
-  const int mpi_dims[3] = {mpi_dim, mpi_dim, mpi_dim};
-  if (mpi_dims[0] * mpi_dims[1] * mpi_dims[2] != m_mpi_size) {
-    throw std::runtime_error("mpi dims do not match mpi size");
-  }
-
-  m_mpi_ranks.resize(s_num_neighbors, -1);
-  HALOEXCHANGE_base::create_rank_list(m_my_mpi_rank, mpi_dims, m_mpi_ranks, s_num_neighbors, vid);
+  setUp_base(m_my_mpi_rank, m_mpi_dims, vid, tune_idx);
 
   const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy);
 
@@ -90,7 +88,6 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
       m_recv_buffers[l] = m_unpack_buffers[l];
     }
   }
-
 }
 
 void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx)
@@ -119,10 +116,7 @@ void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx)
   m_send_buffers.clear();
   m_pack_buffers.clear();
 
-  HALOEXCHANGE_base::destroy_rank_list(s_num_neighbors, vid);
-  m_mpi_ranks.clear();
-
-  HALOEXCHANGE_base::tearDown(vid, tune_idx);
+  tearDown_base(vid, tune_idx);
 }
 
 } // end namespace apps
diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp
index 5a91ab243..00383ab89 100644
--- a/src/apps/MPI_HALOEXCHANGE.hpp
+++ b/src/apps/MPI_HALOEXCHANGE.hpp
@@ -107,7 +107,7 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base
 
   int m_mpi_size = -1;
   int m_my_mpi_rank = -1;
-  std::vector m_mpi_ranks;
+  int m_mpi_dims[3] = {-1, -1, -1};
 
   std::vector m_pack_buffers;
   std::vector m_unpack_buffers;

From 6808bc8da3240d1008d6b24e4eb530274ff6245c Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 2 Jun 2023 14:04:46 -0700
Subject: [PATCH 018/454] Add mpi 3d division argument

---
 src/apps/HALOEXCHANGE_base.cpp | 4 +--
 src/apps/HALOEXCHANGE_base.hpp | 4 +--
 src/apps/MPI_HALOEXCHANGE.cpp | 16 ++++------
 src/apps/MPI_HALOEXCHANGE.hpp | 3 +-
 src/common/RunParams.cpp | 58 ++++++++++++++++++++++++++++++++++
 src/common/RunParams.hpp | 9 ++++++
 6 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp
index cdd0b5b7a..835f4b158 100644
--- a/src/apps/HALOEXCHANGE_base.cpp
+++ b/src/apps/HALOEXCHANGE_base.cpp
@@ -62,7 +62,7 @@ HALOEXCHANGE_base::~HALOEXCHANGE_base()
 {
 }
 
-void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int (&mpi_dims)[3],
+void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims,
                                    VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   m_vars.resize(m_num_vars, nullptr);
@@ -213,7 +213,7 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent(
 
 void HALOEXCHANGE_base::create_lists(
     int my_mpi_rank,
-    const int (&mpi_dims)[3],
+    const int* mpi_dims,
     std::vector& mpi_ranks,
     std::vector& pack_index_lists,
     std::vector& pack_index_list_lengths,
diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp
index 1cb86fc1a..7851a697b 100644
--- a/src/apps/HALOEXCHANGE_base.hpp
+++ b/src/apps/HALOEXCHANGE_base.hpp
@@ -83,7 +83,7 @@ class HALOEXCHANGE_base : public KernelBase
 
   ~HALOEXCHANGE_base();
 
-  void setUp_base(const int my_mpi_rank, const int (&mpi_dims)[3],
+  void setUp_base(const int my_mpi_rank, const int* mpi_dims,
                  VariantID vid, size_t tune_idx);
   void updateChecksum(VariantID vid, size_t tune_idx);
   void tearDown_base(VariantID vid, size_t tune_idx);
@@ -136,7 +136,7 @@ class HALOEXCHANGE_base : public KernelBase
 
   void create_lists(
       int my_mpi_rank,
-      const int (&mpi_dims)[3],
+      const int* mpi_dims,
       std::vector& mpi_ranks,
       std::vector& pack_index_lists,
      std::vector& pack_index_list_lengths,
diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp
index e4a822796..88990fc36 100644
--- a/src/apps/MPI_HALOEXCHANGE.cpp
+++ b/src/apps/MPI_HALOEXCHANGE.cpp
@@ -12,7 +12,7 @@
 
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
 
-#include
+#include
 
 namespace rajaperf
 {
@@ -22,18 +22,14 @@ namespace apps
 MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params)
   : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE, params)
 {
-  MPI_Comm_size(MPI_COMM_WORLD, &m_mpi_size);
-  MPI_Comm_rank(MPI_COMM_WORLD, &m_my_mpi_rank);
-
-  const int mpi_dim = std::cbrt(m_mpi_size);
-  m_mpi_dims[0] = mpi_dim;
-  m_mpi_dims[1] = mpi_dim;
-  m_mpi_dims[2] = mpi_dim;
+  m_mpi_size = params.getMPISize();
+  m_my_mpi_rank = params.getMPIRank();
+  m_mpi_dims = params.getMPI3DDivision();
 
   setUsesFeature(Forall);
   setUsesFeature(MPI);
 
-  if (m_mpi_dims[0] * m_mpi_dims[1] * m_mpi_dims[2] == m_mpi_size) {
+  if (params.validMPI3DDivision()) {
     setVariantDefined( Base_Seq );
     setVariantDefined( Lambda_Seq );
     setVariantDefined( RAJA_Seq );
@@ -59,7 +55,7 @@ MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE()
 
 void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx)
 {
-  setUp_base(m_my_mpi_rank, m_mpi_dims, vid, tune_idx);
+  setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx);
 
   const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy);
 
diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/apps/MPI_HALOEXCHANGE.hpp
index 00383ab89..981c15a24 100644
--- a/src/apps/MPI_HALOEXCHANGE.hpp
+++ b/src/apps/MPI_HALOEXCHANGE.hpp
@@ -71,6 +71,7 @@
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
 
 #include
+#include
 
 namespace rajaperf
 {
@@ -107,7 +108,7 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base
 
   int m_mpi_size = -1;
   int m_my_mpi_rank = -1;
-  int m_mpi_dims[3] = {-1, -1, -1};
+  std::array m_mpi_dims = {-1, -1, -1};
 
   std::vector m_pack_buffers;
   std::vector m_unpack_buffers;
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp
index 838e24eec..8c40f0b6c 100644
--- a/src/common/RunParams.cpp
+++ b/src/common/RunParams.cpp
@@ -36,6 +36,9 @@ RunParams::RunParams(int argc, char** argv)
    size_factor(0.0),
    data_alignment(RAJA::DATA_ALIGN),
    gpu_block_sizes(),
+   mpi_size(1),
+   mpi_rank(0),
+   mpi_3d_division({-1, -1, -1}),
    pf_tol(0.1),
    checkrun_reps(1),
    reference_variant(),
@@ -105,6 +108,11 @@ void RunParams::print(std::ostream& str) const
   for (size_t j = 0; j < gpu_block_sizes.size(); ++j) {
     str << "\n\t" << gpu_block_sizes[j];
   }
+  str << "\n mpi_size = " << mpi_size;
+  str << "\n mpi_3d_division = ";
+  for (size_t j = 0; j < 3; ++j) {
+    str << "\n\t" << mpi_3d_division[j];
+  }
   str << "\n pf_tol = " << pf_tol;
   str << "\n checkrun_reps = " << checkrun_reps;
   str << "\n reference_variant = " << reference_variant;
@@ -190,6 +198,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
 {
   getCout() << "\n\nReading command line input..." << std::endl;
 
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+#endif
+
   for (int i = 1; i < argc; ++i) {
 
     std::string opt(argv[i]);
@@ -392,6 +405,37 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
         input_state = BadInput;
       }
 
+    } else if ( opt == std::string("--mpi_3d_division") ) {
+
+      int num_got = 0;
+      bool done = false;
+      i++;
+      while ( i < argc && !done ) {
+        opt = std::string(argv[i]);
+        if ( opt.at(0) == '-' ) {
+          i--;
+          done = true;
+        } else {
+          num_got += 1;
+          int number = ::atoi( opt.c_str() );
+          if ( number <= 0 ) {
+            getCout() << "\nBad input:"
+                      << " must give --mpi_3d_division POSITIVE values (int)"
+                      << std::endl;
+            input_state = BadInput;
+          } else if (num_got <= 3) {
+            mpi_3d_division[num_got-1] = number;
+          }
+          ++i;
+        }
+      }
+      if (num_got != 3) {
+        getCout() << "\nBad input:"
+                  << " must give --mpi_3d_division three values (int)"
+                  << std::endl;
+        input_state = BadInput;
+      }
+
     } else if ( opt == std::string("--pass-fail-tol") ||
                 opt == std::string("-pftol") ) {
 
@@ -691,6 +735,14 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
   if (npasses_combiner_input.empty()) {
     npasses_combiners.emplace_back(CombinerOpt::Average);
   }
+
+#if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  if (mpi_3d_division[0] == -1) {
+    mpi_3d_division[0] = std::ceil(std::cbrt(mpi_size));
+    mpi_3d_division[1] = mpi_3d_division[0];
+    mpi_3d_division[2] = mpi_3d_division[0];
+  }
+#endif
 }
 
 
@@ -758,6 +810,12 @@ void RunParams::printHelpMessage(std::ostream& str) const
   str << "\t\t Example...\n"
      << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n";
 
+  str << "\t --mpi_3d_division [no default]\n"
+     << "\t (number of mpi ranks in each dimension in a 3d grid)\n"
+     << "\t (3D MPI kernels will be skipped if the product of mpi_3d_division is not equal to the number of ranks)\n";
+  str << "\t\t Example...\n"
+     << "\t\t --mpi_3d_division 2 3 5 (runs 3d MPI kernels on a 2 by 3 by 5 grid)\n\n";
+
   str << "\t --pass-fail-tol, -pftol [default is 0.1; i.e., 10%]\n"
      << "\t (slowdown tolerance for RAJA vs. Base variants in FOM report)\n";
   str << "\t\t Example...\n"
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp
index 4a07f4ddd..e6e1cd81d 100644
--- a/src/common/RunParams.hpp
+++ b/src/common/RunParams.hpp
@@ -11,6 +11,7 @@
 
 #include
 #include
+#include
 #include
 
 #include "RAJAPerfSuite.hpp"
@@ -138,6 +139,11 @@ class RunParams {
     return false;
   }
 
+  int getMPISize() const { return mpi_size; }
+  int getMPIRank() const { return mpi_rank; }
+  bool validMPI3DDivision() const { return (mpi_3d_division[0]*mpi_3d_division[1]*mpi_3d_division[2] == mpi_size); }
+  std::array const& getMPI3DDivision() const { return mpi_3d_division; }
+
   DataSpace getSeqDataSpace() const { return seqDataSpace; }
   DataSpace getOmpDataSpace() const { return ompDataSpace; }
   DataSpace getOmpTargetDataSpace() const { return ompTargetDataSpace; }
@@ -253,6 +259,9 @@ class RunParams {
   double size_factor; /*!< default kernel size multipier (input option) */
   size_t data_alignment;
   std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */
+  int mpi_size; /*!< Number of MPI ranks */
+  int mpi_rank; /*!< Rank of this MPI process */
+  std::array mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */
 
   double pf_tol; /*!< pct RAJA variant run time can exceed base for
                      each PM case to pass/fail acceptance */

From 066a553b8844bb02a2f80f4819ac78f163d36474 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Mon, 5 Jun 2023 15:57:29 -0700
Subject: [PATCH 019/454] Fix periodicity and message tags

---
 src/apps/HALOEXCHANGE-Cuda.cpp | 8 +--
 src/apps/HALOEXCHANGE-Hip.cpp | 8 +--
 src/apps/HALOEXCHANGE-OMP.cpp | 12 ++--
 src/apps/HALOEXCHANGE-OMPTarget.cpp | 8 +--
 src/apps/HALOEXCHANGE-Seq.cpp | 12 ++--
 src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 8 +--
 src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 8 +--
 src/apps/HALOEXCHANGE_FUSED-OMP.cpp | 12 ++--
 src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 8 +--
 src/apps/HALOEXCHANGE_FUSED-Seq.cpp | 12 ++--
 src/apps/HALOEXCHANGE_base.cpp | 79 ++++++++++++-----------
 src/apps/HALOEXCHANGE_base.hpp | 11 +++-
 src/apps/MPI_HALOEXCHANGE-Cuda.cpp | 12 ++--
 src/apps/MPI_HALOEXCHANGE-Hip.cpp | 12 ++--
 src/apps/MPI_HALOEXCHANGE-OMP.cpp | 18 ++----
 src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp | 12 ++--
 src/apps/MPI_HALOEXCHANGE-Seq.cpp | 18 ++----
 17 files changed, 123 insertions(+), 135 deletions(-)

diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp
index 8177b997f..6993c44fb 100644
--- a/src/apps/HALOEXCHANGE-Cuda.cpp
+++ b/src/apps/HALOEXCHANGE-Cuda.cpp
@@ -59,7 +59,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -74,7 +74,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
       synchronize();
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -99,7 +99,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -116,7 +116,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
       synchronize();
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp
index 36fbe2821..c89d81b9d 100644
--- a/src/apps/HALOEXCHANGE-Hip.cpp
+++ b/src/apps/HALOEXCHANGE-Hip.cpp
@@ -59,7 +59,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -75,7 +75,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
       synchronize();
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -101,7 +101,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -118,7 +118,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
       synchronize();
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/apps/HALOEXCHANGE-OMP.cpp
index 050046479..a62bdbd0a 100644
--- a/src/apps/HALOEXCHANGE-OMP.cpp
+++ b/src/apps/HALOEXCHANGE-OMP.cpp
@@ -34,7 +34,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -48,7 +48,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -73,7 +73,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -90,7 +90,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -120,7 +120,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -136,7 +136,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/apps/HALOEXCHANGE-OMPTarget.cpp
index e4f0f561e..67e40edb7 100644
--- a/src/apps/HALOEXCHANGE-OMPTarget.cpp
+++ b/src/apps/HALOEXCHANGE-OMPTarget.cpp
@@ -39,7 +39,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -54,7 +54,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -79,7 +79,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -95,7 +95,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/apps/HALOEXCHANGE-Seq.cpp
index b6bd892ba..b40e09b3c 100644
--- a/src/apps/HALOEXCHANGE-Seq.cpp
+++ b/src/apps/HALOEXCHANGE-Seq.cpp
@@ -32,7 +32,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[send_tags[l]];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
@@ -45,7 +45,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       }
 
      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
+        Real_ptr buffer = buffers[recv_tags[l]];
         Int_ptr list = unpack_index_lists[l];
         Index_type len = unpack_index_list_lengths[l];
         for
(Index_type v = 0; v < num_vars; ++v) { @@ -70,7 +70,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -86,7 +86,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -115,7 +115,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -131,7 +131,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 986600282..4d223edaf 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -106,7 +106,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -132,7 +132,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -195,7 +195,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -214,7 +214,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) synchronize(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index b130cf4b5..5ac64db2d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -106,7 +106,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; 
l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -132,7 +132,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -199,7 +199,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -218,7 +218,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) synchronize(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 864393d0f..9a5da40fb 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -38,7 +38,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -80,7 +80,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -137,7 +137,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -175,7 +175,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -255,7 +255,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -273,7 +273,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ worksite 
site_pack = group_pack.run(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 285e283e2..d7f2ad9c9 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -80,7 +80,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -116,7 +116,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -190,7 +190,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -208,7 +208,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U worksite site_pack = group_pack.run(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 7114de6c8..7ef75b84c 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -36,7 +36,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -60,7 +60,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -100,7 +100,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -122,7 +122,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type unpack_index = 0; for 
(Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -186,7 +186,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[send_tags[l]]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -204,7 +204,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG worksite site_pack = group_pack.run(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; + Real_ptr buffer = buffers[recv_tags[l]]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/apps/HALOEXCHANGE_base.cpp index 835f4b158..9e9769f37 100644 --- a/src/apps/HALOEXCHANGE_base.cpp +++ b/src/apps/HALOEXCHANGE_base.cpp @@ -12,6 +12,7 @@ #include #include +#include namespace rajaperf { @@ -78,13 +79,15 @@ void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, } m_mpi_ranks.resize(s_num_neighbors, -1); + m_send_tags.resize(s_num_neighbors, -1); m_pack_index_lists.resize(s_num_neighbors, nullptr); m_pack_index_list_lengths.resize(s_num_neighbors, 0); + m_recv_tags.resize(s_num_neighbors, -1); m_unpack_index_lists.resize(s_num_neighbors, nullptr); m_unpack_index_list_lengths.resize(s_num_neighbors, 0); create_lists(my_mpi_rank, mpi_dims, m_mpi_ranks, - m_pack_index_lists, m_pack_index_list_lengths, - m_unpack_index_lists, m_unpack_index_list_lengths, + m_send_tags, m_pack_index_lists, m_pack_index_list_lengths, + m_recv_tags, m_unpack_index_lists, m_unpack_index_list_lengths, m_halo_width, m_grid_dims, s_num_neighbors, vid); } @@ -101,8 +104,10 @@ void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG( destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid); m_unpack_index_list_lengths.clear(); m_unpack_index_lists.clear(); + m_recv_tags.clear(); m_pack_index_list_lengths.clear(); m_pack_index_lists.clear(); + m_send_tags.clear(); m_mpi_ranks.clear(); for (int v = 0; v < m_num_vars; ++v) { @@ -112,7 +117,7 @@ void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } -const int HALOEXCHANGE_base::neighbor_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{ +const int HALOEXCHANGE_base::boundary_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{ // faces {-1, 0, 0}, @@ -150,15 +155,14 @@ const int HALOEXCHANGE_base::neighbor_offsets[HALOEXCHANGE_base::s_num_neighbors HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( const HALOEXCHANGE_base::message_type msg_type, - const int (&neighbor_offset)[3], - const bool (&crossing_periodic_boundary)[3], + const int (&boundary_offset)[3], const Index_type halo_width, const Index_type* grid_dims) { if (msg_type != message_type::send && msg_type != message_type::recv) { throw std::runtime_error("make_boundary_extent: Invalid message type"); } - auto get_bounds = [&](int offset, Index_type dim_size, bool periodic) { + auto get_bounds = [&](int offset, Index_type dim_size) { std::pair bounds; switch (offset) { case -1: @@ -166,13 +170,8 @@ 
HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( bounds.first = halo_width; bounds.second = halo_width + halo_width; } else { // (msg_type == message_type::recv) - if (periodic) { - bounds.first = halo_width + dim_size; - bounds.second = halo_width + dim_size + halo_width; - } else { - bounds.first = 0; - bounds.second = halo_width; - } + bounds.first = 0; + bounds.second = halo_width; } break; case 0: @@ -184,13 +183,8 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( bounds.first = halo_width + dim_size - halo_width; bounds.second = halo_width + dim_size; } else { // (msg_type == message_type::recv) - if (periodic) { - bounds.first = 0; - bounds.second = halo_width; - } else { - bounds.first = halo_width + dim_size; - bounds.second = halo_width + dim_size + halo_width; - } + bounds.first = halo_width + dim_size; + bounds.second = halo_width + dim_size + halo_width; } break; default: @@ -198,9 +192,9 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( } return bounds; }; - auto x_bounds = get_bounds(neighbor_offset[0], grid_dims[0], crossing_periodic_boundary[0]); - auto y_bounds = get_bounds(neighbor_offset[1], grid_dims[1], crossing_periodic_boundary[1]); - auto z_bounds = get_bounds(neighbor_offset[2], grid_dims[2], crossing_periodic_boundary[2]); + auto x_bounds = get_bounds(boundary_offset[0], grid_dims[0]); + auto y_bounds = get_bounds(boundary_offset[1], grid_dims[1]); + auto z_bounds = get_bounds(boundary_offset[2], grid_dims[2]); return {x_bounds.first, x_bounds.second, y_bounds.first, y_bounds.second, z_bounds.first, z_bounds.second}; @@ -210,13 +204,14 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( // // Function to generate mpi decomposition and index lists for packing and unpacking. 
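The tag scheme this patch introduces is worth seeing in isolation: each of the 26 boundaries of a rank's subdomain gets a unique index computed from its offset in {-1,0,1}^3, a send is tagged by the index of the local boundary, and the matching receive is tagged by the index of the negated offset, which is exactly the boundary the neighboring rank packed. Because the tag identifies the boundary rather than the neighbor rank, messages still pair up when periodic wrapping makes the same rank a neighbor across several boundaries. A minimal standalone sketch of that arithmetic (not part of the patch; boundary_index stands in for the patch's get_boundary_idx lambda):

    #include <cassert>

    // Map a boundary offset in {-1,0,1}^3 to a unique value in [0, 27).
    int boundary_index(const int (&off)[3])
    {
      return (off[0] + 1) + 3*(off[1] + 1) + 9*(off[2] + 1);
    }

    int main()
    {
      bool seen[27] = {};
      for (int k = -1; k <= 1; ++k) {
        for (int j = -1; j <= 1; ++j) {
          for (int i = -1; i <= 1; ++i) {
            if (i == 0 && j == 0 && k == 0) continue; // the subdomain itself
            int mine[3]   = {  i,  j,  k }; // my boundary facing this neighbor
            int theirs[3] = { -i, -j, -k }; // the neighbor's boundary facing me
            // My recv tag is boundary_index(theirs), which is the tag the
            // neighbor attaches to its send, so each message matches exactly
            // one receive even if the same rank appears for many boundaries.
            int send_tag = boundary_index(mine);
            int recv_tag = boundary_index(theirs);
            assert(send_tag != recv_tag);   // opposite boundaries never collide
            assert(!seen[send_tag]);        // all 26 send tags are distinct
            seen[send_tag] = true;
          }
        }
      }
      return 0;
    }

The patch additionally maps these indices through boundary_idx_to_tag so the actual tags are positions in the boundary_offsets table, but the pairing argument is the same.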
 //
-
 void HALOEXCHANGE_base::create_lists(
     int my_mpi_rank, const int* mpi_dims,
     std::vector<int>& mpi_ranks,
+    std::vector<int>& send_tags,
     std::vector<Int_ptr>& pack_index_lists,
     std::vector<Index_type>& pack_index_list_lengths,
+    std::vector<int>& recv_tags,
     std::vector<Int_ptr>& unpack_index_lists,
     std::vector<Index_type>& unpack_index_list_lengths,
     const Index_type halo_width, const Index_type* grid_dims,
@@ -228,28 +223,38 @@ void HALOEXCHANGE_base::create_lists(
   my_mpi_idx[1] = (my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1])) / mpi_dims[0];
   my_mpi_idx[0] = my_mpi_rank - my_mpi_idx[2]*(mpi_dims[0]*mpi_dims[1]) - my_mpi_idx[1]*mpi_dims[0];
+  auto get_boundary_idx = [&](const int (&boundary_offset)[3]) {
+    return (boundary_offset[0]+1) + 3*(boundary_offset[1]+1) + 9*(boundary_offset[2]+1);
+  };
+
+  std::map<int, int> boundary_idx_to_tag;
+  for (Index_type l = 0; l < num_neighbors; ++l) {
+    boundary_idx_to_tag[get_boundary_idx(boundary_offsets[l])] = l;
+  }
+
   const Index_type grid_i_stride = 1;
   const Index_type grid_j_stride = grid_dims[0] + 2*halo_width;
   const Index_type grid_k_stride = grid_j_stride * (grid_dims[1] + 2*halo_width);
   for (Index_type l = 0; l < num_neighbors; ++l) {
-    const int (&mpi_offset)[3] = neighbor_offsets[l];
+    const int (&boundary_offset)[3] = boundary_offsets[l];
+
+    int neighbor_boundary_offset[3]{-1, -1, -1};
+    for (int dim = 0; dim < 3; ++dim) {
+      neighbor_boundary_offset[dim] = -boundary_offset[dim];
+    }
-    int neighbor_mpi_idx[3] = {my_mpi_idx[0]+mpi_offset[0],
-                               my_mpi_idx[1]+mpi_offset[1],
-                               my_mpi_idx[2]+mpi_offset[2]};
-    bool crossing_periodic_boundary[3] = {false, false, false};
+    int neighbor_mpi_idx[3] = {my_mpi_idx[0]+boundary_offset[0],
+                               my_mpi_idx[1]+boundary_offset[1],
+                               my_mpi_idx[2]+boundary_offset[2]};
-    // fix neighbor indices on periodic boundaries
-    // this assumes that the offsets are at most 1 and at least -1
+    // fix neighbor mpi index on periodic boundaries
     for (int dim = 0; dim < 3; ++dim) {
       if (neighbor_mpi_idx[dim] >= mpi_dims[dim]) {
         neighbor_mpi_idx[dim] = 0;
-        crossing_periodic_boundary[dim] = true;
       } else if (neighbor_mpi_idx[dim] < 0) {
         neighbor_mpi_idx[dim] = mpi_dims[dim]-1;
-        crossing_periodic_boundary[dim] = true;
       }
     }
@@ -257,9 +262,9 @@ void HALOEXCHANGE_base::create_lists(
     { // pack and send
+      send_tags[l] = boundary_idx_to_tag[get_boundary_idx(boundary_offset)];
       Extent extent = make_boundary_extent(message_type::send,
-                                           neighbor_offsets[l],
-                                           crossing_periodic_boundary,
+                                           boundary_offset,
                                            halo_width, grid_dims);
       pack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
@@ -290,9 +295,9 @@ void HALOEXCHANGE_base::create_lists(
     { // receive and unpack
+      recv_tags[l] = boundary_idx_to_tag[get_boundary_idx(neighbor_boundary_offset)];
       Extent extent = make_boundary_extent(message_type::recv,
-                                           neighbor_offsets[l],
-                                           crossing_periodic_boundary,
+                                           boundary_offset,
                                            halo_width, grid_dims);
       unpack_index_list_lengths[l] = (extent.i_max - extent.i_min) *
diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/apps/HALOEXCHANGE_base.hpp
index 7851a697b..6915f16d2 100644
--- a/src/apps/HALOEXCHANGE_base.hpp
+++ b/src/apps/HALOEXCHANGE_base.hpp
@@ -50,8 +50,10 @@
 \
   Index_type num_neighbors = s_num_neighbors; \
   Index_type num_vars = m_num_vars; \
+  std::vector<int> send_tags = m_send_tags; \
   std::vector<Int_ptr> pack_index_lists = m_pack_index_lists; \
   std::vector<Index_type> pack_index_list_lengths = m_pack_index_list_lengths; \
+  std::vector<int> recv_tags = m_recv_tags; \
   std::vector<Int_ptr> unpack_index_lists = m_unpack_index_lists; \
   std::vector<Index_type> unpack_index_list_lengths = m_unpack_index_list_lengths;
@@ -106,7 +108,7 @@ class 
HALOEXCHANGE_base : public KernelBase }; static const int s_num_neighbors = 26; - static const int neighbor_offsets[s_num_neighbors][3]; + static const int boundary_offsets[s_num_neighbors][3]; Index_type m_grid_dims[3]; Index_type m_halo_width; @@ -124,22 +126,27 @@ class HALOEXCHANGE_base : public KernelBase std::vector m_mpi_ranks; + std::vector m_send_tags; std::vector m_pack_index_lists; std::vector m_pack_index_list_lengths; + + std::vector m_recv_tags; std::vector m_unpack_index_lists; std::vector m_unpack_index_list_lengths; Extent make_boundary_extent( const message_type msg_type, - const int (&neighbor_offset)[3], const bool (&crossing_periodic_boundary)[3], + const int (&boundary_offset)[3], const Index_type halo_width, const Index_type* grid_dims); void create_lists( int my_mpi_rank, const int* mpi_dims, std::vector& mpi_ranks, + std::vector& send_tags, std::vector& pack_index_lists, std::vector& pack_index_list_lengths, + std::vector& recv_tags, std::vector& unpack_index_lists, std::vector& unpack_index_list_lengths, const Index_type halo_width, const Index_type* grid_dims, diff --git a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp index 23a5ff045..8a97d7c5a 100644 --- a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp @@ -60,9 +60,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -85,9 +84,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } synchronize(); - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -128,9 +126,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -155,9 +152,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } synchronize(); - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { diff --git a/src/apps/MPI_HALOEXCHANGE-Hip.cpp b/src/apps/MPI_HALOEXCHANGE-Hip.cpp index e43a4c772..52c3c491e 100644 --- a/src/apps/MPI_HALOEXCHANGE-Hip.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Hip.cpp @@ -60,9 +60,8 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -86,9 +85,8 @@ 
void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) } synchronize(); - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -130,9 +128,8 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -157,9 +154,8 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) } synchronize(); - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { diff --git a/src/apps/MPI_HALOEXCHANGE-OMP.cpp b/src/apps/MPI_HALOEXCHANGE-OMP.cpp index 11a1b6ece..915c1c071 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMP.cpp +++ b/src/apps/MPI_HALOEXCHANGE-OMP.cpp @@ -37,9 +37,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -61,9 +60,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -104,9 +102,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -131,9 +128,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -179,9 +175,8 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -205,9 +200,8 @@ void 
MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { diff --git a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp index f30d0ae59..f64ca80d1 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp +++ b/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp @@ -40,9 +40,8 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -65,9 +64,8 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -108,9 +106,8 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -134,9 +131,8 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp index 3bd185d8b..13b61c02a 100644 --- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp @@ -35,9 +35,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -58,9 +57,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -101,9 +99,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; 
MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -127,9 +124,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { @@ -174,9 +170,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = unpack_index_list_lengths[l]; - int mpi_rank = mpi_ranks[l]; MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &unpack_mpi_requests[l]); + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); } for (Index_type l = 0; l < num_neighbors; ++l) { @@ -200,9 +195,8 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t len*num_vars); } - int mpi_rank = mpi_ranks[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, - mpi_rank, l, MPI_COMM_WORLD, &pack_mpi_requests[l]); + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } for (Index_type ll = 0; ll < num_neighbors; ++ll) { From bb079900796dc5a8446044af08bf86fb689c8fc1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 5 Jun 2023 16:21:48 -0700 Subject: [PATCH 020/454] Use streams in HALOEXCHANGE --- src/apps/HALOEXCHANGE-Cuda.cpp | 19 +++++++++++-------- src/apps/HALOEXCHANGE-Hip.cpp | 19 +++++++++++-------- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 18 ++++++++++-------- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 18 ++++++++++-------- src/apps/MPI_HALOEXCHANGE-Cuda.cpp | 19 +++++++++++-------- src/apps/MPI_HALOEXCHANGE-Hip.cpp | 19 +++++++++++-------- src/common/KernelBase.hpp | 14 ++++++++++++++ 7 files changed, 78 insertions(+), 48 deletions(-) diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 6993c44fb..22fa4e6f4 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -55,6 +55,8 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { + auto stream = camp::resources::Cuda::get_default().get_stream(); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,12 +68,12 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_pack<<>>(buffer, list, var, len); + haloexchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -81,12 +83,12 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_unpack<<>>(buffer, list, var, len); + haloexchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); } stopTimer(); @@ -94,6 +96,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } else if ( vid == RAJA_CUDA ) { using EXEC_POL = RAJA::cuda_exec; + auto res = 
camp::resources::Cuda::get_default(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -107,13 +110,13 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_PACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_pack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -124,13 +127,13 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_UNPACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_unpack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); } stopTimer(); diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index c89d81b9d..c7cfac47a 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -55,6 +55,8 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { + auto stream = camp::resources::Hip::get_default().get_stream(); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -66,13 +68,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, stream, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -82,13 +84,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, stream, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); } stopTimer(); @@ -96,6 +98,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) } else if ( vid == RAJA_HIP ) { using EXEC_POL = RAJA::hip_exec; + auto res = camp::resources::Hip::get_default(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -109,13 +112,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_PACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_pack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -126,13 +129,13 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_UNPACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_unpack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); } stopTimer(); diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 4d223edaf..5a67c2643 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -98,6 +98,7 @@ void 
HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + auto stream = camp::resources::Cuda::get_default().get_stream(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -123,10 +124,10 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( + haloexchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); - synchronize(); + synchronize(stream); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; @@ -149,10 +150,10 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( + haloexchange_fused_unpack<<>>( unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); cudaErrchk( cudaGetLastError() ); - synchronize(); + synchronize(stream); } stopTimer(); @@ -186,6 +187,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) RAJA::xargs<>, Allocator >; + auto res = camp::resources::Cuda::get_default(); workpool pool_pack (allocatorHolder.template getAllocator()); workpool pool_unpack(allocatorHolder.template getAllocator()); pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); @@ -210,8 +212,8 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - synchronize(); + worksite site_pack = group_pack.run(res); + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -229,8 +231,8 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - synchronize(); + worksite site_unpack = group_unpack.run(res); + res.wait(); } stopTimer(); diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 5ac64db2d..ef9dcb2e5 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -98,6 +98,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + auto stream = camp::resources::Hip::get_default().get_stream(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -123,10 +124,10 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, stream, pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); - synchronize(); + synchronize(stream); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; @@ -149,10 +150,10 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / 
unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, stream, unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); hipErrchk( hipGetLastError() ); - synchronize(); + synchronize(stream); } stopTimer(); @@ -190,6 +191,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) RAJA::xargs<>, Allocator >; + auto res = camp::resources::Hip::get_default(); workpool pool_pack (allocatorHolder.template getAllocator()); workpool pool_unpack(allocatorHolder.template getAllocator()); pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); @@ -214,8 +216,8 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } } workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - synchronize(); + worksite site_pack = group_pack.run(res); + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -233,8 +235,8 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } } workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - synchronize(); + worksite site_unpack = group_unpack.run(res); + res.wait(); } stopTimer(); diff --git a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp index 8a97d7c5a..ae2d001be 100644 --- a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Cuda.cpp @@ -55,6 +55,8 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { + auto stream = camp::resources::Cuda::get_default().get_stream(); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -72,7 +74,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_pack<<>>(buffer, list, var, len); + haloexchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -83,7 +85,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) len*num_vars); } - synchronize(); + synchronize(stream); MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } @@ -105,12 +107,12 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - haloexchange_unpack<<>>(buffer, list, var, len); + haloexchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); @@ -120,6 +122,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) } else if ( vid == RAJA_CUDA ) { using EXEC_POL = RAJA::cuda_exec; + auto res = camp::resources::Cuda::get_default(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -139,7 +142,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_PACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_pack_base_lam ); buffer += len; @@ -151,7 +154,7 @@ void 
MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) len*num_vars); } - synchronize(); + res.wait(); MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } @@ -174,13 +177,13 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_UNPACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_unpack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); diff --git a/src/apps/MPI_HALOEXCHANGE-Hip.cpp b/src/apps/MPI_HALOEXCHANGE-Hip.cpp index 52c3c491e..147050038 100644 --- a/src/apps/MPI_HALOEXCHANGE-Hip.cpp +++ b/src/apps/MPI_HALOEXCHANGE-Hip.cpp @@ -55,6 +55,8 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { + auto stream = camp::resources::Hip::get_default().get_stream(); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -72,7 +74,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, 0, stream, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -84,7 +86,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) len*num_vars); } - synchronize(); + synchronize(stream); MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } @@ -106,13 +108,13 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, 0, + hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, 0, stream, buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; } } - synchronize(); + synchronize(stream); MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); @@ -122,6 +124,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) } else if ( vid == RAJA_HIP ) { using EXEC_POL = RAJA::hip_exec; + auto res = camp::resources::Hip::get_default(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -141,7 +144,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_PACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_pack_base_lam ); buffer += len; @@ -153,7 +156,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) len*num_vars); } - synchronize(); + res.wait(); MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } @@ -176,13 +179,13 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { HALOEXCHANGE_UNPACK_BODY; }; - RAJA::forall( + RAJA::forall(res, RAJA::TypedRangeSegment(0, len), haloexchange_unpack_base_lam ); buffer += len; } } - synchronize(); + res.wait(); MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 
c4bd1ce61..c5ee334d4 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -205,6 +205,20 @@ class KernelBase #endif } +#if defined(RAJA_ENABLE_CUDA) + void synchronize(cudaStream_t stream) + { + cudaErrchk( cudaStreamSynchronize(stream) ); + } +#endif + +#if defined(RAJA_ENABLE_HIP) + void synchronize(hipStream_t stream) + { + hipErrchk( hipStreamSynchronize(stream) ); + } +#endif + int getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; From 40587c23c64119485c17e846cfbd499f461eea2c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 6 Jun 2023 08:46:55 -0700 Subject: [PATCH 021/454] Add MPI_HALOEXCHANGE_FUSED kernel --- src/CMakeLists.txt | 3 + src/apps/CMakeLists.txt | 6 + src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 308 ++++++++++++++ src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 312 ++++++++++++++ src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp | 398 ++++++++++++++++++ src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 297 +++++++++++++ src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp | 327 ++++++++++++++ src/apps/MPI_HALOEXCHANGE_FUSED.cpp | 117 +++++ src/apps/MPI_HALOEXCHANGE_FUSED.hpp | 162 +++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 1937 insertions(+) create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED.cpp create mode 100644 src/apps/MPI_HALOEXCHANGE_FUSED.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b5b8b3599..5dc5bebb6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -66,6 +66,9 @@ blt_add_executable( apps/MPI_HALOEXCHANGE.cpp apps/MPI_HALOEXCHANGE-Seq.cpp apps/MPI_HALOEXCHANGE-OMPTarget.cpp + apps/MPI_HALOEXCHANGE_FUSED.cpp + apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp + apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp apps/LTIMES.cpp apps/LTIMES-Seq.cpp apps/LTIMES-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index eef675ac1..cac0c1de0 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -58,6 +58,12 @@ blt_add_library( MPI_HALOEXCHANGE-Cuda.cpp MPI_HALOEXCHANGE-OMP.cpp MPI_HALOEXCHANGE-OMPTarget.cpp + MPI_HALOEXCHANGE_FUSED.cpp + MPI_HALOEXCHANGE_FUSED-Seq.cpp + MPI_HALOEXCHANGE_FUSED-Hip.cpp + MPI_HALOEXCHANGE_FUSED-Cuda.cpp + MPI_HALOEXCHANGE_FUSED-OMP.cpp + MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp new file mode 100644 index 000000000..79291250d --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -0,0 +1,308 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
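A note on the fused kernels defined in this new file, below: one y-block is launched per (buffer, variable) segment and the x-dimension of the grid is sized from the average segment length, with a grid-stride loop covering segments longer than average. A standalone sketch of that sizing arithmetic (not part of the patch; the lengths are hypothetical and a block_size of 256 is assumed):

    #include <cstdio>

    int main()
    {
      const int block_size = 256;            // assumed GPU tuning value
      const int lens[] = {1000, 40, 40, 7};  // hypothetical per-segment lengths
      const int num_lists = 4;               // one entry per (buffer, var) pair

      int len_sum = 0;
      for (int j = 0; j < num_lists; ++j) { len_sum += lens[j]; }

      // Same rounding as the kernels below: ceil(sum / count), then
      // ceil(average / block_size) blocks in x, one row of blocks per list.
      int len_ave = (len_sum + num_lists - 1) / num_lists;
      int grid_x  = (len_ave + block_size - 1) / block_size;

      // dim3 nblocks(grid_x, num_lists): blockIdx.y selects the list, and each
      // thread advances by block_size * gridDim.x, so lists longer than the
      // average are still fully covered.
      std::printf("grid = (%d, %d)\n", grid_x, num_lists);
      return 0;
    }

Sizing by the average rather than the maximum keeps the grid small when a few large face segments dominate the short edge and corner segments, at the cost of extra loop iterations in the long segments.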
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ + deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ + deallocData(DataSpace::CudaPinned, pack_len_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \ + deallocData(DataSpace::CudaPinned, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALOEXCHANGE_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALOEXCHANGE_UNPACK_BODY; + } +} + + +template < size_t block_size > +void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + auto stream = camp::resources::Cuda::get_default().get_stream(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type 
pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + haloexchange_fused_pack<<>>( + pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); + cudaErrchk( cudaGetLastError() ); + synchronize(stream); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + haloexchange_fused_unpack<<>>( + unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); + cudaErrchk( cudaGetLastError() ); + synchronize(stream); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + auto res = camp::resources::Cuda::get_default(); + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * 
num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + res.wait(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE_FUSED, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp new file mode 100644 index 000000000..147f5817c --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -0,0 +1,312 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ + Real_ptr* pack_buffer_ptrs; \ + Int_ptr* pack_list_ptrs; \ + Real_ptr* pack_var_ptrs; \ + Index_type* pack_len_ptrs; \ + allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, pack_len_ptrs, num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs; \ + Int_ptr* unpack_list_ptrs; \ + Real_ptr* unpack_var_ptrs; \ + Index_type* unpack_len_ptrs; \ + allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ + deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinned, pack_list_ptrs); \ + deallocData(DataSpace::HipPinned, pack_var_ptrs); \ + deallocData(DataSpace::HipPinned, pack_len_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinned, unpack_len_ptrs); + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALOEXCHANGE_PACK_BODY; + } +} + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +{ + Index_type j = blockIdx.y; + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = threadIdx.x + blockIdx.x * block_size; + i < len; + i += block_size * gridDim.x) { + HALOEXCHANGE_UNPACK_BODY; + } +} + + +template < size_t block_size > +void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_HIP ) { + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + auto stream = camp::resources::Hip::get_default().get_stream(); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + 
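+      // The loop below flattens every (neighbor, variable) pair into the
+      // pinned pointer/length tables so a single 2D kernel launch covers all
+      // pack fragments: blockIdx.y picks the fragment and threads grid-stride
+      // over x. The x extent comes from the average fragment length, rounded
+      // up; e.g. two fragments of lengths 96 and 32 with block_size 64 give
+      // pack_len_ave = (128 + 2 - 1) / 2 = 64 and a dim3(1, 2) block grid.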
Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_buffer_ptrs[pack_index] = buffer; + pack_list_ptrs[pack_index] = list; + pack_var_ptrs[pack_index] = var; + pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + dim3 pack_nthreads_per_block(block_size); + dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); + hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, 0, stream, + pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); + hipErrchk( hipGetLastError() ); + synchronize(stream); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_buffer_ptrs[unpack_index] = buffer; + unpack_list_ptrs[unpack_index] = list; + unpack_var_ptrs[unpack_index] = var; + unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + dim3 unpack_nthreads_per_block(block_size); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); + hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, 0, stream, + unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); + hipErrchk( hipGetLastError() ); + synchronize(stream); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, +#else + RAJA::ordered, +#endif + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + auto res = camp::resources::Hip::get_default(); + workpool pool_pack 
(allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + res.wait(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MPI_HALOEXCHANGE_FUSED, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp new file mode 100644 index 000000000..3951a5951 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -0,0 +1,398 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } +#if _OPENMP >= 200805 + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } +#if _OPENMP >= 200805 + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + 
HALOEXCHANGE_UNPACK_BODY; + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + } +#endif + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + + case Lambda_OpenMP : { + + MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } +#if _OPENMP >= 200805 + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < pack_index; j++) { + #pragma omp task firstprivate(j) + { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } +#endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } +#if _OPENMP >= 200805 + #pragma omp parallel + #pragma omp single nowait + for (Index_type j = 0; j < unpack_index; j++) { + #pragma omp task firstprivate(j) + { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + } +#else + #pragma omp parallel for + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + 
unpack_lambda(i);
+        }
+      }
+#endif
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+    MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN;
+
+    break;
+  }
+
+  case RAJA_OpenMP : {
+
+    using AllocatorHolder = RAJAPoolAllocatorHolder<
+      RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
+    using Allocator = AllocatorHolder::Allocator;
+
+    AllocatorHolder allocatorHolder;
+
+    using workgroup_policy = RAJA::WorkGroupPolicy <
+                                 RAJA::omp_work,
+                                 RAJA::ordered,
+                                 RAJA::constant_stride_array_of_objects >;
+
+    using workpool = RAJA::WorkPool< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    using workgroup = RAJA::WorkGroup< workgroup_policy,
+                                       Index_type,
+                                       RAJA::xargs<>,
+                                       Allocator >;
+
+    using worksite = RAJA::WorkSite< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    workpool pool_pack  (allocatorHolder.template getAllocator<char>());
+    workpool pool_unpack(allocatorHolder.template getAllocator<char>());
+    pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+    pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = unpack_index_list_lengths[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
+        Int_ptr list = pack_index_lists[l];
+        Index_type len = pack_index_list_lengths[l];
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          auto haloexchange_fused_pack_base_lam = [=](Index_type i) {
+                HALOEXCHANGE_PACK_BODY;
+              };
+          pool_pack.enqueue(
+              RAJA::TypedRangeSegment<Index_type>(0, len),
+              haloexchange_fused_pack_base_lam );
+          buffer += len;
+        }
+      }
+      workgroup group_pack = pool_pack.instantiate();
+      worksite site_pack = group_pack.run();
+      if (separate_buffers) {
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+      }
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = pack_index_list_lengths[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+      }
+
+      MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = unpack_buffers[l];
+        Int_ptr list = unpack_index_lists[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          auto haloexchange_fused_unpack_base_lam = [=](Index_type i) {
+                HALOEXCHANGE_UNPACK_BODY;
+              };
+          pool_unpack.enqueue(
+              RAJA::TypedRangeSegment<Index_type>(0, len),
+              haloexchange_fused_unpack_base_lam );
+          buffer += len;
+        }
+      }
+      workgroup group_unpack = pool_unpack.instantiate();
+      worksite site_unpack = group_unpack.run();
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+    break;
+  }
+
+  default : {
+    getCout() << "\n  MPI_HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl;
+  }
+
+  }
+
+#else
+  RAJA_UNUSED_VAR(vid);
+#endif
+}
+
+} // end namespace apps
+} // end namespace rajaperf
diff --git
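Every RAJA_* variant of this kernel follows the same RAJA::WorkGroup lifecycle: enqueue one (range, lambda) pair per pack or unpack fragment into a WorkPool, instantiate a WorkGroup, then run the whole batch as one fused operation. A minimal self-contained sketch of that lifecycle, using the sequential loop_work policy and a plain std::allocator rather than the suite's pooled allocator; the function and array names are illustrative only:

#include "RAJA/RAJA.hpp"
#include <memory>

using policy = RAJA::WorkGroupPolicy< RAJA::loop_work,
                                      RAJA::ordered,
                                      RAJA::constant_stride_array_of_objects >;
using pool_t  = RAJA::WorkPool < policy, int, RAJA::xargs<>, std::allocator<char> >;
using group_t = RAJA::WorkGroup< policy, int, RAJA::xargs<>, std::allocator<char> >;
using site_t  = RAJA::WorkSite < policy, int, RAJA::xargs<>, std::allocator<char> >;

void fused_example(double* a, double* b, int len)
{
  pool_t pool(std::allocator<char>{});
  // One enqueue per fragment, analogous to one (neighbor, variable) pair.
  pool.enqueue(RAJA::TypedRangeSegment<int>(0, len), [=](int i) { a[i] += 1.0; });
  pool.enqueue(RAJA::TypedRangeSegment<int>(0, len), [=](int i) { b[i] += 2.0; });
  group_t group = pool.instantiate();  // freeze the batch; the pool is reusable
  site_t  site  = group.run();         // execute all fragments in one call
}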
a/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp new file mode 100644 index 000000000..11a5356e8 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -0,0 +1,297 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution (unused) + // +//const size_t threads_per_team = 256; + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ + void** pack_ptrs; \ + allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* pack_list_ptrs = reinterpret_cast(pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* pack_var_ptrs = reinterpret_cast(pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* pack_len_ptrs = reinterpret_cast(pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_pack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_pack_buffer_ptrs = reinterpret_cast(h_pack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_pack_list_ptrs = reinterpret_cast(h_pack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_pack_var_ptrs = reinterpret_cast(h_pack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_pack_len_ptrs = reinterpret_cast(h_pack_ptrs) + 3 * num_neighbors * num_vars; \ + void** unpack_ptrs; \ + allocData(DataSpace::OmpTarget, unpack_ptrs, 4 * num_neighbors * num_vars); \ + Real_ptr* unpack_buffer_ptrs = reinterpret_cast(unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* unpack_list_ptrs = reinterpret_cast(unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* unpack_var_ptrs = reinterpret_cast(unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* unpack_len_ptrs = reinterpret_cast(unpack_ptrs) + 3 * num_neighbors * num_vars; \ + void** h_unpack_ptrs = new void*[4 * num_neighbors * num_vars]; \ + Real_ptr* h_unpack_buffer_ptrs = reinterpret_cast(h_unpack_ptrs) + 0 * num_neighbors * num_vars; \ + Int_ptr* h_unpack_list_ptrs = reinterpret_cast(h_unpack_ptrs) + 1 * num_neighbors * num_vars; \ + Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ + Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ + initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ + initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); + +#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ + deallocData(DataSpace::OmpTarget, pack_ptrs); \ + delete[] h_pack_ptrs; \ + deallocData(DataSpace::OmpTarget, unpack_ptrs); \ + delete[] h_unpack_ptrs; + + +void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + 
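+  // In the Base_OpenMPTarget variant below, the fused (buffer, list, var,
+  // len) tables are staged in the host mirrors (h_pack_* / h_unpack_*),
+  // copied to the device in one initOpenMPDeviceData transfer, and handed
+  // to the target region via is_device_ptr so the collapsed loop nest can
+  // index them directly.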
MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + Index_type pack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_pack_buffer_ptrs[pack_index] = buffer; + h_pack_list_ptrs[pack_index] = list; + h_pack_var_ptrs[pack_index] = var; + h_pack_len_ptrs[pack_index] = len; + pack_len_sum += len; + pack_index += 1; + buffer += len; + } + } + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; + #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < pack_index; j++) { + for (Index_type ii = 0; ii < pack_len_ave; ii++) { + + Real_ptr buffer = pack_buffer_ptrs[j]; + Int_ptr list = pack_list_ptrs[j]; + Real_ptr var = pack_var_ptrs[j]; + Index_type len = pack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += pack_len_ave) { + HALOEXCHANGE_PACK_BODY; + } + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + Index_type unpack_len_sum = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + h_unpack_buffer_ptrs[unpack_index] = buffer; + h_unpack_list_ptrs[unpack_index] = list; + h_unpack_var_ptrs[unpack_index] = var; + h_unpack_len_ptrs[unpack_index] = len; + unpack_len_sum += len; + unpack_index += 1; + buffer += len; + } + } + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) + #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) + for (Index_type j = 0; j < unpack_index; j++) { + for (Index_type ii = 0; ii < unpack_len_ave; ii++) { + + Real_ptr buffer = unpack_buffer_ptrs[j]; + Int_ptr list = unpack_list_ptrs[j]; + Real_ptr var = unpack_var_ptrs[j]; + Index_type len = unpack_len_ptrs[j]; + + for (Index_type i = ii; i < len; i += unpack_len_ave) { + 
HALOEXCHANGE_UNPACK_BODY;
+          }
+        }
+      }
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+    MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET;
+
+  } else if ( vid == RAJA_OpenMPTarget ) {
+
+    using AllocatorHolder = RAJAPoolAllocatorHolder<
+      RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
+    using Allocator = AllocatorHolder::Allocator;
+
+    AllocatorHolder allocatorHolder;
+
+    using workgroup_policy = RAJA::WorkGroupPolicy <
+                                 RAJA::omp_target_work /*<threads_per_team>*/,
+                                 RAJA::ordered,
+                                 RAJA::constant_stride_array_of_objects >;
+
+    using workpool = RAJA::WorkPool< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    using workgroup = RAJA::WorkGroup< workgroup_policy,
+                                       Index_type,
+                                       RAJA::xargs<>,
+                                       Allocator >;
+
+    using worksite = RAJA::WorkSite< workgroup_policy,
+                                     Index_type,
+                                     RAJA::xargs<>,
+                                     Allocator >;
+
+    workpool pool_pack  (allocatorHolder.template getAllocator<char>());
+    workpool pool_unpack(allocatorHolder.template getAllocator<char>());
+    pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+    pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = unpack_index_list_lengths[l];
+        MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
+      }
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = pack_buffers[l];
+        Int_ptr list = pack_index_lists[l];
+        Index_type len = pack_index_list_lengths[l];
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          auto haloexchange_fused_pack_base_lam = [=](Index_type i) {
+                HALOEXCHANGE_PACK_BODY;
+              };
+          pool_pack.enqueue(
+              RAJA::TypedRangeSegment<Index_type>(0, len),
+              haloexchange_fused_pack_base_lam );
+          buffer += len;
+        }
+      }
+      workgroup group_pack = pool_pack.instantiate();
+      worksite site_pack = group_pack.run();
+      if (separate_buffers) {
+        for (Index_type l = 0; l < num_neighbors; ++l) {
+          Index_type len = pack_index_list_lengths[l];
+          copyData(DataSpace::Host, send_buffers[l],
+                   dataSpace, pack_buffers[l],
+                   len*num_vars);
+        }
+      }
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Index_type len = pack_index_list_lengths[l];
+        MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
+            mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
+      }
+
+      MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+      for (Index_type l = 0; l < num_neighbors; ++l) {
+        Real_ptr buffer = unpack_buffers[l];
+        Int_ptr list = unpack_index_lists[l];
+        Index_type len = unpack_index_list_lengths[l];
+        if (separate_buffers) {
+          copyData(dataSpace, unpack_buffers[l],
+                   DataSpace::Host, recv_buffers[l],
+                   len*num_vars);
+        }
+
+        for (Index_type v = 0; v < num_vars; ++v) {
+          Real_ptr var = vars[v];
+          auto haloexchange_fused_unpack_base_lam = [=](Index_type i) {
+                HALOEXCHANGE_UNPACK_BODY;
+              };
+          pool_unpack.enqueue(
+              RAJA::TypedRangeSegment<Index_type>(0, len),
+              haloexchange_fused_unpack_base_lam );
+          buffer += len;
+        }
+      }
+      workgroup group_unpack = pool_unpack.instantiate();
+      worksite site_unpack = group_unpack.run();
+
+      MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  MPI_HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_TARGET_OPENMP
diff --git
a/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp new file mode 100644 index 000000000..72ea82b1f --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp @@ -0,0 +1,327 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < pack_index; j++) { + Real_ptr buffer = pack_ptr_holders[j].buffer; + Int_ptr list = pack_ptr_holders[j].list; + Real_ptr var = pack_ptr_holders[j].var; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < unpack_index; j++) { + Real_ptr buffer = unpack_ptr_holders[j].buffer; + Int_ptr list = unpack_ptr_holders[j].list; + Real_ptr var = unpack_ptr_holders[j].var; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + 
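+      // Release the flattened (pointer, length) tables allocated by the
+      // manual fuser setup above; they persist across the repetition loop.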
MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + Index_type pack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < pack_index; j++) { + auto pack_lambda = pack_lambdas[j]; + Index_type len = pack_lens[j]; + for (Index_type i = 0; i < len; i++) { + pack_lambda(i); + } + } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + Index_type unpack_index = 0; + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + } + } + for (Index_type j = 0; j < unpack_index; j++) { + auto unpack_lambda = unpack_lambdas[j]; + Index_type len = unpack_lens[j]; + for (Index_type i = 0; i < len; i++) { + unpack_lambda(i); + } + } + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + + break; + } + + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::loop_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp new file mode 100644 index 000000000..3e79b3e96 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOEXCHANGE_FUSED.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) + : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE_FUSED, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setUsesFeature(Workgroup); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + } +} + +MPI_HALOEXCHANGE_FUSED::~MPI_HALOEXCHANGE_FUSED() +{ +} + +void MPI_HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } + } +} + +void MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } + } + m_send_buffers.clear(); + m_pack_buffers.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.hpp b/src/apps/MPI_HALOEXCHANGE_FUSED.hpp new file mode 100644 index 000000000..64879f680 --- /dev/null +++ b/src/apps/MPI_HALOEXCHANGE_FUSED.hpp @@ -0,0 +1,162 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 
2017-23, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// MPI_HALOEXCHANGE_FUSED kernel reference implementation:
+///
+/// // pack message for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   Real_ptr buffer = buffers[l];
+///   Int_ptr list = pack_index_lists[l];
+///   Index_type len = pack_index_list_lengths[l];
+///   // pack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       HALOEXCHANGE_PACK_BODY;
+///     }
+///     buffer += len;
+///   }
+///   // send message to neighbor
+/// }
+///
+/// // unpack messages for each neighbor
+/// for (Index_type l = 0; l < num_neighbors; ++l) {
+///   // receive message from neighbor
+///   Real_ptr buffer = buffers[l];
+///   Int_ptr list = unpack_index_lists[l];
+///   Index_type len = unpack_index_list_lengths[l];
+///   // unpack part of each variable
+///   for (Index_type v = 0; v < num_vars; ++v) {
+///     Real_ptr var = vars[v];
+///     for (Index_type i = 0; i < len; i++) {
+///       HALOEXCHANGE_UNPACK_BODY;
+///     }
+///     buffer += len;
+///   }
+/// }
+///
+
+#ifndef RAJAPerf_Apps_MPI_HALOEXCHANGE_FUSED_HPP
+#define RAJAPerf_Apps_MPI_HALOEXCHANGE_FUSED_HPP
+
+#define MPI_HALOEXCHANGE_FUSED_DATA_SETUP \
+  HALOEXCHANGE_base_DATA_SETUP \
+  \
+  std::vector<int> mpi_ranks = m_mpi_ranks; \
+  \
+  std::vector<MPI_Request> pack_mpi_requests(num_neighbors); \
+  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors); \
+  \
+  const DataSpace dataSpace = getDataSpace(vid); \
+  \
+  const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \
+  \
+  std::vector<Real_ptr> pack_buffers = m_pack_buffers; \
+  std::vector<Real_ptr> unpack_buffers = m_unpack_buffers; \
+  \
+  std::vector<Real_ptr> send_buffers = m_send_buffers; \
+  std::vector<Real_ptr> recv_buffers = m_recv_buffers;
+
+#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \
+  struct ptr_holder { \
+    Real_ptr buffer; \
+    Int_ptr list; \
+    Real_ptr var; \
+  }; \
+  ptr_holder* pack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \
+  ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \
+  Index_type* unpack_lens = new Index_type[num_neighbors * num_vars];
+
+#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \
+  delete[] pack_ptr_holders; \
+  delete[] pack_lens; \
+  delete[] unpack_ptr_holders; \
+  delete[] unpack_lens;
+
+
+#define MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \
+  auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \
+    return [=](Index_type i) { \
+      HALOEXCHANGE_PACK_BODY; \
+    }; \
+  }; \
+  using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \
+  pack_lambda_type* pack_lambdas = reinterpret_cast<pack_lambda_type*>( \
+      malloc(sizeof(pack_lambda_type) * (num_neighbors * num_vars))); \
+  Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \
+  auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \
+    return [=](Index_type i) { \
+      HALOEXCHANGE_UNPACK_BODY; \
+    }; \
+  }; \
+  using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \
+  unpack_lambda_type* unpack_lambdas = reinterpret_cast<unpack_lambda_type*>( \
+      malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \
+  Index_type* unpack_lens = new Index_type[num_neighbors * num_vars];
+
+#define MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \
+  free(pack_lambdas); \
+  delete[] pack_lens; \
+  free(unpack_lambdas); \
+  delete[] unpack_lens;
+
+
+#include "HALOEXCHANGE_base.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base
+{
+public:
+
+  MPI_HALOEXCHANGE_FUSED(const RunParams& params);
+
+  ~MPI_HALOEXCHANGE_FUSED();
+
+  void setUp(VariantID vid, size_t tune_idx);
+  void tearDown(VariantID vid, size_t tune_idx);
+
+  void runSeqVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPVariant(VariantID vid, size_t tune_idx);
+  void runCudaVariant(VariantID vid, size_t tune_idx);
+  void runHipVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  template < size_t block_size >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runHipVariantImpl(VariantID vid);
+
+private:
+  static const size_t default_gpu_block_size = 1024;
+  using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>;
+
+  int m_mpi_size = -1;
+  int m_my_mpi_rank = -1;
+  std::array<int, 3> m_mpi_dims = {-1, -1, -1};
+
+  std::vector<Real_ptr> m_pack_buffers;
+  std::vector<Real_ptr> m_unpack_buffers;
+
+  std::vector<Real_ptr> m_send_buffers;
+  std::vector<Real_ptr> m_recv_buffers;
+};
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif // closing endif for header file include guard
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index 3108ca33f..c6c0bf126 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -87,6 +87,7 @@
 #include "apps/HALOEXCHANGE_FUSED.hpp"
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
 #include "apps/MPI_HALOEXCHANGE.hpp"
+#include "apps/MPI_HALOEXCHANGE_FUSED.hpp"
 #endif
 #include "apps/LTIMES.hpp"
 #include "apps/LTIMES_NOVIEW.hpp"
@@ -225,6 +226,7 @@ static const std::string KernelNames [] =
   std::string("Apps_HALOEXCHANGE_FUSED"),
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
   std::string("Apps_MPI_HALOEXCHANGE"),
+  std::string("Apps_MPI_HALOEXCHANGE_FUSED"),
 #endif
   std::string("Apps_LTIMES"),
   std::string("Apps_LTIMES_NOVIEW"),
@@ -892,6 +894,10 @@ KernelBase* getKernelObject(KernelID kid,
       kernel = new apps::MPI_HALOEXCHANGE(run_params);
       break;
     }
+    case Apps_MPI_HALOEXCHANGE_FUSED : {
+      kernel = new apps::MPI_HALOEXCHANGE_FUSED(run_params);
+      break;
+    }
 #endif
     case Apps_LTIMES : {
       kernel = new apps::LTIMES(run_params);
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index f1b88cc75..22b81247d 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -140,6 +140,7 @@ enum KernelID {
   Apps_HALOEXCHANGE_FUSED,
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
   Apps_MPI_HALOEXCHANGE,
+  Apps_MPI_HALOEXCHANGE_FUSED,
 #endif
   Apps_LTIMES,
   Apps_LTIMES_NOVIEW,

From 749f24fafef736a63aaa827ce3a1349a7ecae3b3 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Tue, 6 Jun 2023 08:47:15 -0700
Subject: [PATCH 022/454] fix spacing

---
 src/apps/MPI_HALOEXCHANGE-Seq.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/apps/MPI_HALOEXCHANGE-Seq.cpp
index 13b61c02a..b11acebd4 100644
--- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp
+++ b/src/apps/MPI_HALOEXCHANGE-Seq.cpp
@@ -106,7 +106,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t
 
       for (Index_type l = 0; l < num_neighbors; ++l) {
         Real_ptr buffer = pack_buffers[l];
Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { From 86ae03be0828cf6bd58c1f138aded9619ed76bd5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 6 Jun 2023 08:47:34 -0700 Subject: [PATCH 023/454] Make sync in HALOEXCHANGE consistent with MPI version --- src/apps/HALOEXCHANGE-Cuda.cpp | 4 ++-- src/apps/HALOEXCHANGE-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 22fa4e6f4..774cfa3a3 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -72,8 +72,8 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaGetLastError() ); buffer += len; } + synchronize(stream); } - synchronize(stream); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -115,8 +115,8 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) haloexchange_pack_base_lam ); buffer += len; } + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index c7cfac47a..983109665 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -73,8 +73,8 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) hipErrchk( hipGetLastError() ); buffer += len; } + synchronize(stream); } - synchronize(stream); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; @@ -117,8 +117,8 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) haloexchange_pack_base_lam ); buffer += len; } + res.wait(); } - res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = buffers[recv_tags[l]]; From 81a5c313daf1f0f2892700ffc276dd9c53606a5b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Jun 2023 09:42:29 -0700 Subject: [PATCH 024/454] automatically generate MPI 3d divisions Note that this may result in ranks communicating mostly with themselves if the number of mpi ranks does not have 3 prime factors --- src/common/Executor.cpp | 7 ++++++ src/common/RunParams.cpp | 47 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index e54bc9214..9f18cabc2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -709,6 +709,13 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + str << "\nRunning with " << run_params.getMPISize() << " MPI procs" << endl; + auto div3d = run_params.getMPI3DDivision(); + const char* valid3d = run_params.validMPI3DDivision() ? 
"" : "invalid"; + str << "\t 3D division = " << div3d[0] << " x " << div3d[1] << " x " << div3d[2] << " " << valid3d << endl; +#endif + str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; str << "\nData Spaces" diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 8c40f0b6c..06e821200 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -737,10 +737,51 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } #if defined(RAJA_PERFSUITE_ENABLE_MPI) + + // assumes number is >= 0 + // returns {0} if number is 0 + // {1} if number is 1 + // {prime factors in non-decreasing order} otherwise + auto factorize = [](int number) { + std::vector prime_factors; + int factor = 2; + while (factor <= std::sqrt(number)) { + int quotient = number / factor; + if (quotient * factor == number) { + prime_factors.emplace_back(factor); + number = quotient; + } else { + factor++; + } + } + prime_factors.emplace_back(number); + return prime_factors; + }; + + // Uses prime factors to set division + // to a relatively square grid + auto set_division = [](int* division, const int dims, + std::vector const& prime_factors) { + for (int d = 0; d < dims; ++d) { + division[d] = 1; + } + + for (int factor : prime_factors) { + + int min_d = 0; + for (int d = 1; d < dims; ++d) { + if (division[d] < division[min_d]) { + min_d = d; + } + } + + division[min_d] *= factor; + } + }; + if (mpi_3d_division[0] == -1) { - mpi_3d_division[0] = std::ceil(std::cbrt(mpi_size)); - mpi_3d_division[1] = mpi_3d_division[0]; - mpi_3d_division[2] = mpi_3d_division[0]; + std::vector prime_factors = factorize(mpi_size); + set_division(mpi_3d_division.data(), 3, prime_factors); } #endif } From a42fcf59906dd728311df90bb07d3b1842cab5d3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 9 Jun 2023 14:45:22 -0700 Subject: [PATCH 025/454] Add MPI guards in MPI_HALOEXCHANGE_FUSED --- src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 2 +- src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp | 4 ++++ src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp | 4 ++++ src/apps/MPI_HALOEXCHANGE_FUSED.cpp | 4 ++++ src/apps/MPI_HALOEXCHANGE_FUSED.hpp | 3 +++ 7 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp index 79291250d..4f1ad60f7 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/CudaDataUtils.hpp" diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp index 147f5817c..44f8b7b23 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_HIP) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/HipDataUtils.hpp" diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp index 3951a5951..934613141 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + #include namespace rajaperf @@ -396,3 +398,5 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, 
size_t RAJAPERF_UNU } // end namespace apps } // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp index 11a5356e8..9e9034c04 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_TARGET_OPENMP) +#if defined(RAJA_ENABLE_TARGET_OPENMP) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp index 72ea82b1f..cb27cc440 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + #include namespace rajaperf @@ -325,3 +327,5 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED } // end namespace apps } // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp index 3e79b3e96..ac3f20300 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + #include namespace rajaperf @@ -115,3 +117,5 @@ void MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) } // end namespace apps } // end namespace rajaperf + +#endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.hpp b/src/apps/MPI_HALOEXCHANGE_FUSED.hpp index 64879f680..ab716aec9 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED.hpp @@ -112,6 +112,8 @@ #include "RAJA/RAJA.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + namespace rajaperf { namespace apps @@ -159,4 +161,5 @@ class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base } // end namespace apps } // end namespace rajaperf +#endif #endif // closing endif for header file include guard From 0218691700d6a7248a8bdb9c887034016620fb68 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 9 Jun 2023 14:51:00 -0700 Subject: [PATCH 026/454] Add todo for MPI data movement bytes per rep --- src/apps/MPI_HALOEXCHANGE.cpp | 2 ++ src/apps/MPI_HALOEXCHANGE_FUSED.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/apps/MPI_HALOEXCHANGE.cpp index 88990fc36..158221e7e 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/apps/MPI_HALOEXCHANGE.cpp @@ -26,6 +26,8 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); + // TODO: Figure out how to count MPI data movement in BytesPerRep + setUsesFeature(Forall); setUsesFeature(MPI); diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp index ac3f20300..8ddf264c2 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED.cpp @@ -26,6 +26,8 @@ MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); + // TODO: Figure out how to count MPI data movement in BytesPerRep + setUsesFeature(Workgroup); setUsesFeature(MPI); From 194cfee4ee6f85b0ef063791db55571740de09ac Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:03:56 +0200 Subject: [PATCH 027/454] Uniformize build_and_test script: add ability to not use /dev/shm --- scripts/gitlab/build_and_test.sh | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index de837bed9..850ad80d7 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -21,17 +21,26 @@ hostname="$(hostname)" truehostname=${hostname//[0-9]/} project_dir="$(pwd)" -build_root=${BUILD_ROOT:-""} hostconfig=${HOST_CONFIG:-""} spec=${SPEC:-""} +module_list=${MODULE_LIST:-""} job_unique_id=${CI_JOB_ID:-""} +use_dev_shm=${USE_DEV_SHM:-true} + raja_version=${UPDATE_RAJA:-""} sys_type=${SYS_TYPE:-""} -use_dev_shm=${USE_DEV_SHM:-true} spack_upstream_path=${SPACK_UPSTREAM_PATH:-"/usr/workspace/umdev/RAJAPerf/upstream"} update_spack_upstream=${UPDATE_SPACK_UPSTREAM:-false} +if [[ -n ${module_list} ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Modules to load: ${module_list}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + module load ${module_list} +fi + prefix="" if [[ ${update_spack_upstream} == true ]] @@ -55,8 +64,9 @@ then prefix="${prefix}-${job_unique_id}" mkdir -p ${prefix} else - prefix="spack-and-build-root" - mkdir ${prefix} + # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree. + prefix="$(pwd)/../spack-and-build-root" + mkdir -p ${prefix} fi # Dependencies @@ -131,17 +141,8 @@ fi hostconfig=$(basename ${hostconfig_path}) # Build Directory -if [[ -z ${build_root} ]] -then - if [[ -d /dev/shm && ${use_dev_shm} == true ]] - then - build_root="${prefix}" - else - build_root="$(pwd)" - fi -else - build_root="${build_root}" -fi +# When using /dev/shm, we use prefix for both spack builds and source build, unless BUILD_ROOT was defined +build_root=${BUILD_ROOT:-"${prefix}"} build_dir="${build_root}/build_${hostconfig//.cmake/}" @@ -187,7 +188,6 @@ then mkdir -p ${build_dir} && cd ${build_dir} date - if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] then module unload rocm @@ -200,11 +200,11 @@ then echo "ERROR: compilation failed, building with verbose output..." $cmake_exe --build . --verbose -j 1 fi + date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ RAJA Perf Suite Built" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - date fi if [[ ! -d ${build_dir} ]] From 78f031508c813e5076b476015465fed0dd8d8c24 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:04:28 +0200 Subject: [PATCH 028/454] From RAJA: From RSC: Shared CI update + RSC update + Add RAJA and RAJAPerf releases --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 9b5f61edf..5e1f5062f 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 9b5f61edf3aa1e6fdbc9a4b30828c81504639963 +Subproject commit 5e1f5062fd2391e75f97b0fd2d0becf95cd12a92 From b6a0d3243b1432789a1e5b7603f8a478ed649b5e Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:31:41 +0200 Subject: [PATCH 029/454] Update Shared CI --- .gitlab-ci.yml | 24 +++++++++++++----------- .gitlab/lassen-build-and-test-extra.yml | 14 +------------- .gitlab/ruby-build-and-test-extra.yml | 8 +++++++- .gitlab/tioga-build-and-test-extra.yml | 8 +------- .uberenv_config.json | 2 +- 5 files changed, 23 insertions(+), 33 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index abc7b14b9..e7997e0d1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,13 +12,14 @@ # # This entire pipeline is LLNL-specific # -# Important note: This file is a copy of the template provided by -# llnl/radiuss-shared-ci. It should not require any change from the project to -# get started but could feature project-specific stages. +# Important note: This file is a template provided by +# llnl/radiuss-shared-ci. Changes needed consists in setting variable values, +# change the reference to the radiuss-shared-ci repo, opt-in and out optional +# features. The project can then extend it with additional stages. # -# Instead, each project should provide: -# - .gitlab/subscribed-pipelines.yml +# However, each project should provide: # - .gitlab/custom-jobs-and-variables.yml +# - .gitlab/subscribed-pipelines.yml # - .gitlab/${MACHINE}-build-and-test-extra.yml ############################################################################### @@ -40,12 +41,13 @@ variables: BUILD_ROOT: ${CI_PROJECT_DIR} # Set the build-and-test command. BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" -# Override the list of branch that will skip the "draft PR test". -# Add protected branches here. Defaults to "develop main master". -# ALWAYS_RUN_LIST: "develop main" +# Override the pattern describing branches that will skip the "draft PR test". +# Add protected branches here. See default value in +# preliminary-ignore-draft-pr.yml. +# ALWAYS_RUN_PATTERN: "^develop$|^main$|^v[0-9.]*-RC$" -# We organize the CI on Gitlab in sub-pipelines. Each sub-pipeline corresponds -# to a test phase on a given machine. +# We organize the build-and-test stage in sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. # High level stages stages: @@ -59,7 +61,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.06.0 + ref: v2023.08.0 file: '${CI_MACHINE}-build-and-test.yml' - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' strategy: depend diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml index 68850e5e8..f9610a1d1 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -21,14 +21,6 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" extends: .build_and_test_on_lassen -# Overriding shared spec: Longer allocation + extra flags -xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: - variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" - MODULE_LIST: "cuda/11.7.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen - ############ # Extra jobs @@ -37,10 +29,6 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: # ${PROJECT__DEPS} in the extra jobs. 
There is no reason not to fully # describe the spec here. -########## -# CUDA -########## - gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .build_and_test_on_lassen variables: @@ -67,5 +55,5 @@ clang_13_0_1_libcpp: # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: -# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" +# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers" # extends: .build_and_test_on_lassen diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml index da320f4f8..cffa3504c 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -25,12 +25,18 @@ gcc_10_3_1: RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" extends: .build_and_test_on_ruby -intel_19_1_2_gcc_8_5_0: +intel_19_1_2_gcc_10_3_1: variables: SPEC: " +openmp %intel@19.1.2.gcc.8.5.0" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .build_and_test_on_ruby +intel_2022_1_0: + variables: + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" + allow_failure: true + extends: .build_and_test_on_ruby + ############ # Extra jobs ############ diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/tioga-build-and-test-extra.yml index 02a2feef6..d3d054b4a 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/tioga-build-and-test-extra.yml @@ -15,8 +15,6 @@ # No overridden jobs so far. -# In post-build phase, deallocate resources. - ############ # Extra jobs ############ @@ -24,11 +22,7 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -# With GitLab CI, included files cannot be empty. -#variables: -# INCLUDED_FILE_CANNOT_BE_EMPTY: "True" - rocmcc_5_4_3_hip_openmp: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@develop" extends: .build_and_test_on_tioga diff --git a/.uberenv_config.json b/.uberenv_config.json index e2353e1c9..79039830f 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "e4s-23.02", +"spack_branch": "v0.20.1", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", From 6c316925ed234cf5b1faa4f7e72a27d49c302f5f Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:55:34 +0200 Subject: [PATCH 030/454] Fix spec --- .gitlab/ruby-build-and-test-extra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml index cffa3504c..965142c5f 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -27,7 +27,7 @@ gcc_10_3_1: intel_19_1_2_gcc_10_3_1: variables: - SPEC: " +openmp %intel@19.1.2.gcc.8.5.0" + SPEC: " +openmp %intel@19.1.2.gcc.10.3.1" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .build_and_test_on_ruby From 39cc7a136508dd04bf3a81706415efd655f86a2b Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 29 Aug 2023 19:12:24 +0200 Subject: [PATCH 031/454] Update RAJA to develop with Shared CI 2023-08-0 changes --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 5e1f5062f..e78b1eb03 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 5e1f5062fd2391e75f97b0fd2d0becf95cd12a92 +Subproject commit e78b1eb03cbcd9f954c9f54ea79b5f6f479bde45 From 9104970e81efd5ebf37b3089f6c1c9230df657db Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 10:33:29 -0700 Subject: [PATCH 032/454] Disable INDEXLIST Base_HIP It deadlocks sometimes. --- src/basic/INDEXLIST.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index cb559c8b2..45737a2e4 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -48,7 +48,10 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_CUDA ); - setVariantDefined( Base_HIP ); + if (0) { + // deadlocks + setVariantDefined( Base_HIP ); + } } INDEXLIST::~INDEXLIST() From cbf4a8bb071633ba17647e38ef152d5cb14149eb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 10:54:27 -0700 Subject: [PATCH 033/454] Update indexlist comments --- src/basic/INDEXLIST-Cuda.cpp | 2 +- src/basic/INDEXLIST-Hip.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index cb6c88a9e..861341b04 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -208,7 +208,7 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher + // blocks do start running in order in cuda, so a block with a higher // index can wait on a block with a lower index without deadlocking // (replace with an atomicInc if this changes) const int block_id = blockIdx.x; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9b0555057..5d36e20aa 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -208,9 +208,9 @@ __global__ void indexlist(Real_ptr x, Index_type* len, Index_type iend) { - // blocks do start running in order in cuda and hip, so a block with a higher - // index can wait on a block with a lower index without deadlocking - // (replace with an atomicInc if this changes) + // It looks like blocks do not start running in order in hip, so a block + // with a higher index can't wait on a block with a lower index without + // deadlocking (have to replace with an atomicInc) const int block_id = blockIdx.x; Index_type vals[items_per_thread]; From e44b20285244acfdc255a18bd9fbba9db73aa383 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:40:24 -0700 Subject: [PATCH 034/454] Add occupancy calculator wrapper That is the occupancy calculator functions in Cuda and Hip --- src/common/CudaDataUtils.hpp | 37 ++++++++++++++++++++++++++++++++++++ src/common/HipDataUtils.hpp | 27 ++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 0c3504d69..7907b7286 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -84,6 +84,43 @@ __global__ void lambda_cuda(Lambda body) namespace detail { +/*! + * \brief Get current cuda device. 
+ */
+inline int getCudaDevice()
+{
+  int device = -1;
+  cudaErrchk( cudaGetDevice( &device ) );
+  return device;
+}
+
+/*!
+ * \brief Get properties of the current cuda device.
+ */
+inline cudaDeviceProp getCudaDeviceProp()
+{
+  cudaDeviceProp prop;
+  cudaErrchk(cudaGetDeviceProperties(&prop, getCudaDevice()));
+  return prop;
+}
+
+/*!
+ * \brief Get max occupancy in blocks for the given kernel for the current
+ *        cuda device.
+ */
+template < typename Func >
+RAJA_INLINE
+int getCudaOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size)
+{
+  int max_blocks = -1;
+  cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, func, num_threads, shmem_size));
+
+  size_t multiProcessorCount = getCudaDeviceProp().multiProcessorCount;
+
+  return max_blocks * multiProcessorCount;
+}
+
 /*
  * Copy memory len bytes from src to dst.
  */
diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp
index 8046fe785..dbfc31c2a 100644
--- a/src/common/HipDataUtils.hpp
+++ b/src/common/HipDataUtils.hpp
@@ -81,6 +81,33 @@ inline int getHipDevice()
   return device;
 }
 
+/*!
+ * \brief Get properties of the current hip device.
+ */
+inline hipDeviceProp_t getHipDeviceProp()
+{
+  hipDeviceProp_t prop;
+  hipErrchk(hipGetDeviceProperties(&prop, getHipDevice()));
+  return prop;
+}
+
+/*!
+ * \brief Get max occupancy in blocks for the given kernel for the current
+ *        hip device.
+ */
+template < typename Func >
+RAJA_INLINE
+int getHipOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size)
+{
+  int max_blocks = -1;
+  hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+      &max_blocks, func, num_threads, shmem_size));
+
+  size_t multiProcessorCount = getHipDeviceProp().multiProcessorCount;
+
+  return max_blocks * multiProcessorCount;
+}
+
 /*
  * Copy memory len bytes from src to dst.
  */

From e5ffd62da81750ae9b1bcb41efb0f5f52d2c5f5a Mon Sep 17 00:00:00 2001
From: Jason Burmark 
Date: Wed, 27 Sep 2023 14:38:15 -0700
Subject: [PATCH 035/454] Add occgs version of REDUCE_SUM

occgs means "occupancy calculator grid size": the occupancy calculator is
used to determine the maximum number of blocks to launch, and work is mapped
to those blocks using a grid-stride loop.
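
In code terms, the Base variants clamp the launch like this (taken from the
per-kernel diffs below; shmem is the dynamic shared memory per block):

    const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
        (reduce_sum<block_size>), block_size, shmem);
    const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
    const size_t grid_size = std::min(normal_grid_size, max_grid_size);

Clamping is only correct because the kernels iterate with a grid-stride
loop, whose shape is roughly the following (illustrative sketch, not part of
this diff; the kernel name and body are placeholders):

    template < size_t block_size >
    __global__ void grid_stride_kernel(Real_ptr x, Index_type iend)
    {
      // each thread handles every (gridDim.x * block_size)-th element, so
      // correctness does not depend on how many blocks are launched
      for (Index_type i = blockIdx.x * block_size + threadIdx.x;
           i < iend;
           i += gridDim.x * block_size) {
        x[i] += 1.0;  // placeholder body
      }
    }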
--- src/algorithm/REDUCE_SUM-Cuda.cpp | 110 ++++++++++++++++++++++-------- src/algorithm/REDUCE_SUM-Hip.cpp | 109 +++++++++++++++++++++-------- src/algorithm/REDUCE_SUM.hpp | 4 ++ 3 files changed, 169 insertions(+), 54 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index d36614f9e..a28f68777 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -18,6 +18,8 @@ #include "cub/util_allocator.cuh" #include +#include + namespace rajaperf { @@ -190,11 +192,79 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } -void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +template < size_t block_size > +void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + if ( vid == Base_CUDA ) { - size_t t = 0; + Real_ptr dsum; + allocData(DataSpace::CudaDevice, dsum, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (reduce_sum), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + reduce_sum<<>>( x, + dsum, m_sum_init, + iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaMemcpyAsync( &m_sum, dsum, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, dsum); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { if (tune_idx == t) { @@ -204,12 +274,17 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { if (tune_idx == t) { + setBlockSize(block_size); runCudaVariantBlock(vid); @@ -217,22 +292,10 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; - } - - }); - - } else if ( vid == RAJA_CUDA ) { - - size_t t = 0; - - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runCudaVariantBlock(vid); + setBlockSize(block_size); + runCudaVariantOccGS(vid); } @@ -256,18 +319,9 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, "cub"); - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); - - } + } - }); - - } else if ( vid == RAJA_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -276,6 +330,8 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + } }); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 88a16f331..60f1af923 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -23,6 +23,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -216,11 +218,78 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } -void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +template < size_t block_size > +void REDUCE_SUM::runHipVariantOccGS(VariantID vid) { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + if ( vid == Base_HIP ) { - size_t t = 0; + Real_ptr dsum; + allocData(DataSpace::HipDevice, dsum, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (reduce_sum), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), + x, dsum, m_sum_init, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipMemcpyAsync( &m_sum, dsum, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, dsum); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { if (tune_idx == t) { @@ -230,12 +299,17 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) t += 1; + } + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { if (tune_idx == t) { + setBlockSize(block_size); runHipVariantBlock(vid); @@ -243,22 +317,10 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) t += 1; - } - - }); - - } else if ( vid == RAJA_HIP ) { - - size_t t = 0; - - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - runHipVariantBlock(vid); + setBlockSize(block_size); + runHipVariantOccGS(vid); } @@ -286,18 +348,9 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) 
addVariantTuningName(vid, "cub"); #endif - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - addVariantTuningName(vid, "block_"+std::to_string(block_size)); - - } + } - }); - - } else if ( vid == RAJA_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -306,6 +359,8 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + } }); diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index ba9e9308b..9174b2170 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -66,7 +66,11 @@ class REDUCE_SUM : public KernelBase template < size_t block_size > void runCudaVariantBlock(VariantID vid); template < size_t block_size > + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 5e3922a4bdfa83311bb78396a8c5e27774eb8d7f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:38:29 -0700 Subject: [PATCH 036/454] Add occgs version of PI_REDUCE --- src/basic/PI_REDUCE-Cuda.cpp | 131 ++++++++++++++++++++++++++++++++++- src/basic/PI_REDUCE-Hip.cpp | 131 ++++++++++++++++++++++++++++++++++- src/basic/PI_REDUCE.hpp | 8 ++- 3 files changed, 264 insertions(+), 6 deletions(-) diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index b0577bc58..eca0dd168 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -15,6 +15,8 @@ #include "common/CudaDataUtils.hpp" #include +#include + namespace rajaperf { @@ -59,7 +61,7 @@ __global__ void pi_reduce(Real_type dx, template < size_t block_size > -void PI_REDUCE::runCudaVariantImpl(VariantID vid) +void PI_REDUCE::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -120,7 +122,132 @@ void PI_REDUCE::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Cuda) +template < size_t block_size > +void PI_REDUCE::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + Real_ptr dpi; + allocData(DataSpace::CudaDevice, dpi, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (pi_reduce), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + pi_reduce<<>>( dx, + dpi, m_pi_init, + iend ); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaMemcpyAsync( &m_pi, dpi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_pi *= 4.0; + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, dpi); + + } else if ( vid == RAJA_CUDA ) { + + 
startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index dd56426c2..5a28adb85 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -15,6 +15,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -59,7 +61,7 @@ __global__ void pi_reduce(Real_type dx, template < size_t block_size > -void PI_REDUCE::runHipVariantImpl(VariantID vid) +void PI_REDUCE::runHipVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -119,7 +121,132 @@ void PI_REDUCE::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Hip) +template < size_t block_size > +void PI_REDUCE::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == Base_HIP ) { + + Real_ptr dpi; + allocData(DataSpace::HipDevice, dpi, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (pi_reduce), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), + dx, dpi, m_pi_init, iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipMemcpyAsync( &m_pi, dpi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + m_pi *= 4.0; + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, dpi); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + 
RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void PI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 49fca096d..4efdf6d21 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -60,9 +60,13 @@ class PI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 5215bb1a47751d4df50136257d7f33cc7913b5f8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:39:00 -0700 Subject: [PATCH 037/454] Add occgs version of REDUCE3_INT --- src/basic/REDUCE3_INT-Cuda.cpp | 147 +++++++++++++++++++++++++++++++- src/basic/REDUCE3_INT-Hip.cpp | 148 ++++++++++++++++++++++++++++++++- src/basic/REDUCE3_INT.hpp | 8 +- 3 files changed, 297 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 6843dcab3..e2aebdf4c 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -15,6 +15,8 @@ #include "common/CudaDataUtils.hpp" #include +#include + namespace rajaperf { @@ -74,7 +76,7 @@ __global__ void reduce3int(Int_ptr vec, template < size_t block_size > -void REDUCE3_INT::runCudaVariantImpl(VariantID vid) +void REDUCE3_INT::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -151,7 +153,148 @@ void REDUCE3_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Cuda) +template < size_t block_size > +void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if 
( vid == Base_CUDA ) { + + Int_ptr vmem_init; + allocData(DataSpace::CudaPinned, vmem_init, 3); + + Int_ptr vmem; + allocData(DataSpace::CudaDevice, vmem, 3); + + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (reduce3int), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + vmem_init[0] = m_vsum_init; + vmem_init[1] = m_vmin_init; + vmem_init[2] = m_vmax_init; + cudaErrchk( cudaMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + reduce3int<<>>(vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + cudaErrchk( cudaGetLastError() ); + + Int_type lmem[3]; + cudaErrchk( cudaMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_vsum += lmem[0]; + m_vmin = RAJA_MIN(m_vmin, lmem[1]); + m_vmax = RAJA_MAX(m_vmax, lmem[2]); + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, vmem); + deallocData(DataSpace::CudaPinned, vmem_init); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index bd524565a..32c1c8244 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -15,6 +15,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -74,7 +76,7 @@ __global__ void reduce3int(Int_ptr vec, template < size_t block_size > -void REDUCE3_INT::runHipVariantImpl(VariantID vid) +void REDUCE3_INT::runHipVariantBlock(VariantID vid) { const Index_type run_reps = 
getRunReps(); const Index_type ibegin = 0; @@ -151,7 +153,149 @@ void REDUCE3_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Hip) +template < size_t block_size > +void REDUCE3_INT::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + Int_ptr vmem_init; + allocData(DataSpace::HipPinned, vmem_init, 3); + + Int_ptr vmem; + allocData(DataSpace::HipDevice, vmem, 3); + + constexpr size_t shmem = 3*sizeof(Int_type)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (reduce3int), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + vmem_init[0] = m_vsum_init; + vmem_init[1] = m_vmin_init; + vmem_init[2] = m_vmax_init; + hipErrchk( hipMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); + hipErrchk( hipGetLastError() ); + + Int_type lmem[3]; + hipErrchk( hipMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + m_vsum += lmem[0]; + m_vmin = RAJA_MIN(m_vmin, lmem[1]); + m_vmax = RAJA_MAX(m_vmax, lmem[2]); + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, vmem); + deallocData(DataSpace::HipPinned, vmem_init); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace basic } // end 
namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index e82c2cf05..01b7f226e 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -75,9 +75,13 @@ class REDUCE3_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From b5134089912b8a9b5330fa77f66c1d11f618e94e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:39:10 -0700 Subject: [PATCH 038/454] Add occgs version of REDUCE_STRUCT --- src/basic/REDUCE_STRUCT-Cuda.cpp | 152 +++++++++++++++++++++++++++++- src/basic/REDUCE_STRUCT-Hip.cpp | 153 ++++++++++++++++++++++++++++++- src/basic/REDUCE_STRUCT.hpp | 8 +- 3 files changed, 307 insertions(+), 6 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 2961af4cc..0025d0825 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -15,6 +15,8 @@ #include "common/CudaDataUtils.hpp" #include +#include + namespace rajaperf { @@ -96,7 +98,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) +void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -178,7 +180,153 @@ void REDUCE_STRUCT::runCudaVariantImpl(VariantID vid) } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Cuda) +template < size_t block_size > +void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax + allocData(DataSpace::CudaDevice, mem,6); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (reduce_struct), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + reduce_struct<<>>( + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N); + cudaErrchk( cudaGetLastError() ); + + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + cudaErrchk( cudaMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); + points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, mem); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + 
for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 236e3e7f2..e9c8d8d65 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -15,6 +15,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -97,7 +99,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, template < size_t block_size > -void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) +void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -180,8 +182,155 @@ void REDUCE_STRUCT::runHipVariantImpl(VariantID vid) } } +template < size_t block_size > +void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax + allocData(DataSpace::HipDevice, mem,6); + + constexpr size_t shmem = 6*sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (reduce_struct), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL((reduce_struct), + dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), + points.x, points.y, + mem, mem+1, mem+2, // 
xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N); + hipErrchk( hipGetLastError() ); -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_STRUCT, Hip) + Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + hipErrchk( hipMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); + points.SetXMin(lmem[1]); + points.SetXMax(lmem[2]); + points.SetYMin(lmem[4]); + points.SetYMax(lmem[5]); + m_points=points; + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, mem); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } + +} + +void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 425e7796e..063acd5b3 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -90,9 +90,13 @@ class REDUCE_STRUCT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); struct PointsType { Int_type N; From b0972d201ad4a756ad9218319f73461c43142766 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:39:21 -0700 Subject: [PATCH 039/454] Add occgs version of TRAP_INT --- 
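Note on the "occgs" tunings added in this stretch of the series (REDUCE3_INT, REDUCE_STRUCT, TRAP_INT, FIRST_MIN, DOT): each one queries the occupancy-derived maximum number of resident blocks, caps the launch at that count, and relies on a grid-stride loop inside the kernel to cover the remaining iterations. A minimal standalone sketch of the sizing logic, with my_kernel and occupancy_grid_size as illustrative names (the suite's detail::getCudaOccupancyMaxBlocks helper presumably wraps a similar occupancy query):

  // Sketch only: occupancy-capped grid sizing for a grid-stride kernel.
  #include <cuda_runtime.h>
  #include <algorithm>
  #include <cstddef>

  __global__ void my_kernel(double* x, size_t n)  // illustrative kernel
  {
    // grid-stride loop: thread t handles t, t + stride, t + 2*stride, ...
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
      x[i] *= 2.0;
    }
  }

  size_t occupancy_grid_size(size_t n, int block_size, size_t shmem)
  {
    int blocks_per_sm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, my_kernel,
                                                  block_size, shmem);
    int num_sm = 0;
    cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0);
    const size_t max_grid_size    = (size_t)blocks_per_sm * (size_t)num_sm;
    const size_t normal_grid_size = (n + block_size - 1) / block_size;
    // never launch more blocks than the device can keep resident at once
    return std::min(normal_grid_size, max_grid_size);
  }

With the cap in place every launched block can be resident simultaneously, so a single grid-stride pass covers the full range without oversubscribing the device.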
src/basic/TRAP_INT-Cuda.cpp | 134 +++++++++++++++++++++++++++++++++++- src/basic/TRAP_INT-Hip.cpp | 134 +++++++++++++++++++++++++++++++++++- src/basic/TRAP_INT.hpp | 8 ++- 3 files changed, 270 insertions(+), 6 deletions(-) diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index f4de3cf6a..2f46569d6 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -15,6 +15,8 @@ #include "common/CudaDataUtils.hpp" #include +#include + namespace rajaperf { @@ -79,7 +81,7 @@ __global__ void trapint(Real_type x0, Real_type xp, template < size_t block_size > -void TRAP_INT::runCudaVariantImpl(VariantID vid) +void TRAP_INT::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -143,7 +145,135 @@ void TRAP_INT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Cuda) +template < size_t block_size > +void TRAP_INT::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + Real_ptr sumx; + allocData(DataSpace::CudaDevice, sumx, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (trapint), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + trapint<<>>(x0, xp, + y, yp, + h, + sumx, + iend); + cudaErrchk( cudaGetLastError() ); + + Real_type lsumx; + cudaErrchk( cudaMemcpyAsync( &lsumx, sumx, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, sumx); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + 
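// the "occgs_" entry registered next names the occupancy-capped grid-stride tuning for the same block size +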
addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 1b5f4b2be..d2655d559 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -15,6 +15,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -79,7 +81,7 @@ __global__ void trapint(Real_type x0, Real_type xp, template < size_t block_size > -void TRAP_INT::runHipVariantImpl(VariantID vid) +void TRAP_INT::runHipVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -142,7 +144,135 @@ void TRAP_INT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Hip) +template < size_t block_size > +void TRAP_INT::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + Real_ptr sumx; + allocData(DataSpace::HipDevice, sumx, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (trapint), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x0, xp, + y, yp, + h, + sumx, + iend); + hipErrchk( hipGetLastError() ); + + Real_type lsumx; + hipErrchk( hipMemcpyAsync( &lsumx, sumx, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, sumx); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void TRAP_INT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, 
"occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index e64932dbe..4f13c9eca 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -72,9 +72,13 @@ class TRAP_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 92966d4cc03f8762438e9c9741e458737426a971 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:39:29 -0700 Subject: [PATCH 040/454] Add occgs version of FIRST_MIN --- src/lcals/FIRST_MIN-Cuda.cpp | 141 +++++++++++++++++++++++++++++++++- src/lcals/FIRST_MIN-Hip.cpp | 145 ++++++++++++++++++++++++++++++++++- src/lcals/FIRST_MIN.hpp | 8 +- 3 files changed, 288 insertions(+), 6 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index e7d860877..0efac6950 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -15,6 +15,8 @@ #include "common/CudaDataUtils.hpp" #include +#include + namespace rajaperf { @@ -57,7 +59,7 @@ __global__ void first_min(Real_ptr x, template < size_t block_size > -void FIRST_MIN::runCudaVariantImpl(VariantID vid) +void FIRST_MIN::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -127,7 +129,142 @@ void FIRST_MIN::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Cuda) +template < size_t block_size > +void FIRST_MIN::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + + MyMinLoc* dminloc; + cudaErrchk( cudaMalloc( (void**)&dminloc, + grid_size * sizeof(MyMinLoc) ) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + FIRST_MIN_MINLOC_INIT; + + first_min<<>>(x, dminloc, mymin, iend); + cudaErrchk( cudaGetLastError() ); + + cudaErrchk( cudaMemcpyAsync( mymin_block, dminloc, + grid_size * sizeof(MyMinLoc), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + + for (Index_type i = 0; i < static_cast(grid_size); i++) { + if ( mymin_block[i].val < mymin.val ) { + mymin = mymin_block[i]; + } + } + m_minloc = RAJA_MAX(m_minloc, mymin.loc); + + } + stopTimer(); + + cudaErrchk( cudaFree( dminloc ) ); + delete[] mymin_block; + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + 
RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index bb106ce0e..a9fb41c74 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -15,6 +15,8 @@ #include "common/HipDataUtils.hpp" #include +#include + namespace rajaperf { @@ -57,7 +59,7 @@ __global__ void first_min(Real_ptr x, template < size_t block_size > -void FIRST_MIN::runHipVariantImpl(VariantID vid) +void FIRST_MIN::runHipVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -130,7 +132,146 @@ void FIRST_MIN::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Hip) +template < size_t block_size > +void FIRST_MIN::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == Base_HIP ) { + + constexpr size_t shmem = sizeof(MyMinLoc)*block_size; + const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( + (first_min), block_size, shmem); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + + MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value + + MyMinLoc* dminloc; + hipErrchk( hipMalloc( (void**)&dminloc, + grid_size * sizeof(MyMinLoc) ) ); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + FIRST_MIN_MINLOC_INIT; + + hipLaunchKernelGGL( (first_min), grid_size, block_size, + shmem, res.get_stream(), x, + dminloc, + mymin, + iend ); + hipErrchk( hipGetLastError() ); + + hipErrchk( hipMemcpyAsync( mymin_block, dminloc, + grid_size * sizeof(MyMinLoc), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + + for (Index_type i = 0; i < static_cast(grid_size); i++) { + if ( mymin_block[i].val < mymin.val ) { + mymin = mymin_block[i]; + } + } + m_minloc = mymin.loc; + + } + stopTimer(); + + hipErrchk( hipFree( dminloc ) ); + delete[] mymin_block; + + } else if ( vid == 
RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + +void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void FIRST_MIN::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index dd00d4392..1431dad62 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -84,9 +84,13 @@ class FIRST_MIN : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From f4173674f81fb5449056ac3e77f84872bc017603 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 27 Sep 2023 14:39:37 -0700 Subject: [PATCH 041/454] Add occgs version of DOT --- src/stream/DOT-Cuda.cpp | 129 ++++++++++++++++++++++++++++++++++++++- src/stream/DOT-Hip.cpp | 131 +++++++++++++++++++++++++++++++++++++++- src/stream/DOT.hpp | 8 ++- 3 files changed, 262 insertions(+), 6 deletions(-) diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index fbecc979f..ddbf36128 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -15,6 +15,7 @@ #include "common/CudaDataUtils.hpp" #include +#include namespace rajaperf @@ -59,7 +60,7 @@ __global__ void dot(Real_ptr a, Real_ptr b, template < size_t block_size > -void DOT::runCudaVariantImpl(VariantID vid) +void DOT::runCudaVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -119,7 +120,131 @@ void DOT::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Cuda) +template < size_t block_size > +void DOT::runCudaVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + 
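// DOT_DATA_SETUP is the suite macro that creates the local a and b aliases used in the kernel launches below +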
DOT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + Real_ptr dprod; + allocData(DataSpace::CudaDevice, dprod, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( + (dot), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + dot<<>>( + a, b, dprod, m_dot_init, iend ); + cudaErrchk( cudaGetLastError() ); + + Real_type lprod; + cudaErrchk( cudaMemcpyAsync( &lprod, dprod, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_dot += lprod; + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, dprod); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +void DOT::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void DOT::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } +} } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 7bd1ef277..340807cad 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -15,6 +15,7 @@ #include "common/HipDataUtils.hpp" #include +#include namespace rajaperf @@ -60,7 +61,7 @@ __global__ void dot(Real_ptr a, Real_ptr b, template < size_t block_size > -void DOT::runHipVariantImpl(VariantID vid) +void DOT::runHipVariantBlock(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -121,7 +122,133 @@ void DOT::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Hip) +template < size_t block_size > +void DOT::runHipVariantOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + DOT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + Real_ptr dprod; + allocData(DataSpace::HipDevice, dprod, 1); + + constexpr size_t shmem = sizeof(Real_type)*block_size; + const size_t max_grid_size = 
detail::getHipOccupancyMaxBlocks( + (dot), block_size, shmem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + + const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = std::min(normal_grid_size, max_grid_size); + hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), + a, b, dprod, m_dot_init, iend ); + hipErrchk( hipGetLastError() ); + + Real_type lprod; + hipErrchk( hipMemcpyAsync( &lprod, dprod, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); + m_dot += lprod; + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, dprod); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + } +} + +void DOT::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantOccGS(vid); + + } + + t += 1; + + } + + }); + + } else { + + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void DOT::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 5912c120a..856caef14 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -56,9 +56,13 @@ class DOT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runCudaVariantOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlock(VariantID vid); + template < size_t block_size > + void runHipVariantOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 75dc718700da6abb4c33b354e593fa22edaab8e5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 29 Sep 2023 14:00:44 -0700 Subject: [PATCH 042/454] Make INDEXLIST_3LOOP implementations consistent This means reading the last member of the counts array instead of using a reducer for the RAJA variants. 
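Note: a minimal sequential sketch of the three-loop pattern this patch settles on; build_index_list and the x[i] < 0.0 predicate are illustrative stand-ins for the suite's macros. After the exclusive scan, counts[n] already holds the final list length, so the scatter loop no longer needs a sum reduction:

  // Sketch: 3-loop index-list build; counts has n+1 entries.
  #include <vector>
  #include <cstddef>

  std::vector<std::size_t> build_index_list(const std::vector<double>& x)
  {
    const std::size_t n = x.size();
    std::vector<std::size_t> counts(n + 1, 0);

    // loop 1: flag the entries that belong in the list
    for (std::size_t i = 0; i < n; ++i) {
      counts[i] = (x[i] < 0.0) ? 1 : 0;  // illustrative predicate
    }

    // loop 2: exclusive scan; counts[i] becomes the output slot for entry i
    std::size_t running = 0;
    for (std::size_t i = 0; i <= n; ++i) {
      const std::size_t c = (i < n) ? counts[i] : 0;
      counts[i] = running;
      running += c;
    }
    // counts[n] now equals the total number of flagged entries

    // loop 3: scatter flagged indices; the length is read from counts[n]
    std::vector<std::size_t> list(counts[n]);
    for (std::size_t i = 0; i < n; ++i) {
      if (counts[i] != counts[i + 1]) {
        list[counts[i]] = i;
      }
    }
    return list;
  }

The device variants read that last scan entry back through pinned memory instead of carrying a RAJA::ReduceSum through the scatter loop, which is what lets every variant report m_len the same way.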
--- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 14 ++++++++++---- src/basic/INDEXLIST_3LOOP-Hip.cpp | 14 ++++++++++---- src/basic/INDEXLIST_3LOOP-OMP.cpp | 5 +---- src/basic/INDEXLIST_3LOOP-Seq.cpp | 5 +---- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 7b6a9ade6..878e11d2b 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -133,11 +133,12 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_CUDA; + Index_type* len; + allocData(DataSpace::CudaPinned, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -152,15 +153,20 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::CudaPinned, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_CUDA; } else { diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index b4d0d26f8..3defd94d1 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -155,11 +155,12 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; + Index_type* len; + allocData(DataSpace::HipPinned, len, 1); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -174,15 +175,20 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) [=] __device__ (Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; + } + if (i == iend-1) { + *len = counts[i+1]; } }); - m_len = len.get(); + res.wait(); + m_len = *len; } stopTimer(); + deallocData(DataSpace::HipPinned, len); + INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; } else { diff --git a/src/basic/INDEXLIST_3LOOP-OMP.cpp b/src/basic/INDEXLIST_3LOOP-OMP.cpp index d84736ef7..57cb14c23 100644 --- a/src/basic/INDEXLIST_3LOOP-OMP.cpp +++ b/src/basic/INDEXLIST_3LOOP-OMP.cpp @@ -203,8 +203,6 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -219,11 +217,10 @@ void INDEXLIST_3LOOP::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = len.get(); + m_len = counts[iend]; } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP-Seq.cpp b/src/basic/INDEXLIST_3LOOP-Seq.cpp index 9de3f3393..3828c5652 100644 --- a/src/basic/INDEXLIST_3LOOP-Seq.cpp +++ b/src/basic/INDEXLIST_3LOOP-Seq.cpp @@ -117,8 +117,6 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum len(0); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { @@ -133,11 +131,10 @@ void INDEXLIST_3LOOP::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu [=](Index_type i) { if (counts[i] != counts[i+1]) { list[counts[i]] = i; - len += 1; } }); - m_len = 
len.get(); + m_len = counts[iend]; } stopTimer(); From 42d9be79269b52e731e4db92f9472b9252701489 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 2 Oct 2023 10:48:39 -0700 Subject: [PATCH 043/454] Add --allow-problematic-implementations --- src/basic/INDEXLIST.cpp | 7 ++++--- src/common/RunParams.cpp | 11 +++++++++++ src/common/RunParams.hpp | 5 +++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 45737a2e4..27aaec8b4 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -46,10 +46,11 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_OpenMPTarget ); #endif - setVariantDefined( Base_CUDA ); + if (params.getAllowProblematicImplementations()) { + // these may deadlock depending on the order that blocks are scheduled + + setVariantDefined( Base_CUDA ); - if (0) { - // deadlocks setVariantDefined( Base_HIP ); } } diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 96e6821a8..b988cbded 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -68,6 +68,7 @@ RunParams::RunParams(int argc, char** argv) add_to_spot_config(), #endif disable_warmup(false), + allow_problematic_implementations(false), run_kernels(), run_variants() { @@ -133,6 +134,8 @@ void RunParams::print(std::ostream& str) const #endif str << "\n disable_warmup = " << disable_warmup; + str << "\n allow_problematic_implementations = " + << allow_problematic_implementations; str << "\n seq data space = " << getDataSpaceName(seqDataSpace); str << "\n omp data space = " << getDataSpaceName(ompDataSpace); @@ -693,6 +696,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) disable_warmup = true; + } else if ( std::string(argv[i]) == + std::string("--allow-problematic-implementations") ) { + + allow_problematic_implementations = true; + } else if ( std::string(argv[i]) == std::string("--checkrun") ) { input_state = CheckRun; @@ -829,6 +837,9 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --disable-warmup (disable warmup kernels) [Default is run warmup kernels that are relevant to kernels selected to run]\n\n"; + str << "\t --allow-problematic-implementations (allow problematic kernel implementations) [Default is to not allow problematic kernel implementations to run]\n" + << "\t These implementations may deadlock causing the code to hang indefinitely.\n\n"; + str << "\t --kernels, -k [Default is run all]\n" << "\t (names of individual kernels and/or groups of kernels to run)\n" << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index bfa8f8896..6d6402f6a 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -160,6 +160,9 @@ class RunParams { bool getDisableWarmup() const { return disable_warmup; } + bool getAllowProblematicImplementations() const + { return allow_problematic_implementations; } + const std::set& getKernelIDsToRun() const { return run_kernels; } const std::set& getVariantIDsToRun() const { return run_variants; } VariantID getReferenceVariantID() const { return reference_vid; } @@ -262,6 +265,8 @@ class RunParams { bool disable_warmup; + bool allow_problematic_implementations; + std::set run_kernels; std::set run_variants; From 81cbb65cf10f0caa55069339498f3a4ba72910ba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 4 Oct 2023 09:50:10 -0700 Subject: [PATCH 044/454] Improve spacing in kernel launches --- 
src/algorithm/REDUCE_SUM-Cuda.cpp | 2 +- src/basic/PI_REDUCE-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Hip.cpp | 3 ++- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- src/basic/TRAP_INT-Cuda.cpp | 2 +- src/basic/TRAP_INT-Hip.cpp | 3 ++- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index a28f68777..822a33eaf 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -221,7 +221,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); reduce_sum<<>>( x, + shmem, res.get_stream()>>>( x, dsum, m_sum_init, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index eca0dd168..e3cd15273 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -151,7 +151,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); pi_reduce<<>>( dx, + shmem, res.get_stream()>>>( dx, dpi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index e2aebdf4c..d5572ddac 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -188,7 +188,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); reduce3int<<>>(vec, + shmem, res.get_stream()>>>(vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, vmem + 2, m_vmax_init, diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 32c1c8244..9f429b8e6 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -187,7 +187,8 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), vec, vmem + 0, m_vsum_init, vmem + 1, m_vmin_init, diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index e9c8d8d65..2b7213c8b 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -212,7 +212,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) hipLaunchKernelGGL((reduce_struct), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - points.x, points.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 2f46569d6..2b1d62851 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -174,7 +174,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); trapint<<>>(x0, xp, + shmem, res.get_stream()>>>(x0, xp, y, yp, h, sumx, diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 
d2655d559..a092ecba8 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -172,7 +172,8 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x0, xp, + hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), + shmem, res.get_stream(), x0, xp, y, yp, h, sumx, From 29c81aaed70b0f586e2179b00a6e6af8abb8db87 Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Fri, 6 Oct 2023 13:58:19 -0500 Subject: [PATCH 045/454] Changing len type from int to Size_type. Setting Size_type to size_t. --- src/common/DataUtils.cpp | 52 +++++++++++++++++------------------ src/common/DataUtils.hpp | 58 +++++++++++++++++++-------------------- src/common/KernelBase.hpp | 20 +++++++------- src/common/RPTypes.hpp | 2 ++ 4 files changed, 67 insertions(+), 65 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index f1831cc1f..8097a603b 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -123,7 +123,7 @@ void incDataInitCount() /* * Copy memory len bytes from src to dst. */ -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) +void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len) { std::memcpy(dst_ptr, src_ptr, len); } @@ -132,7 +132,7 @@ void copyHostData(void* dst_ptr, const void* src_ptr, size_t len) /* * Allocate data arrays of given type. */ -void* allocHostData(size_t len, size_t align) +void* allocHostData(Size_type len, size_t align) { return RAJA::allocate_aligned_type( align, len); @@ -153,7 +153,7 @@ void deallocHostData(void* ptr) /* * Allocate data arrays of given dataSpace. */ -void* allocData(DataSpace dataSpace, int nbytes, int align) +void* allocData(DataSpace dataSpace, Size_type nbytes, int align) { void* ptr = nullptr; @@ -257,7 +257,7 @@ void* allocData(DataSpace dataSpace, int nbytes, int align) */ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, - size_t nbytes) + Size_type nbytes) { if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { @@ -369,23 +369,23 @@ void deallocData(DataSpace dataSpace, void* ptr) * \brief Initialize Int_type data array to * randomly signed positive and negative values. */ -void initData(Int_ptr& ptr, int len) +void initData(Int_ptr& ptr, Size_type len) { srand(4793); Real_type signfact = 0.0; - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { signfact = Real_type(rand())/RAND_MAX; ptr[i] = ( signfact < 0.5 ? -1 : 1 ); }; signfact = Real_type(rand())/RAND_MAX; - Int_type ilo = len * signfact; + Size_type ilo = len * signfact; ptr[ilo] = -58; signfact = Real_type(rand())/RAND_MAX; - Int_type ihi = len * signfact; + Size_type ihi = len * signfact; ptr[ihi] = 19; incDataInitCount(); @@ -396,11 +396,11 @@ void initData(Int_ptr& ptr, int len) * positive values (0.0, 1.0) based on their array position * (index) and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len) +void initData(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 
0.1 : 0.2 ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -410,9 +410,9 @@ void initData(Real_ptr& ptr, int len) /* * Initialize Real_type data array to constant values. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val) +void initDataConst(Real_ptr& ptr, Size_type len, Real_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -422,9 +422,9 @@ void initDataConst(Real_ptr& ptr, int len, Real_type val) /* * Initialize Index_type data array to constant values. */ -void initDataConst(Index_type*& ptr, int len, Index_type val) +void initDataConst(Index_type*& ptr, Size_type len, Index_type val) { - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = val; }; @@ -434,13 +434,13 @@ void initDataConst(Index_type*& ptr, int len, Index_type val) /* * Initialize Real_type data array with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len) +void initDataRandSign(Real_ptr& ptr, Size_type len) { Real_type factor = ( data_init_count % 2 ? 0.1 : 0.2 ); srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { Real_type signfact = Real_type(rand())/RAND_MAX; signfact = ( signfact < 0.5 ? -1.0 : 1.0 ); ptr[i] = signfact*factor*(i + 1.1)/(i + 1.12345); @@ -452,11 +452,11 @@ void initDataRandSign(Real_ptr& ptr, int len) /* * Initialize Real_type data array with random values. */ -void initDataRandValue(Real_ptr& ptr, int len) +void initDataRandValue(Real_ptr& ptr, Size_type len) { srand(4793); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = Real_type(rand())/RAND_MAX; }; @@ -466,12 +466,12 @@ void initDataRandValue(Real_ptr& ptr, int len) /* * Initialize Complex_type data array. */ -void initData(Complex_ptr& ptr, int len) +void initData(Complex_ptr& ptr, Size_type len) { Complex_type factor = ( data_init_count % 2 ? Complex_type(0.1,0.2) : Complex_type(0.2,0.3) ); - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = factor*(i + 1.1)/(i + 1.12345); } @@ -492,12 +492,12 @@ void initData(Real_type& d) /* * Calculate and return checksum for data arrays. 
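* Note: the accumulations below use Kahan-compensated summation (the ckahan / volatile t sequence) to limit rounding error in the long double checksum.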
*/ -long double calcChecksum(Int_ptr ptr, int len, +long double calcChecksum(Int_ptr ptr, Size_type len, Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { + for (Size_type j = 0; j < len; ++j) { long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; long double y = x - ckahan; volatile long double t = tchk + y; @@ -514,12 +514,12 @@ long double calcChecksum(Int_ptr ptr, int len, return tchk; } -long double calcChecksum(Real_ptr ptr, int len, +long double calcChecksum(Real_ptr ptr, Size_type len, Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { + for (Size_type j = 0; j < len; ++j) { long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; long double y = x - ckahan; volatile long double t = tchk + y; @@ -536,12 +536,12 @@ long double calcChecksum(Real_ptr ptr, int len, return tchk; } -long double calcChecksum(Complex_ptr ptr, int len, +long double calcChecksum(Complex_ptr ptr, Size_type len, Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; - for (Index_type j = 0; j < len; ++j) { + for (Size_type j = 0; j < len; ++j) { long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); long double y = x - ckahan; volatile long double t = tchk + y; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 1b233e574..fc83cbffe 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -44,12 +44,12 @@ void resetDataInitCount(); */ void incDataInitCount(); -void copyHostData(void* dst_ptr, const void* src_ptr, size_t len); +void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len); /*! * \brief Allocate data arrays. */ -void* allocHostData(size_t len, size_t align); +void* allocHostData(Size_type len, size_t align); /*! * \brief Free data arrays. @@ -60,14 +60,14 @@ void deallocHostData(void* ptr); /*! * \brief Allocate data array in dataSpace. */ -void* allocData(DataSpace dataSpace, int nbytes, int align); +void* allocData(DataSpace dataSpace, Size_type nbytes, int align); /*! * \brief Copy data from one dataSpace to another. */ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, - size_t nbytes); + Size_type nbytes); /*! * \brief Free data arrays in dataSpace. @@ -82,7 +82,7 @@ void deallocData(DataSpace dataSpace, void* ptr); * Then, two randomly-chosen entries are reset, one to * a value > 1, one to a value < -1. */ -void initData(Int_ptr& ptr, int len); +void initData(Int_ptr& ptr, Size_type len); /*! * \brief Initialize Real_type data array. @@ -91,21 +91,21 @@ void initData(Int_ptr& ptr, int len); * in the interval (0.0, 1.0) based on their array position (index) * and the order in which this method is called. */ -void initData(Real_ptr& ptr, int len); +void initData(Real_ptr& ptr, Size_type len); /*! * \brief Initialize Real_type data array. * * Array entries are set to given constant value. */ -void initDataConst(Real_ptr& ptr, int len, Real_type val); +void initDataConst(Real_ptr& ptr, Size_type len, Real_type val); /*! * \brief Initialize Index_type data array. * * Array entries are set to given constant value. */ -void initDataConst(Index_type*& ptr, int len, Index_type val); +void initDataConst(Index_type*& ptr, Size_type len, Index_type val); /*! * \brief Initialize Real_type data array with random sign. 
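Note: the int to Size_type migration in these hunks presumably exists so array lengths can exceed the 2,147,483,647 ceiling of a 32-bit int; products of extents are the usual failure mode. An illustrative example (not suite code):

  #include <cstdio>

  int main()
  {
    int ni = 50000, nj = 50000;           // a 50000 x 50000 problem
    long long bad  = ni * nj;             // int*int overflows before widening (UB)
    long long good = (long long)ni * nj;  // widen first: 2500000000
    std::printf("bad=%lld good=%lld\n", bad, good);
    return 0;
  }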
@@ -113,14 +113,14 @@ void initDataConst(Index_type*& ptr, int len, Index_type val); * Array entries are initialized in the same way as the method * initData(Real_ptr& ptr...) above, but with random sign. */ -void initDataRandSign(Real_ptr& ptr, int len); +void initDataRandSign(Real_ptr& ptr, Size_type len); /*! * \brief Initialize Real_type data array with random values. * * Array entries are initialized with random values in the interval [0.0, 1.0]. */ -void initDataRandValue(Real_ptr& ptr, int len); +void initDataRandValue(Real_ptr& ptr, Size_type len); /*! * \brief Initialize Complex_type data array. @@ -128,7 +128,7 @@ void initDataRandValue(Real_ptr& ptr, int len); * Real and imaginary array entries are initialized in the same way as the * method allocAndInitData(Real_ptr& ptr...) above. */ -void initData(Complex_ptr& ptr, int len); +void initData(Complex_ptr& ptr, Size_type len); /*! * \brief Initialize Real_type scalar data. @@ -147,13 +147,13 @@ void initData(Real_type& d); * * Checksumn is multiplied by given scale factor. */ -long double calcChecksum(Int_ptr d, int len, +long double calcChecksum(Int_ptr d, Size_type len, Real_type scale_factor); /// -long double calcChecksum(Real_ptr d, int len, +long double calcChecksum(Real_ptr d, Size_type len, Real_type scale_factor); /// -long double calcChecksum(Complex_ptr d, int len, +long double calcChecksum(Complex_ptr d, Size_type len, Real_type scale_factor); } // closing brace for detail namespace @@ -171,16 +171,16 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace); * \brief Allocate data array (ptr). */ template -inline void allocData(DataSpace dataSpace, T*& ptr_ref, int len, int align) +inline void allocData(DataSpace dataSpace, T*& ptr_ref, Size_type len, int align) { - size_t nbytes = len*sizeof(T); + Size_type nbytes = len*sizeof(T); T* ptr = static_cast(detail::allocData(dataSpace, nbytes, align)); #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) if (dataSpace == DataSpace::Omp) { // perform first touch on Omp Data #pragma omp parallel for - for (int i = 0; i < len; ++i) { + for (Size_type i = 0; i < len; ++i) { ptr[i] = T{}; }; } @@ -205,9 +205,9 @@ inline void deallocData(DataSpace dataSpace, T*& ptr) template inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, DataSpace src_dataSpace, const T* src_ptr, - int len) + Size_type len) { - size_t nbytes = len*sizeof(T); + Size_type nbytes = len*sizeof(T); detail::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, nbytes); } @@ -216,7 +216,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, */ template inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, int len, int align) + T*& ptr, Size_type len, int align) { if (new_dataSpace != old_dataSpace) { @@ -237,7 +237,7 @@ template struct AutoDataMover { AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, int len, int align) + T*& ptr, Size_type len, int align) : m_ptr(&ptr) , m_new_dataSpace(new_dataSpace) , m_old_dataSpace(old_dataSpace) @@ -284,7 +284,7 @@ struct AutoDataMover T** m_ptr; DataSpace m_new_dataSpace; DataSpace m_old_dataSpace; - int m_len; + Size_type m_len; int m_align; }; @@ -292,7 +292,7 @@ struct AutoDataMover * \brief Allocate and initialize data array. 
*/ template -inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align) +inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -310,7 +310,7 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, int len, int align) * Array entries are initialized using the method initDataConst. */ template -inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int align, +inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, int align, T val) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -328,7 +328,7 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, int len, int ali * Array is initialized using method initDataRandSign. */ template -inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int align) +inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -346,7 +346,7 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, int len, int * Array is initialized using method initDataRandValue. */ template -inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int align) +inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, int align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -361,7 +361,7 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, int len, int * Calculate and return checksum for arrays. */ template -inline long double calcChecksum(DataSpace dataSpace, T* ptr, int len, int align, +inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, int align, Real_type scale_factor) { T* check_ptr = ptr; @@ -428,9 +428,9 @@ struct RAJAPoolAllocatorHolder } /*[[nodiscard]]*/ - value_type* allocate(size_t num) + value_type* allocate(Size_type num) { - if (num > std::numeric_limits::max() / sizeof(value_type)) { + if (num > std::numeric_limits::max() / sizeof(value_type)) { throw std::bad_alloc(); } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index b3ce7c3e3..425f2f104 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -264,7 +264,7 @@ class KernelBase DataSpace getHostAccessibleDataSpace(VariantID vid) const; template - void allocData(DataSpace dataSpace, T& ptr, int len) + void allocData(DataSpace dataSpace, T& ptr, Size_type len) { rajaperf::allocData(dataSpace, ptr, len, getDataAlignment()); @@ -273,7 +273,7 @@ class KernelBase template void copyData(DataSpace dst_dataSpace, T* dst_ptr, DataSpace src_dataSpace, const T* src_ptr, - int len) + Size_type len) { rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len); } @@ -285,42 +285,42 @@ class KernelBase } template - void allocData(T*& ptr, int len, VariantID vid) + void allocData(T*& ptr, Size_type len, VariantID vid) { rajaperf::allocData(getDataSpace(vid), ptr, len, getDataAlignment()); } template - void allocAndInitData(T*& ptr, int len, VariantID vid) + void allocAndInitData(T*& ptr, Size_type len, VariantID vid) { rajaperf::allocAndInitData(getDataSpace(vid), ptr, len, getDataAlignment()); } template - void allocAndInitDataConst(T*& ptr, int len, T val, VariantID vid) + void allocAndInitDataConst(T*& ptr, Size_type len, T val, VariantID vid) { rajaperf::allocAndInitDataConst(getDataSpace(vid), ptr, len, getDataAlignment(), val); } template 
- void allocAndInitDataRandSign(T*& ptr, int len, VariantID vid) + void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid) { rajaperf::allocAndInitDataRandSign(getDataSpace(vid), ptr, len, getDataAlignment()); } template <typename T> - void allocAndInitDataRandValue(T*& ptr, int len, VariantID vid) + void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid) { rajaperf::allocAndInitDataRandValue(getDataSpace(vid), ptr, len, getDataAlignment()); } template <typename T> - rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, int len, VariantID vid) + rajaperf::AutoDataMover<T> scopedMoveData(T*& ptr, Size_type len, VariantID vid) { rajaperf::moveData(getHostAccessibleDataSpace(vid), getDataSpace(vid), ptr, len, getDataAlignment()); @@ -341,14 +341,14 @@ class KernelBase } template <typename T> - long double calcChecksum(T* ptr, int len, VariantID vid) + long double calcChecksum(T* ptr, Size_type len, VariantID vid) { return rajaperf::calcChecksum(getDataSpace(vid), ptr, len, getDataAlignment(), 1.0); } template <typename T> - long double calcChecksum(T* ptr, int len, Real_type scale_factor, VariantID vid) + long double calcChecksum(T* ptr, Size_type len, Real_type scale_factor, VariantID vid) { return rajaperf::calcChecksum(getDataSpace(vid), ptr, len, getDataAlignment(), scale_factor); diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index b86f6b7b6..4127fc1e2 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -60,6 +60,8 @@ using Index_type = RAJA::Index_type; /// using Index_ptr = Index_type*; +using Size_type = size_t; + /*! ****************************************************************************** From 1430fb1cab64f0968ab7037923c5b847018d245c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 9 Oct 2023 13:25:46 -0700 Subject: [PATCH 046/454] Fix ifdefs in MPI_HALOEXCHANGE_FUSED --- src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 2 +- src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp index 2cec72b08..5957968d6 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_CUDA) && defined(RAJA_ENABLE_TARGET_OPENMP) +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp index 8664fc314..45b898182 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_HIP) && defined(RAJA_ENABLE_TARGET_OPENMP) +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" From 3d81cb5149203c49d77d98fac90f4365fabb0ff9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 9 Oct 2023 13:27:37 -0700 Subject: [PATCH 047/454] Move HALOEXCHANGE kernels into comm Why does cmake not link in libcomm.a? --- src/CMakeLists.txt | 29 ++++---- src/apps/CMakeLists.txt | 25 ------- src/comm/CMakeLists.txt | 37 ++++++++++ src/{apps => comm}/HALOEXCHANGE-Cuda.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE-Hip.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE-OMP.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE-OMPTarget.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE-Seq.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE.cpp | 6 +- src/{apps => comm}/HALOEXCHANGE.hpp | 8 +- .../HALOEXCHANGE_FUSED-Cuda.cpp | 4 +-
src/{apps => comm}/HALOEXCHANGE_FUSED-Hip.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE_FUSED-OMP.cpp | 4 +- .../HALOEXCHANGE_FUSED-OMPTarget.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE_FUSED-Seq.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE_FUSED.cpp | 6 +- src/{apps => comm}/HALOEXCHANGE_FUSED.hpp | 8 +- src/{apps => comm}/HALOEXCHANGE_base.cpp | 4 +- src/{apps => comm}/HALOEXCHANGE_base.hpp | 8 +- src/{apps => comm}/MPI_HALOEXCHANGE-Cuda.cpp | 4 +- src/{apps => comm}/MPI_HALOEXCHANGE-Hip.cpp | 4 +- src/{apps => comm}/MPI_HALOEXCHANGE-OMP.cpp | 4 +- .../MPI_HALOEXCHANGE-OMPTarget.cpp | 4 +- src/{apps => comm}/MPI_HALOEXCHANGE-Seq.cpp | 4 +- src/{apps => comm}/MPI_HALOEXCHANGE.cpp | 6 +- src/{apps => comm}/MPI_HALOEXCHANGE.hpp | 8 +- .../MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 4 +- .../MPI_HALOEXCHANGE_FUSED-Hip.cpp | 4 +- .../MPI_HALOEXCHANGE_FUSED-OMP.cpp | 4 +- .../MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 4 +- .../MPI_HALOEXCHANGE_FUSED-Seq.cpp | 4 +- src/{apps => comm}/MPI_HALOEXCHANGE_FUSED.cpp | 6 +- src/{apps => comm}/MPI_HALOEXCHANGE_FUSED.hpp | 8 +- src/common/Executor.cpp | 4 +- src/common/RAJAPerfSuite.cpp | 73 +++++++++++-------- src/common/RAJAPerfSuite.hpp | 17 +++-- 36 files changed, 183 insertions(+), 150 deletions(-) create mode 100644 src/comm/CMakeLists.txt rename src/{apps => comm}/HALOEXCHANGE-Cuda.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE-Hip.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE-OMP.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE-OMPTarget.cpp (98%) rename src/{apps => comm}/HALOEXCHANGE-Seq.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE.cpp (94%) rename src/{apps => comm}/HALOEXCHANGE.hpp (95%) rename src/{apps => comm}/HALOEXCHANGE_FUSED-Cuda.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_FUSED-Hip.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_FUSED-OMP.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_FUSED-OMPTarget.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_FUSED-Seq.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_FUSED.cpp (94%) rename src/{apps => comm}/HALOEXCHANGE_FUSED.hpp (97%) rename src/{apps => comm}/HALOEXCHANGE_base.cpp (99%) rename src/{apps => comm}/HALOEXCHANGE_base.hpp (97%) rename src/{apps => comm}/MPI_HALOEXCHANGE-Cuda.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE-Hip.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE-OMP.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE-OMPTarget.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE-Seq.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE.cpp (97%) rename src/{apps => comm}/MPI_HALOEXCHANGE.hpp (96%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED-Cuda.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED-Hip.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED-OMP.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED-Seq.cpp (99%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED.cpp (97%) rename src/{apps => comm}/MPI_HALOEXCHANGE_FUSED.hpp (97%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 47b6f4b44..bbfbef48e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ add_subdirectory(polybench) add_subdirectory(stream) add_subdirectory(stream-kokkos) add_subdirectory(algorithm) +add_subdirectory(comm) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common @@ -29,7 +30,8 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS polybench stream stream-kokkos - algorithm) + algorithm + comm) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) 
if(RAJA_ENABLE_TARGET_OPENMP) @@ -57,18 +59,6 @@ blt_add_executable( apps/PRESSURE.cpp apps/PRESSURE-Seq.cpp apps/PRESSURE-OMPTarget.cpp - apps/HALOEXCHANGE.cpp - apps/HALOEXCHANGE-Seq.cpp - apps/HALOEXCHANGE-OMPTarget.cpp - apps/HALOEXCHANGE_FUSED.cpp - apps/HALOEXCHANGE_FUSED-Seq.cpp - apps/HALOEXCHANGE_FUSED-OMPTarget.cpp - apps/MPI_HALOEXCHANGE.cpp - apps/MPI_HALOEXCHANGE-Seq.cpp - apps/MPI_HALOEXCHANGE-OMPTarget.cpp - apps/MPI_HALOEXCHANGE_FUSED.cpp - apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp - apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp apps/LTIMES.cpp apps/LTIMES-Seq.cpp apps/LTIMES-OMPTarget.cpp @@ -254,6 +244,19 @@ blt_add_executable( algorithm/MEMCPY.cpp algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp + comm/HALOEXCHANGE_base.cpp + comm/HALOEXCHANGE.cpp + comm/HALOEXCHANGE-Seq.cpp + comm/HALOEXCHANGE-OMPTarget.cpp + comm/HALOEXCHANGE_FUSED.cpp + comm/HALOEXCHANGE_FUSED-Seq.cpp + comm/HALOEXCHANGE_FUSED-OMPTarget.cpp + comm/MPI_HALOEXCHANGE.cpp + comm/MPI_HALOEXCHANGE-Seq.cpp + comm/MPI_HALOEXCHANGE-OMPTarget.cpp + comm/MPI_HALOEXCHANGE_FUSED.cpp + comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp + comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) install( TARGETS raja-perf-omptarget.exe diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 6c91c8d81..9e35bef84 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -45,31 +45,6 @@ blt_add_library( FIR-Cuda.cpp FIR-OMP.cpp FIR-OMPTarget.cpp - HALOEXCHANGE_base.cpp - HALOEXCHANGE.cpp - HALOEXCHANGE-Seq.cpp - HALOEXCHANGE-Hip.cpp - HALOEXCHANGE-Cuda.cpp - HALOEXCHANGE-OMP.cpp - HALOEXCHANGE-OMPTarget.cpp - HALOEXCHANGE_FUSED.cpp - HALOEXCHANGE_FUSED-Seq.cpp - HALOEXCHANGE_FUSED-Hip.cpp - HALOEXCHANGE_FUSED-Cuda.cpp - HALOEXCHANGE_FUSED-OMP.cpp - HALOEXCHANGE_FUSED-OMPTarget.cpp - MPI_HALOEXCHANGE.cpp - MPI_HALOEXCHANGE-Seq.cpp - MPI_HALOEXCHANGE-Hip.cpp - MPI_HALOEXCHANGE-Cuda.cpp - MPI_HALOEXCHANGE-OMP.cpp - MPI_HALOEXCHANGE-OMPTarget.cpp - MPI_HALOEXCHANGE_FUSED.cpp - MPI_HALOEXCHANGE_FUSED-Seq.cpp - MPI_HALOEXCHANGE_FUSED-Hip.cpp - MPI_HALOEXCHANGE_FUSED-Cuda.cpp - MPI_HALOEXCHANGE_FUSED-OMP.cpp - MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt new file mode 100644 index 000000000..3b99decdf --- /dev/null +++ b/src/comm/CMakeLists.txt @@ -0,0 +1,37 @@ +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME comm + SOURCES HALOEXCHANGE_base.cpp + HALOEXCHANGE.cpp + HALOEXCHANGE-Seq.cpp + HALOEXCHANGE-Hip.cpp + HALOEXCHANGE-Cuda.cpp + HALOEXCHANGE-OMP.cpp + HALOEXCHANGE-OMPTarget.cpp + HALOEXCHANGE_FUSED.cpp + HALOEXCHANGE_FUSED-Seq.cpp + HALOEXCHANGE_FUSED-Hip.cpp + HALOEXCHANGE_FUSED-Cuda.cpp + HALOEXCHANGE_FUSED-OMP.cpp + HALOEXCHANGE_FUSED-OMPTarget.cpp + MPI_HALOEXCHANGE.cpp + MPI_HALOEXCHANGE-Seq.cpp + MPI_HALOEXCHANGE-Hip.cpp + MPI_HALOEXCHANGE-Cuda.cpp + MPI_HALOEXCHANGE-OMP.cpp + MPI_HALOEXCHANGE-OMPTarget.cpp + MPI_HALOEXCHANGE_FUSED.cpp + MPI_HALOEXCHANGE_FUSED-Seq.cpp + MPI_HALOEXCHANGE_FUSED-Hip.cpp + MPI_HALOEXCHANGE_FUSED-Cuda.cpp + MPI_HALOEXCHANGE_FUSED-OMP.cpp + MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/comm/HALOEXCHANGE-Cuda.cpp similarity index 99% rename from src/apps/HALOEXCHANGE-Cuda.cpp rename to src/comm/HALOEXCHANGE-Cuda.cpp index b5309ef05..1158cb4cc 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALOEXCHANGE-Cuda.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > @@ -146,7 +146,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/comm/HALOEXCHANGE-Hip.cpp similarity index 99% rename from src/apps/HALOEXCHANGE-Hip.cpp rename to src/comm/HALOEXCHANGE-Hip.cpp index 660bad32b..c190c262e 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALOEXCHANGE-Hip.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > @@ -148,7 +148,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE-OMP.cpp b/src/comm/HALOEXCHANGE-OMP.cpp similarity index 99% rename from src/apps/HALOEXCHANGE-OMP.cpp rename to src/comm/HALOEXCHANGE-OMP.cpp index a62bdbd0a..823ad5940 100644 --- a/src/apps/HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALOEXCHANGE-OMP.cpp @@ -14,7 +14,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -168,5 +168,5 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALOEXCHANGE-OMPTarget.cpp similarity index 98% rename from src/apps/HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALOEXCHANGE-OMPTarget.cpp index 67e40edb7..007a09f1c 100644 --- a/src/apps/HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALOEXCHANGE-OMPTarget.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -118,7 +118,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE-Seq.cpp b/src/comm/HALOEXCHANGE-Seq.cpp similarity index 99% rename from src/apps/HALOEXCHANGE-Seq.cpp rename to src/comm/HALOEXCHANGE-Seq.cpp index 
a589cf3bb..aa444af74 100644 --- a/src/apps/HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALOEXCHANGE-Seq.cpp @@ -14,7 +14,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -161,5 +161,5 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.cpp b/src/comm/HALOEXCHANGE.cpp similarity index 94% rename from src/apps/HALOEXCHANGE.cpp rename to src/comm/HALOEXCHANGE.cpp index 28f7952ef..58bc5defc 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/comm/HALOEXCHANGE.cpp @@ -14,11 +14,11 @@ namespace rajaperf { -namespace apps +namespace comm { HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Apps_HALOEXCHANGE, params) + : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE, params) { setUsesFeature(Forall); @@ -67,5 +67,5 @@ void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp similarity index 95% rename from src/apps/HALOEXCHANGE.hpp rename to src/comm/HALOEXCHANGE.hpp index 602db0efd..df260d31e 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -42,8 +42,8 @@ /// } /// -#ifndef RAJAPerf_Apps_HALOEXCHANGE_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_HPP +#ifndef RAJAPerf_Comm_HALOEXCHANGE_HPP +#define RAJAPerf_Comm_HALOEXCHANGE_HPP #define HALOEXCHANGE_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ @@ -57,7 +57,7 @@ namespace rajaperf { -namespace apps +namespace comm { class HALOEXCHANGE : public HALOEXCHANGE_base @@ -91,7 +91,7 @@ class HALOEXCHANGE : public HALOEXCHANGE_base std::vector m_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALOEXCHANGE_FUSED-Cuda.cpp index e1dd9690e..ffa28ae35 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ @@ -246,7 +246,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/HALOEXCHANGE_FUSED-Hip.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_FUSED-Hip.cpp rename to src/comm/HALOEXCHANGE_FUSED-Hip.cpp index 9ba64799f..fd6e3253e 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Hip.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ @@ -250,7 +250,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALOEXCHANGE_FUSED-OMP.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALOEXCHANGE_FUSED-OMP.cpp index 
124aa1a88..b5562ad2a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-OMP.cpp @@ -14,7 +14,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -311,5 +311,5 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp index d7f2ad9c9..85141315a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -233,7 +233,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALOEXCHANGE_FUSED-Seq.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/HALOEXCHANGE_FUSED-Seq.cpp index 69830a66a..b6b8c2e27 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Seq.cpp @@ -14,7 +14,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -236,5 +236,5 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.cpp b/src/comm/HALOEXCHANGE_FUSED.cpp similarity index 94% rename from src/apps/HALOEXCHANGE_FUSED.cpp rename to src/comm/HALOEXCHANGE_FUSED.cpp index 2ecf98d73..bc4a9e9e2 100644 --- a/src/apps/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOEXCHANGE_FUSED.cpp @@ -14,11 +14,11 @@ namespace rajaperf { -namespace apps +namespace comm { HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Apps_HALOEXCHANGE_FUSED, params) + : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) { setUsesFeature(Workgroup); @@ -67,5 +67,5 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp similarity index 97% rename from src/apps/HALOEXCHANGE_FUSED.hpp rename to src/comm/HALOEXCHANGE_FUSED.hpp index 85c710ac8..62d709599 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -42,8 +42,8 @@ /// } /// -#ifndef RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP +#define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP #define HALOEXCHANGE_FUSED_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ @@ -101,7 +101,7 @@ namespace rajaperf { -namespace apps +namespace comm { class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base @@ -135,7 +135,7 @@ class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base std::vector m_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // closing endif for header file include guard diff --git a/src/apps/HALOEXCHANGE_base.cpp b/src/comm/HALOEXCHANGE_base.cpp similarity index 99% rename from src/apps/HALOEXCHANGE_base.cpp rename to src/comm/HALOEXCHANGE_base.cpp index 9e9769f37..2d4573a6b 100644 --- 
a/src/apps/HALOEXCHANGE_base.cpp +++ b/src/comm/HALOEXCHANGE_base.cpp @@ -16,7 +16,7 @@ namespace rajaperf { -namespace apps +namespace comm { HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) @@ -345,5 +345,5 @@ void HALOEXCHANGE_base::destroy_lists( } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_base.hpp b/src/comm/HALOEXCHANGE_base.hpp similarity index 97% rename from src/apps/HALOEXCHANGE_base.hpp rename to src/comm/HALOEXCHANGE_base.hpp index 6915f16d2..5864f2f1a 100644 --- a/src/apps/HALOEXCHANGE_base.hpp +++ b/src/comm/HALOEXCHANGE_base.hpp @@ -42,8 +42,8 @@ /// } /// -#ifndef RAJAPerf_Apps_HALOEXCHANGE_base_HPP -#define RAJAPerf_Apps_HALOEXCHANGE_base_HPP +#ifndef RAJAPerf_Comm_HALOEXCHANGE_base_HPP +#define RAJAPerf_Comm_HALOEXCHANGE_base_HPP #define HALOEXCHANGE_base_DATA_SETUP \ std::vector vars = m_vars; \ @@ -74,7 +74,7 @@ namespace rajaperf { class RunParams; -namespace apps +namespace comm { class HALOEXCHANGE_base : public KernelBase @@ -160,7 +160,7 @@ class HALOEXCHANGE_base : public KernelBase VariantID vid); }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // closing endif for header file include guard diff --git a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE-Cuda.cpp rename to src/comm/MPI_HALOEXCHANGE-Cuda.cpp index 95142e033..33cc728cf 100644 --- a/src/apps/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > @@ -198,7 +198,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MPI_HALOEXCHANGE-Hip.cpp b/src/comm/MPI_HALOEXCHANGE-Hip.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE-Hip.cpp rename to src/comm/MPI_HALOEXCHANGE-Hip.cpp index 52d73f498..7a46f0977 100644 --- a/src/apps/MPI_HALOEXCHANGE-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Hip.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { template < size_t block_size > @@ -200,7 +200,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/MPI_HALOEXCHANGE-OMP.cpp b/src/comm/MPI_HALOEXCHANGE-OMP.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE-OMP.cpp rename to src/comm/MPI_HALOEXCHANGE-OMP.cpp index 915c1c071..636a57f8a 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMP.cpp +++ b/src/comm/MPI_HALOEXCHANGE-OMP.cpp @@ -16,7 +16,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -248,7 +248,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp rename to src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp index f64ca80d1..9b47d1c64 100644 --- a/src/apps/MPI_HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp @@ -18,7 +18,7 @@ 
namespace rajaperf { -namespace apps +namespace comm { // @@ -170,7 +170,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MPI_HALOEXCHANGE-Seq.cpp b/src/comm/MPI_HALOEXCHANGE-Seq.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE-Seq.cpp rename to src/comm/MPI_HALOEXCHANGE-Seq.cpp index b11acebd4..2e952485f 100644 --- a/src/apps/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Seq.cpp @@ -16,7 +16,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -241,7 +241,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp similarity index 97% rename from src/apps/MPI_HALOEXCHANGE.cpp rename to src/comm/MPI_HALOEXCHANGE.cpp index 158221e7e..9535edfbf 100644 --- a/src/apps/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -16,11 +16,11 @@ namespace rajaperf { -namespace apps +namespace comm { MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE, params) + : HALOEXCHANGE_base(rajaperf::Comm_MPI_HALOEXCHANGE, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -117,7 +117,7 @@ void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp similarity index 96% rename from src/apps/MPI_HALOEXCHANGE.hpp rename to src/comm/MPI_HALOEXCHANGE.hpp index 981c15a24..806fa8063 100644 --- a/src/apps/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -42,8 +42,8 @@ /// } /// -#ifndef RAJAPerf_Apps_MPI_HALOEXCHANGE_HPP -#define RAJAPerf_Apps_MPI_HALOEXCHANGE_HPP +#ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP +#define RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP #define MPI_HALOEXCHANGE_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ @@ -75,7 +75,7 @@ namespace rajaperf { -namespace apps +namespace comm { class MPI_HALOEXCHANGE : public HALOEXCHANGE_base @@ -117,7 +117,7 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base std::vector m_recv_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp index 5957968d6..0e3c9b1ea 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { #define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ @@ -304,7 +304,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE_FUSED, Cuda) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp index 45b898182..ae47dea02 100644 --- 
a/src/apps/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { #define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ @@ -308,7 +308,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE_FUSED, Hip) -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_HIP diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp index 934613141..56d42b59f 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -16,7 +16,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -396,7 +396,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU #endif } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp index 9e9034c04..875b0afcc 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -18,7 +18,7 @@ namespace rajaperf { -namespace apps +namespace comm { // @@ -291,7 +291,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE } } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp similarity index 99% rename from src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp index cb27cc440..b8ccba2b3 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp @@ -16,7 +16,7 @@ namespace rajaperf { -namespace apps +namespace comm { @@ -325,7 +325,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp similarity index 97% rename from src/apps/MPI_HALOEXCHANGE_FUSED.cpp rename to src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 8ddf264c2..66362e465 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -16,11 +16,11 @@ namespace rajaperf { -namespace apps +namespace comm { MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Apps_MPI_HALOEXCHANGE_FUSED, params) + : HALOEXCHANGE_base(rajaperf::Comm_MPI_HALOEXCHANGE_FUSED, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -117,7 +117,7 @@ void MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/apps/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp similarity index 97% rename from src/apps/MPI_HALOEXCHANGE_FUSED.hpp rename to src/comm/MPI_HALOEXCHANGE_FUSED.hpp index ab716aec9..2d87312cb 100644 --- a/src/apps/MPI_HALOEXCHANGE_FUSED.hpp +++ 
b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -42,8 +42,8 @@ /// } /// -#ifndef RAJAPerf_Apps_MPI_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Apps_MPI_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP +#define RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP #define MPI_HALOEXCHANGE_FUSED_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ @@ -116,7 +116,7 @@ namespace rajaperf { -namespace apps +namespace comm { class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base @@ -158,7 +158,7 @@ class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base std::vector m_recv_buffers; }; -} // end namespace apps +} // end namespace comm } // end namespace rajaperf #endif diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 24e483b59..f4b15706f 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -24,7 +24,7 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" -#include "apps/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALOEXCHANGE_FUSED.hpp" #include #include @@ -674,7 +674,7 @@ void Executor::runWarmupKernels() kernel_ids.insert(Basic_INDEXLIST_3LOOP); break; case Workgroup: - kernel_ids.insert(Apps_HALOEXCHANGE_FUSED); break; + kernel_ids.insert(Comm_HALOEXCHANGE_FUSED); break; case Reduction: kernel_ids.insert(Basic_REDUCE3_INT); break; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 0d6b698ae..535a59ad0 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -86,12 +86,6 @@ #include "apps/EDGE3D.hpp" #include "apps/ENERGY.hpp" #include "apps/FIR.hpp" -#include "apps/HALOEXCHANGE.hpp" -#include "apps/HALOEXCHANGE_FUSED.hpp" -#if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include "apps/MPI_HALOEXCHANGE.hpp" -#include "apps/MPI_HALOEXCHANGE_FUSED.hpp" -#endif #include "apps/LTIMES.hpp" #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DEA.hpp" @@ -111,6 +105,16 @@ #include "algorithm/MEMSET.hpp" #include "algorithm/MEMCPY.hpp" +// +// Comm kernels... +// +#include "comm/HALOEXCHANGE.hpp" +#include "comm/HALOEXCHANGE_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/MPI_HALOEXCHANGE.hpp" +#include "comm/MPI_HALOEXCHANGE_FUSED.hpp" +#endif + #include @@ -137,6 +141,7 @@ static const std::string GroupNames [] = std::string("Stream"), std::string("Apps"), std::string("Algorithm"), + std::string("Comm"), std::string("Unknown Group") // Keep this at the end and DO NOT remove.... @@ -230,12 +235,6 @@ static const std::string KernelNames [] = std::string("Apps_EDGE3D"), std::string("Apps_ENERGY"), std::string("Apps_FIR"), - std::string("Apps_HALOEXCHANGE"), - std::string("Apps_HALOEXCHANGE_FUSED"), -#if defined(RAJA_PERFSUITE_ENABLE_MPI) - std::string("Apps_MPI_HALOEXCHANGE"), - std::string("Apps_MPI_HALOEXCHANGE_FUSED"), -#endif std::string("Apps_LTIMES"), std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), @@ -255,6 +254,16 @@ static const std::string KernelNames [] = std::string("Algorithm_MEMSET"), std::string("Algorithm_MEMCPY"), +// +// Comm kernels... +// + std::string("Comm_HALOEXCHANGE"), + std::string("Comm_HALOEXCHANGE_FUSED"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("Comm_MPI_HALOEXCHANGE"), + std::string("Comm_MPI_HALOEXCHANGE_FUSED"), +#endif + std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... 
}; // END KernelNames @@ -903,24 +912,6 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::FIR(run_params); break; } - case Apps_HALOEXCHANGE : { - kernel = new apps::HALOEXCHANGE(run_params); - break; - } - case Apps_HALOEXCHANGE_FUSED : { - kernel = new apps::HALOEXCHANGE_FUSED(run_params); - break; - } -#if defined(RAJA_PERFSUITE_ENABLE_MPI) - case Apps_MPI_HALOEXCHANGE : { - kernel = new apps::MPI_HALOEXCHANGE(run_params); - break; - } - case Apps_MPI_HALOEXCHANGE_FUSED : { - kernel = new apps::MPI_HALOEXCHANGE_FUSED(run_params); - break; - } -#endif case Apps_LTIMES : { kernel = new apps::LTIMES(run_params); break; @@ -982,6 +973,28 @@ KernelBase* getKernelObject(KernelID kid, break; } +// +// Comm kernels... +// + case Comm_HALOEXCHANGE : { + kernel = new comm::HALOEXCHANGE(run_params); + break; + } + case Comm_HALOEXCHANGE_FUSED : { + kernel = new comm::HALOEXCHANGE_FUSED(run_params); + break; + } +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + case Comm_MPI_HALOEXCHANGE : { + kernel = new comm::MPI_HALOEXCHANGE(run_params); + break; + } + case Comm_MPI_HALOEXCHANGE_FUSED : { + kernel = new comm::MPI_HALOEXCHANGE_FUSED(run_params); + break; + } +#endif + default: { getCout() << "\n Unknown Kernel ID = " << kid << std::endl; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 4b8269aa3..f9ce64902 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -52,6 +52,7 @@ enum GroupID { Stream, Apps, Algorithm, + Comm, NumGroups // Keep this one last and DO NOT remove (!!) @@ -145,12 +146,6 @@ enum KernelID { Apps_EDGE3D, Apps_ENERGY, Apps_FIR, - Apps_HALOEXCHANGE, - Apps_HALOEXCHANGE_FUSED, -#if defined(RAJA_PERFSUITE_ENABLE_MPI) - Apps_MPI_HALOEXCHANGE, - Apps_MPI_HALOEXCHANGE_FUSED, -#endif Apps_LTIMES, Apps_LTIMES_NOVIEW, Apps_MASS3DEA, @@ -170,6 +165,16 @@ enum KernelID { Algorithm_MEMSET, Algorithm_MEMCPY, +// +// Comm kernels... +// + Comm_HALOEXCHANGE, + Comm_HALOEXCHANGE_FUSED, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + Comm_MPI_HALOEXCHANGE, + Comm_MPI_HALOEXCHANGE_FUSED, +#endif + NumKernels // Keep this one last and NEVER comment out (!!) }; From 8d7b4048ed7ac4f97cf08a5e904fcac33edb1811 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 9 Oct 2023 15:47:39 -0700 Subject: [PATCH 048/454] Fixup MPI feature Add the string conversion. Add the warmup kernel. --- src/common/Executor.cpp | 10 +++++++++- src/common/RAJAPerfSuite.cpp | 4 ++++ src/common/RAJAPerfSuite.hpp | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index f4b15706f..75a4d3f1d 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -25,6 +25,9 @@ #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" #include "comm/HALOEXCHANGE_FUSED.hpp" +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/MPI_HALOEXCHANGE_FUSED.hpp" +#endif #include #include @@ -684,7 +687,12 @@ void Executor::runWarmupKernels() case View: break; - + +#ifdef RAJA_PERFSUITE_ENABLE_MPI + case MPI: + kernel_ids.insert(Comm_MPI_HALOEXCHANGE_FUSED); break; +#endif + default: break; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 535a59ad0..a21713a1d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -338,6 +338,10 @@ static const std::string FeatureNames [] = std::string("View"), +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("MPI"), +#endif + std::string("Unknown Feature") // Keep this at the end and DO NOT remove.... 
}; // END FeatureNames diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index f9ce64902..3d5b4000d 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -247,7 +247,9 @@ enum FeatureID { View, +#if defined(RAJA_PERFSUITE_ENABLE_MPI) MPI, +#endif NumFeatures // Keep this one last and NEVER comment out (!!) From b60474081d367d9208bab23a6f63578b43e062ae Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 10 Oct 2023 12:50:06 -0700 Subject: [PATCH 049/454] Add missing target for test executable --- test/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 001c81190..c2d21d81d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,7 +13,8 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS lcals polybench stream - algorithm) + algorithm + comm) list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) raja_add_test( From 96c3bc2398fa79ba41df575b4cd009ecec49321b Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Tue, 10 Oct 2023 15:16:14 -0700 Subject: [PATCH 050/454] Adding Size_type to a few more places --- src/common/CudaDataUtils.hpp | 8 ++++---- src/common/DataUtils.cpp | 4 ++-- src/common/DataUtils.hpp | 22 +++++++++++----------- src/common/HipDataUtils.hpp | 16 ++++++++-------- src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 2 +- src/common/OpenMPTargetDataUtils.hpp | 8 ++++---- src/common/RPTypes.hpp | 8 ++++++++ src/common/RunParams.hpp | 5 +++-- 9 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 7907b7286..4f5741e39 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -124,7 +124,7 @@ int getCudaOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size) /* * Copy memory len bytes from src to dst. */ -inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) +inline void copyCudaData(void* dst_ptr, const void* src_ptr, Size_type len) { cudaErrchk( cudaMemcpy( dst_ptr, src_ptr, len, cudaMemcpyDefault ) ); @@ -133,7 +133,7 @@ inline void copyCudaData(void* dst_ptr, const void* src_ptr, size_t len) /*! * \brief Allocate CUDA device data array (dptr). */ -inline void* allocCudaDeviceData(size_t len) +inline void* allocCudaDeviceData(Size_type len) { void* dptr = nullptr; cudaErrchk( cudaMalloc( &dptr, len ) ); @@ -143,7 +143,7 @@ inline void* allocCudaDeviceData(size_t len) /*! * \brief Allocate CUDA managed data array (dptr). */ -inline void* allocCudaManagedData(size_t len) +inline void* allocCudaManagedData(Size_type len) { void* mptr = nullptr; cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); @@ -153,7 +153,7 @@ inline void* allocCudaManagedData(size_t len) /*! * \brief Allocate CUDA pinned data array (pptr). */ -inline void* allocCudaPinnedData(size_t len) +inline void* allocCudaPinnedData(Size_type len) { void* pptr = nullptr; cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) ); diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 8097a603b..7062f689c 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -132,7 +132,7 @@ void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len) /* * Allocate data arrays of given type. 
*/ -void* allocHostData(Size_type len, size_t align) +void* allocHostData(Size_type len, Size_type align) { return RAJA::allocate_aligned_type( align, len); @@ -153,7 +153,7 @@ void deallocHostData(void* ptr) /* * Allocate data arrays of given dataSpace. */ -void* allocData(DataSpace dataSpace, Size_type nbytes, int align) +void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) { void* ptr = nullptr; diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index fc83cbffe..67a612c83 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -49,7 +49,7 @@ void copyHostData(void* dst_ptr, const void* src_ptr, Size_type len); /*! * \brief Allocate data arrays. */ -void* allocHostData(Size_type len, size_t align); +void* allocHostData(Size_type len, Size_type align); /*! * \brief Free data arrays. @@ -60,7 +60,7 @@ void deallocHostData(void* ptr); /*! * \brief Allocate data array in dataSpace. */ -void* allocData(DataSpace dataSpace, Size_type nbytes, int align); +void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align); /*! * \brief Copy data from one dataSpace to another. @@ -171,7 +171,7 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace); * \brief Allocate data array (ptr). */ template -inline void allocData(DataSpace dataSpace, T*& ptr_ref, Size_type len, int align) +inline void allocData(DataSpace dataSpace, T*& ptr_ref, Size_type len, Size_type align) { Size_type nbytes = len*sizeof(T); T* ptr = static_cast(detail::allocData(dataSpace, nbytes, align)); @@ -216,7 +216,7 @@ inline void copyData(DataSpace dst_dataSpace, T* dst_ptr, */ template inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, Size_type len, int align) + T*& ptr, Size_type len, Size_type align) { if (new_dataSpace != old_dataSpace) { @@ -237,7 +237,7 @@ template struct AutoDataMover { AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, Size_type len, int align) + T*& ptr, Size_type len, Size_type align) : m_ptr(&ptr) , m_new_dataSpace(new_dataSpace) , m_old_dataSpace(old_dataSpace) @@ -285,14 +285,14 @@ struct AutoDataMover DataSpace m_new_dataSpace; DataSpace m_old_dataSpace; Size_type m_len; - int m_align; + Size_type m_align; }; /*! * \brief Allocate and initialize data array. */ template -inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, int align) +inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -310,7 +310,7 @@ inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, int al * Array entries are initialized using the method initDataConst. */ template -inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, int align, +inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align, T val) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -328,7 +328,7 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, i * Array is initialized using method initDataRandSign. 
*/ template -inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, int align) +inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -346,7 +346,7 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len * Array is initialized using method initDataRandValue. */ template -inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, int align) +inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); @@ -361,7 +361,7 @@ inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type le * Calculate and return checksum for arrays. */ template -inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, int align, +inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size_type align, Real_type scale_factor) { T* check_ptr = ptr; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index dbfc31c2a..ad23ae5d5 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -111,7 +111,7 @@ int getHipOccupancyMaxBlocks(Func&& func, int num_threads, size_t shmem_size) /* * Copy memory len bytes from src to dst. */ -inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len) +inline void copyHipData(void* dst_ptr, const void* src_ptr, Size_type len) { hipErrchk( hipMemcpy( dst_ptr, src_ptr, len, hipMemcpyDefault ) ); @@ -120,7 +120,7 @@ inline void copyHipData(void* dst_ptr, const void* src_ptr, size_t len) /*! * \brief Allocate HIP device data array (dptr). */ -inline void* allocHipDeviceData(size_t len) +inline void* allocHipDeviceData(Size_type len) { void* dptr = nullptr; hipErrchk( hipMalloc( &dptr, len ) ); @@ -130,7 +130,7 @@ inline void* allocHipDeviceData(size_t len) /*! * \brief Allocate HIP fine-grained device data array (dfptr). */ -inline void* allocHipDeviceFineData(size_t len) +inline void* allocHipDeviceFineData(Size_type len) { void* dfptr = nullptr; hipErrchk( hipExtMallocWithFlags( &dfptr, len, @@ -141,7 +141,7 @@ inline void* allocHipDeviceFineData(size_t len) /*! * \brief Allocate HIP managed data array (mptr). */ -inline void* allocHipManagedData(size_t len) +inline void* allocHipManagedData(Size_type len) { void* mptr = nullptr; hipErrchk( hipMallocManaged( &mptr, len, @@ -152,7 +152,7 @@ inline void* allocHipManagedData(size_t len) /*! * \brief Allocate HIP pinned data array (pptr). */ -inline void* allocHipPinnedData(size_t len) +inline void* allocHipPinnedData(Size_type len) { void* pptr = nullptr; hipErrchk( hipHostMalloc( &pptr, len, @@ -163,7 +163,7 @@ inline void* allocHipPinnedData(size_t len) /*! * \brief Allocate HIP fine-grained pinned data array (pfptr). */ -inline void* allocHipPinnedFineData(size_t len) +inline void* allocHipPinnedFineData(Size_type len) { void* pfptr = nullptr; hipErrchk( hipHostMalloc( &pfptr, len, @@ -174,7 +174,7 @@ inline void* allocHipPinnedFineData(size_t len) /*! * \brief Allocate HIP coarse-grained pinned data array (pcptr). */ -inline void* allocHipPinnedCoarseData(size_t len) +inline void* allocHipPinnedCoarseData(Size_type len) { void* pcptr = nullptr; hipErrchk( hipHostMalloc( &pcptr, len, @@ -185,7 +185,7 @@ inline void* allocHipPinnedCoarseData(size_t len) /*! * \brief Apply mem advice to HIP data array (ptr). 
*/ -inline void adviseHipData(void* ptr, int len, hipMemoryAdvise advice, int device) +inline void adviseHipData(void* ptr, size_t len, hipMemoryAdvise advice, int device) { hipErrchk( hipMemAdvise( ptr, len, advice, device ) ); } diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 646c9bd8d..b08fa179e 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -194,7 +194,7 @@ void KernelBase::setVariantDefined(VariantID vid) #endif } -int KernelBase::getDataAlignment() const +Size_type KernelBase::getDataAlignment() const { return run_params.getDataAlignment(); } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 425f2f104..08be3d71e 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -258,7 +258,7 @@ class KernelBase #endif } - int getDataAlignment() const; + Size_type getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; DataSpace getHostAccessibleDataSpace(VariantID vid) const; diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index b5c98cb97..b24cbd7c4 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -47,7 +47,7 @@ namespace detail /* * Copy memory len bytes from src to dst. */ -inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, +inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, Size_type len, int dst_did, int src_did) { omp_target_memcpy( dst_ptr, const_cast<void*>(src_ptr), len, @@ -58,7 +58,7 @@ inline void copyOpenMPTargetData(void* dst_ptr, const void* src_ptr, size_t len, * \brief Allocate device data array (dptr) and copy given hptr (host) * data to device array. */ -inline void* allocOpenMPDeviceData(size_t len, +inline void* allocOpenMPDeviceData(Size_type len, int did = getOpenMPTargetDevice()) { return omp_target_alloc( len, did); @@ -83,7 +83,7 @@ inline void deallocOpenMPDeviceData(void* dptr, * and of proper size for copy operation to succeed. */ template <typename T> -void initOpenMPDeviceData(T* dptr, const T* hptr, int len, +void initOpenMPDeviceData(T* dptr, const T* hptr, Size_type len, int did = getOpenMPTargetDevice(), int hid = getOpenMPTargetHost()) { @@ -97,7 +97,7 @@ void initOpenMPDeviceData(T* dptr, const T* hptr, int len, * and of proper size for copy operation to succeed. */ template <typename T> -void getOpenMPDeviceData(T* hptr, const T* dptr, int len, +void getOpenMPDeviceData(T* hptr, const T* dptr, Size_type len, int hid = getOpenMPTargetHost(), int did = getOpenMPTargetDevice()) { diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index 4127fc1e2..37e4cb3dd 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -60,6 +60,14 @@ using Index_type = RAJA::Index_type; /// using Index_ptr = Index_type*; + +/*! + ****************************************************************************** + * + * \brief Type used for sizing allocations.
+ * + ****************************************************************************** + */ using Size_type = size_t; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 6d6402f6a..63c071069 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -15,6 +15,7 @@ #include #include "RAJAPerfSuite.hpp" +#include "RPTypes.hpp" namespace rajaperf { @@ -119,7 +120,7 @@ class RunParams { double getSizeFactor() const { return size_factor; } - size_t getDataAlignment() const { return data_alignment; } + Size_type getDataAlignment() const { return data_alignment; } int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } @@ -211,7 +212,7 @@ class RunParams { SizeMeaning size_meaning; /*!< meaning of size value */ double size; /*!< kernel size to run (input option) */ double size_factor; /*!< default kernel size multiplier (input option) */ - size_t data_alignment; + Size_type data_alignment; int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector<size_t> gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ From fd4905cf2f70319454c2a5418648a5f296c1dbcb Mon Sep 17 00:00:00 2001 From: Sean Miller Date: Tue, 10 Oct 2023 09:08:58 -0700 Subject: [PATCH 051/454] Adding fix for Base_HIP Basic_INDEXLIST benchmark to prevent hangs --- src/basic/INDEXLIST-Hip.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 5d36e20aa..9a3309d9d 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -189,12 +189,6 @@ __device__ void grid_scan(const int block_id, exclusive[ti] = prev_grid_count + exclusive[ti]; inclusive[ti] = prev_grid_count + inclusive[ti]; } - - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } } } @@ -270,12 +264,12 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) allocData(DataSpace::HipDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::HipDevice, block_readys, grid_size); - hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); indexlist <<>>( x+ibegin, list+ibegin, From 4e20e1375392a2f9b590b29e712153bdca9846b0 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 11 Oct 2023 08:23:57 -0700 Subject: [PATCH 052/454] Fix buffer use in MPI_HALOEXCHANGE_FUSED --- src/comm/MPI_HALOEXCHANGE.hpp | 4 ++-- src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp | 4 ++-- src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp index 806fa8063..bf7fef72a 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -11,7 +11,7 @@ /// /// // pack message for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; /// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable @@ -28,7 +28,7 @@ /// // unpack messages for each neighbor /// for (Index_type l = 0; l <
num_neighbors; ++l) { /// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp index 56d42b59f..23722763a 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -321,7 +321,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -354,7 +354,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp index 875b0afcc..9dc1e3ec0 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -258,7 +258,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index 2d87312cb..69117ecaa 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -11,7 +11,7 @@ /// /// // pack message for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; /// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable @@ -28,7 +28,7 @@ /// // unpack messages for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// // receive message from neighbor -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable From 6687e453a2c8e60d50ab6369e6a6da93e5f98cc3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 11 Oct 2023 08:24:26 -0700 Subject: [PATCH 053/454] Add toss4_mvapich2_icpx.sh script --- scripts/lc-builds/toss4_mvapich2_icpx.sh | 57 ++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100755 scripts/lc-builds/toss4_mvapich2_icpx.sh diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh new file mode 100755 index 000000000..9c8cd5b97 --- /dev/null +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and 
RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_mvapich2_icpx.sh 2.3.7 2022.1.0"
+  exit
+fi
+
+MPI_VER=$1
+COMP_VER=$2
+shift 2
+
+BUILD_SUFFIX=lc_toss4-mvapich2-${MPI_VER}-icpx-${COMP_VER}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpic++ \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \
+  -DBLT_CXX_STD=c++14 \
+  -C ${RAJA_HOSTCONFIG} \
+  -DENABLE_MPI=On \
+  -DENABLE_OPENMP=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
+echo
+echo "  Please note that you have to run with mpi when you run"
+echo "  the RAJA Perf Suite; for example,"
+echo
+echo "    srun -n2 ./bin/raja-perf.exe"
+echo
+echo "***********************************************************************"

From 3077ece942e6c99c1b7b8918110244722a3e1777 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 11 Oct 2023 08:45:11 -0700
Subject: [PATCH 054/454] Attempt to add intel_2022_1_0_mpi spec to gitlab ci

---
 .gitlab/ruby-build-and-test-extra.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml
index 965142c5f..ec86e6ce5 100644
--- a/.gitlab/ruby-build-and-test-extra.yml
+++ b/.gitlab/ruby-build-and-test-extra.yml
@@ -37,6 +37,12 @@ intel_2022_1_0:
     allow_failure: true
   extends: .build_and_test_on_ruby
 
+intel_2022_1_0_mpi:
+  variables:
+    SPEC: "${PROJECT_RUBY_VARIANTS} +mpi %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^mvapich2"
+  allow_failure: true
+  extends: .build_and_test_on_ruby
+
 ############
 # Extra jobs
 ############

From a66b432f80c5102acb0d92af1b6f6e3f19afb155 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 11 Oct 2023 11:49:50 -0700
Subject: [PATCH 055/454] Move some setup out of base constructor

Move some of the setup of kernel properties out of the HALOEXCHANGE_base
constructor into the specific classes.
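The shape this moves toward, as the diffs below show, is a base constructor
that keeps only the shared geometry setup while each concrete kernel sets its
own rep count and per-rep cost estimates. A minimal, self-contained sketch of
that pattern (hypothetical stand-in names, not the suite's real classes; the
26-neighbor and 3-variable counts mirror the defaults used below):

    #include <cstddef>
    #include <iostream>

    // Stand-in for KernelBase: only the property setters this sketch needs.
    struct KernelSketch {
      std::size_t reps = 0;
      std::size_t kernels_per_rep = 0;
    protected:
      void setDefaultReps(std::size_t r)   { reps = r; }
      void setKernelsPerRep(std::size_t k) { kernels_per_rep = k; }
    };

    // The base class keeps only what every halo-exchange kernel shares
    // (grid dims, halo width, problem size) and no longer fixes reps.
    struct HaloBaseSketch : KernelSketch {
      HaloBaseSketch() { /* geometry-only setup */ }
    };

    // Each concrete kernel now picks values that fit its own behavior,
    // e.g. the local variant runs more reps than the MPI variant.
    struct LocalHaloSketch : HaloBaseSketch {
      LocalHaloSketch() { setDefaultReps(200); setKernelsPerRep(2 * 26 * 3); }
    };
    struct MpiHaloSketch : HaloBaseSketch {
      MpiHaloSketch()   { setDefaultReps(50);  setKernelsPerRep(2 * 26 * 3); }
    };

    int main() {
      std::cout << "local reps: " << LocalHaloSketch{}.reps
                << ", MPI reps: " << MpiHaloSketch{}.reps << "\n";
      return 0;
    }

This also lets the MPI kernels fold send/recv traffic into their byte counts,
which a single shared setup could not express.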
--- src/comm/HALOEXCHANGE.cpp | 9 +++++++++ src/comm/HALOEXCHANGE_FUSED.cpp | 9 +++++++++ src/comm/HALOEXCHANGE_base.cpp | 7 ------- src/comm/MPI_HALOEXCHANGE.cpp | 11 ++++++++++- src/comm/MPI_HALOEXCHANGE_FUSED.cpp | 11 ++++++++++- 5 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/comm/HALOEXCHANGE.cpp b/src/comm/HALOEXCHANGE.cpp index 58bc5defc..5ef0c9ec8 100644 --- a/src/comm/HALOEXCHANGE.cpp +++ b/src/comm/HALOEXCHANGE.cpp @@ -20,6 +20,15 @@ namespace comm HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE, params) { + setDefaultReps(200); + + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setFLOPsPerRep(0); + setUsesFeature(Forall); setVariantDefined( Base_Seq ); diff --git a/src/comm/HALOEXCHANGE_FUSED.cpp b/src/comm/HALOEXCHANGE_FUSED.cpp index bc4a9e9e2..00000d468 100644 --- a/src/comm/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOEXCHANGE_FUSED.cpp @@ -20,6 +20,15 @@ namespace comm HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) { + setDefaultReps(200); + + setKernelsPerRep( 2 ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setFLOPsPerRep(0); + setUsesFeature(Workgroup); setVariantDefined( Base_Seq ); diff --git a/src/comm/HALOEXCHANGE_base.cpp b/src/comm/HALOEXCHANGE_base.cpp index 2d4573a6b..f18faf0d9 100644 --- a/src/comm/HALOEXCHANGE_base.cpp +++ b/src/comm/HALOEXCHANGE_base.cpp @@ -31,7 +31,6 @@ HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) setDefaultProblemSize( m_grid_dims_default[0] * m_grid_dims_default[1] * m_grid_dims_default[2] ); - setDefaultReps(50); double cbrt_run_size = std::cbrt(getTargetProblemSize()); @@ -51,12 +50,6 @@ HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); - setFLOPsPerRep(0); } HALOEXCHANGE_base::~HALOEXCHANGE_base() diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp index 9535edfbf..5ec6df27d 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -26,7 +26,16 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); - // TODO: Figure out how to count MPI data movement in BytesPerRep + setDefaultReps(50); + + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack + (1*sizeof(Real_type) + 
1*sizeof(Real_type)) * getItsPerRep() + // pack + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send + (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setFLOPsPerRep(0); setUsesFeature(Forall); setUsesFeature(MPI); diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 66362e465..3dc788cc9 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -26,7 +26,16 @@ MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); - // TODO: Figure out how to count MPI data movement in BytesPerRep + setDefaultReps(50); + + setKernelsPerRep( 2 ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send + (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setFLOPsPerRep(0); setUsesFeature(Workgroup); setUsesFeature(MPI); From 4a27276a6d07038f2f643865ad0fda4f21d3cb92 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 09:52:13 +0200 Subject: [PATCH 056/454] Allow nested srun commands --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a4081efe1..4a2604d5c 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -17,7 +17,7 @@ variables: # Arguments for top level allocation RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" # Arguments for job level allocation - RUBY_BUILD_AND_TEST_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--overlap --reservation=ci --qos=ci_ruby --time=30 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby From 0215730706f855b9573ab5cfa133f1711ef08e0f Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:10:07 +0200 Subject: [PATCH 057/454] Update radiuss-shared-ci and radiuss-spack-configs: shared CI jobs in RSC --- .gitlab-ci.yml | 62 +++++++++++-------- .gitlab/custom-jobs-and-variables.yml | 16 ++--- .../corona.yml} | 0 .../lassen.yml} | 14 ++--- .../ruby.yml} | 8 +-- .../tioga.yml} | 2 +- .gitlab/subscribed-pipelines.yml | 29 +++++++-- tpl/RAJA | 2 +- 8 files changed, 81 insertions(+), 52 deletions(-) rename .gitlab/{corona-build-and-test-extra.yml => jobs/corona.yml} (100%) rename .gitlab/{lassen-build-and-test-extra.yml => jobs/lassen.yml} (87%) rename .gitlab/{ruby-build-and-test-extra.yml => jobs/ruby.yml} (90%) rename .gitlab/{tioga-build-and-test-extra.yml => jobs/tioga.yml} (96%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e7997e0d1..3b701d28d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,72 +6,82 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +# DESCRIPTION: ############################################################################### # General GitLab pipelines configurations for supercomputers and Linux clusters # at Lawrence Livermore National Laboratory (LLNL). -# # This entire pipeline is LLNL-specific # -# Important note: This file is a template provided by -# llnl/radiuss-shared-ci. Changes needed consists in setting variable values, -# change the reference to the radiuss-shared-ci repo, opt-in and out optional -# features. The project can then extend it with additional stages. +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# Remains to set variable values, change the reference to the radiuss-shared-ci +# repo, opt-in and out optional features. The project can then extend it with +# additional stages. # -# However, each project should provide: +# In addition, each project should copy over and complete: # - .gitlab/custom-jobs-and-variables.yml # - .gitlab/subscribed-pipelines.yml -# - .gitlab/${MACHINE}-build-and-test-extra.yml +# +# The jobs should be specified in a file local to the project, +# - .gitlab/jobs/${CI_MACHINE}.yml +# or generated (see LLNL/Umpire for an example). ############################################################################### # We define the following GitLab pipeline variables: variables: -# Required information about GitHub repository - GITHUB_PROJECT_NAME: "RAJAPerf" - GITHUB_PROJECT_ORG: "LLNL" -# Use the umdev service user to run CI. This prevents from running pipelines as -# an actual user. +##### LC GITLAB CONFIGURATION +# Use the umdev LLNL service user to run CI. This prevents from running +# pipelines as an actual user. LLNL_SERVICE_USER: rajasa # Use the service user workspace. Solves permission issues, stores everything # at the same location whoever triggers a pipeline. -# CUSTOM_CI_BUILDS_DIR: "" +# CUSTOM_CI_BUILDS_DIR: "/usr/workspace/rajasa/gitlab-runner" # Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive + +##### PROJECT VARIABLES # We build the projects in the CI clone directory. -# TODO: add a clean-up mechanism +# Used in script/gitlab/build_and_test.sh script. +# TODO: add a clean-up mechanism. BUILD_ROOT: ${CI_PROJECT_DIR} + +##### SHARED_CI CONFIGURATION +# Required information about GitHub repository + GITHUB_PROJECT_NAME: "RAJAPerf" + GITHUB_PROJECT_ORG: "LLNL" # Set the build-and-test command. 
- BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" -# Override the pattern describing branches that will skip the "draft PR test". -# Add protected branches here. See default value in + JOB_CMD: "./scripts/gitlab/build_and_test.sh" +# Override the pattern describing branches that will skip the "draft PR filter +# test". Add protected branches here. See default value in # preliminary-ignore-draft-pr.yml. # ALWAYS_RUN_PATTERN: "^develop$|^main$|^v[0-9.]*-RC$" -# We organize the build-and-test stage in sub-pipelines. Each sub-pipeline +# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline # corresponds to a test batch on a given machine. # High level stages stages: - - machine-checks + - prerequisites - build-and-test -# Template for jobs triggering a build-and-test sub-pipelines: +# Template for jobs triggering a build-and-test sub-pipeline: .build-and-test: stage: build-and-test trigger: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.08.0 - file: '${CI_MACHINE}-build-and-test.yml' - - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + ref: 'v2023.09.0' + file: 'pipelines/${CI_MACHINE}.yml' + - artifact: '${CI_MACHINE}-jobs.yml' + job: 'generate-job-lists' strategy: depend forward: pipeline_variables: true include: - # checks preliminary to running the actual CI test (optional) + # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: v2023.03.1 - # file: 'preliminary-ignore-draft-pr.yml' + # ref: 'v2023.09.0' + # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a4081efe1..dd5137f27 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -15,9 +15,9 @@ variables: # Ruby # Arguments for top level allocation - RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" # Arguments for job level allocation - RUBY_BUILD_AND_TEST_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" + RUBY_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby @@ -25,9 +25,9 @@ variables: # Corona # Arguments for top level allocation - CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + CORONA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation - CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" + CORONA_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -35,9 +35,9 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation - TIOGA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -46,7 +46,7 @@ 
variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 30" + LASSEN_JOB_ALLOC: "1 -W 30" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen @@ -56,7 +56,7 @@ variables: # Not all configuration can be shared. Here projects can fine tune the # CI behavior. # See Umpire for an example (export junit test reports). -.custom_build_and_test: +.custom_job: artifacts: reports: junit: junit.xml diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/jobs/corona.yml similarity index 100% rename from .gitlab/corona-build-and-test-extra.yml rename to .gitlab/jobs/corona.yml diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/jobs/lassen.yml similarity index 87% rename from .gitlab/lassen-build-and-test-extra.yml rename to .gitlab/jobs/lassen.yml index f9610a1d1..3c72bebf8 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/jobs/lassen.yml @@ -18,8 +18,8 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen + LASSEN_JOB_ALLOC: "1 -W 120" + extends: .job_on_lassen ############ @@ -30,11 +30,11 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: # describe the spec here. gcc_8_3_1_cuda_11_5_0_ats_disabled: - extends: .build_and_test_on_lassen + extends: .job_on_lassen variables: SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers" MODULE_LIST: "cuda/11.5.0" - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30" ########## # OTHERS @@ -43,17 +43,17 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\"" - extends: .build_and_test_on_lassen + extends: .job_on_lassen #clang_14_0_5_asan: # variables: # SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\"" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: # SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/jobs/ruby.yml similarity index 90% rename from .gitlab/ruby-build-and-test-extra.yml rename to .gitlab/jobs/ruby.yml index 965142c5f..6eafdf9e1 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/jobs/ruby.yml @@ -17,25 +17,25 @@ clang_14_0_6: variables: SPEC: " ~shared +openmp +omptask %clang@14.0.6" - extends: .build_and_test_on_ruby + extends: .job_on_ruby gcc_10_3_1: variables: SPEC: " ~shared +openmp +omptask %gcc@10.3.1" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby 
intel_19_1_2_gcc_10_3_1: variables: SPEC: " +openmp %intel@19.1.2.gcc.10.3.1" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby intel_2022_1_0: variables: SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" allow_failure: true - extends: .build_and_test_on_ruby + extends: .job_on_ruby ############ # Extra jobs diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/jobs/tioga.yml similarity index 96% rename from .gitlab/tioga-build-and-test-extra.yml rename to .gitlab/jobs/tioga.yml index d3d054b4a..1180b74a9 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/jobs/tioga.yml @@ -25,4 +25,4 @@ rocmcc_5_4_3_hip_openmp: variables: SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@develop" - extends: .build_and_test_on_tioga + extends: .job_on_tioga diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 108e84a54..ce1f52abd 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -9,7 +9,7 @@ # The template job to test whether a machine is up. # Expects CI_MACHINE defined to machine name. .machine-check: - stage: machine-checks + stage: prerequisites tags: [shell, oslic] variables: GIT_STRATEGY: none @@ -30,6 +30,25 @@ # Comment the jobs for machines you don’t need. ### +# One job to generate the job list for all the subpipelines +generate-job-lists: + stage: prerequisites + tags: [shell, oslic] + variables: + RADIUSS_JOBS_PATH: "scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + LOCAL_JOBS_PATH: ".gitlab/jobs" + script: + - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/lassen.yml ${LOCAL_JOBS_PATH}/lassen.yml > lassen-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/corona.yml ${LOCAL_JOBS_PATH}/corona.yml > corona-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/tioga.yml ${LOCAL_JOBS_PATH}/tioga.yml > tioga-jobs.yml + artifacts: + paths: + - ruby-jobs.yml + - lassen-jobs.yml + - corona-jobs.yml + - tioga-jobs.yml + # RUBY ruby-up-check: variables: @@ -39,7 +58,7 @@ ruby-up-check: ruby-build-and-test: variables: CI_MACHINE: "ruby" - needs: [ruby-up-check] + needs: [ruby-up-check, generate-job-lists] extends: [.build-and-test] # CORONA @@ -51,7 +70,7 @@ corona-up-check: corona-build-and-test: variables: CI_MACHINE: "corona" - needs: [corona-up-check] + needs: [corona-up-check, generate-job-lists] extends: [.build-and-test] # TIOGA @@ -63,7 +82,7 @@ tioga-up-check: tioga-build-and-test: variables: CI_MACHINE: "tioga" - needs: [tioga-up-check] + needs: [tioga-up-check, generate-job-lists] extends: [.build-and-test] # LASSEN @@ -75,7 +94,7 @@ lassen-up-check: lassen-build-and-test: variables: CI_MACHINE: "lassen" - needs: [lassen-up-check] + needs: [lassen-up-check, generate-job-lists] extends: [.build-and-test] diff --git a/tpl/RAJA b/tpl/RAJA index e78b1eb03..a33c48335 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit e78b1eb03cbcd9f954c9f54ea79b5f6f479bde45 +Subproject commit a33c48335571ab5b46e68f72547f8aac4711a8d3 From 9af5458714fd1c4ebfc5040d4a10fabd96957336 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:17:37 +0200 Subject: [PATCH 058/454] Fix shared jobs location --- .gitlab/subscribed-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index ce1f52abd..5046059c8 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -35,7 +35,7 @@ generate-job-lists: stage: prerequisites tags: [shell, oslic] variables: - RADIUSS_JOBS_PATH: "scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" LOCAL_JOBS_PATH: ".gitlab/jobs" script: - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml From 2769a27b5839f7efc1ef6165452acfe8dbb30df5 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:38:29 +0200 Subject: [PATCH 059/454] Fix: loop_exec -> seq_exec in RAJA --- TODO/WIP-COUPLE.cpp | 2 +- src/apps/EDGE3D-Seq.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/TODO/WIP-COUPLE.cpp b/TODO/WIP-COUPLE.cpp index 2e7c70197..5769b04e6 100644 --- a/TODO/WIP-COUPLE.cpp +++ b/TODO/WIP-COUPLE.cpp @@ -110,7 +110,7 @@ void COUPLE::runKernel(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { COUPLE_BODY; }); diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index 6658650b1..658064427 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -70,7 +70,7 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( + RAJA::forall( RAJA::RangeSegment(ibegin, iend), edge3d_lam); } From 570e6d79758219febcabe7c41116942aaa738f43 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:35:58 +0200 Subject: [PATCH 060/454] From RAJA: From RSC: Add mpi variant to RAJAPerf --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index a33c48335..2b9443bd0 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit a33c48335571ab5b46e68f72547f8aac4711a8d3 +Subproject commit 2b9443bd06b8319070380b227a412ce32ec9ba74 From 8a943d35ed726fe9a89d9948c193f7b1b2fe821a Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:39:44 +0200 Subject: [PATCH 061/454] Fix oversight: rename extends --- .gitlab/jobs/ruby.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 34dff6e52..2e814accb 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -41,7 +41,7 @@ intel_2022_1_0_mpi: variables: SPEC: "${PROJECT_RUBY_VARIANTS} +mpi %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^mvapich2" allow_failure: true - extends: .build_and_test_on_ruby + extends: .job_on_ruby ############ # Extra jobs From 0156e9155fd270cc79995f3d6fbfa27a6493d1a4 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:40:41 +0200 Subject: [PATCH 062/454] Remove unnecessary allowed failure --- .gitlab/jobs/ruby.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 2e814accb..0799d1d9e 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -34,13 +34,11 @@ intel_19_1_2_gcc_10_3_1: intel_2022_1_0: variables: SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" - allow_failure: true extends: .job_on_ruby intel_2022_1_0_mpi: variables: SPEC: "${PROJECT_RUBY_VARIANTS} +mpi %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^mvapich2" - allow_failure: true extends: .job_on_ruby ############ From b0b9bbc4f96e96295414ad8c44e82782b7b52c72 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:49:12 +0200 Subject: [PATCH 063/454] From RAJA: From RSC: fix copy-paste error --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 2b9443bd0..8531cbfd9 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 2b9443bd06b8319070380b227a412ce32ec9ba74 +Subproject commit 8531cbfd93b53afdb0dfe436f78dbc04f59a93fc From 1c91b865f21f5f7d6e9e3b110859a52747d00ccd Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:55:43 +0200 Subject: [PATCH 064/454] Fix: loop_exec -> seq_exec in RAJA --- src/comm/MPI_HALOEXCHANGE-Seq.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comm/MPI_HALOEXCHANGE-Seq.cpp b/src/comm/MPI_HALOEXCHANGE-Seq.cpp index 2e952485f..1346e6d92 100644 --- a/src/comm/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Seq.cpp @@ -163,7 +163,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t case RAJA_Seq : { - using EXEC_POL = RAJA::loop_exec; + using EXEC_POL = RAJA::seq_exec; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From cbe13098cf3eb0855393570d2877a5ca014d928c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:19:38 +0200 Subject: [PATCH 065/454] Fix loop_work -> seq_work --- src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp | 2 +- tpl/RAJA | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp index b8ccba2b3..84ddbaa0e 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp @@ -218,7 +218,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED AllocatorHolder allocatorHolder; using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::loop_work, + RAJA::seq_work, RAJA::ordered, RAJA::constant_stride_array_of_objects >; diff --git a/tpl/RAJA b/tpl/RAJA index 8531cbfd9..007395508 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 8531cbfd93b53afdb0dfe436f78dbc04f59a93fc +Subproject commit 0073955088b8bf4e07a3dca5f4e8cf838a869fb5 From f62258bf6cf4f5139fa5997184a1843f076e207a Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 16:35:40 +0200 Subject: [PATCH 066/454] Attempt at running test-raja-perf-suite with 2 mpi processus --- test/CMakeLists.txt | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c2d21d81d..1f7e36c1d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,12 +15,22 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS stream algorithm comm) -list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) - -raja_add_test( - NAME test-raja-perf-suite - SOURCES test-raja-perf-suite.cpp + +list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS} gtest ${CMAKE_THREAD_LIBS_INIT}) + +set(test_name test-raja-perf-suite) + +raja_add_executable( + NAME ${test_name}.exe + SOURCES ${test_name}.cpp DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} - ) + TEST On) + +blt_add_test( + NAME ${test_name} + NUM_MPI_TASKS 2 + COMMAND ${TEST_DRIVER} ${test_name}) + +raja_set_failtest(${test_name}) target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src) From 9633d6f28ecbabb6c1f90867baa0c6a59fc19e3a Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 16:51:51 +0200 Subject: [PATCH 067/454] Fix --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1f7e36c1d..1afe17c7a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ raja_add_executable( blt_add_test( NAME ${test_name} NUM_MPI_TASKS 2 - COMMAND ${TEST_DRIVER} ${test_name}) + COMMAND ${TEST_DRIVER} ${test_name}.exe) raja_set_failtest(${test_name}) From fe7ba13e643b704f1d2d105b00a8bee5da85bc5f Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 17:03:01 +0200 Subject: [PATCH 068/454] Revert "Fix" This reverts commit 9633d6f28ecbabb6c1f90867baa0c6a59fc19e3a. --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1afe17c7a..1f7e36c1d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ raja_add_executable( blt_add_test( NAME ${test_name} NUM_MPI_TASKS 2 - COMMAND ${TEST_DRIVER} ${test_name}.exe) + COMMAND ${TEST_DRIVER} ${test_name}) raja_set_failtest(${test_name}) From 682d21e4b9cdf2ea03ea4511c4d7423f9e4495e7 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 17:03:14 +0200 Subject: [PATCH 069/454] Revert "Attempt at running test-raja-perf-suite with 2 mpi processus" This reverts commit f62258bf6cf4f5139fa5997184a1843f076e207a. 
--- test/CMakeLists.txt | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1f7e36c1d..c2d21d81d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,22 +15,12 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS stream algorithm comm) - -list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS} gtest ${CMAKE_THREAD_LIBS_INIT}) - -set(test_name test-raja-perf-suite) - -raja_add_executable( - NAME ${test_name}.exe - SOURCES ${test_name}.cpp +list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) + +raja_add_test( + NAME test-raja-perf-suite + SOURCES test-raja-perf-suite.cpp DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} - TEST On) - -blt_add_test( - NAME ${test_name} - NUM_MPI_TASKS 2 - COMMAND ${TEST_DRIVER} ${test_name}) - -raja_set_failtest(${test_name}) + ) target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src) From 3f0cddd73d4fe915ff5c6c79ddb3003fad203743 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 12 Oct 2023 11:13:04 -0700 Subject: [PATCH 070/454] Remove now unnecessary sync --- src/basic/INDEXLIST-Hip.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9a3309d9d..f30bda0c9 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -264,7 +264,6 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) allocData(DataSpace::HipDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::HipDevice, block_readys, grid_size); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From 6f6f7e4c942a85be0711188dc2364e83fdb5637d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 12 Oct 2023 11:13:25 -0700 Subject: [PATCH 071/454] Fix cuda INDEXLIST implementation --- src/basic/INDEXLIST-Cuda.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 861341b04..de674e5a7 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -189,12 +189,6 @@ __device__ void grid_scan(const int block_id, exclusive[ti] = prev_grid_count + exclusive[ti]; inclusive[ti] = prev_grid_count + inclusive[ti]; } - - if (last_block) { - for (unsigned i = threadIdx.x; i < gridDim.x-1; i += block_size) { - while (atomicCAS(&block_readys[i], 2u, 0u) != 2u); - } - } } } @@ -270,12 +264,11 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) allocData(DataSpace::CudaDevice, grid_counts, grid_size); unsigned* block_readys; allocData(DataSpace::CudaDevice, block_readys, grid_size); - cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); indexlist <<>>( x+ibegin, list+ibegin, From ef68b0bc4172db7b729657dcc97c7ef53e227822 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 12 Oct 2023 20:32:25 +0200 Subject: [PATCH 072/454] Update RAJA --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 007395508..9e4093fbe 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 0073955088b8bf4e07a3dca5f4e8cf838a869fb5 +Subproject commit 9e4093fbed314c773efd8bf11c98c5ccd3169c0c From ba667275ae55dba18dec28b25f6f1d89bef90355 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 12 Oct 2023 13:29:50 -0700 Subject: [PATCH 073/454] no need to allow intel failure --- .gitlab/ruby-build-and-test-extra.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml index ec86e6ce5..3ee35bdb3 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -34,13 +34,11 @@ intel_19_1_2_gcc_10_3_1: intel_2022_1_0: variables: SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" - allow_failure: true extends: .build_and_test_on_ruby intel_2022_1_0_mpi: variables: SPEC: "${PROJECT_RUBY_VARIANTS} +mpi %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^mvapich2" - allow_failure: true extends: .build_and_test_on_ruby ############ From 6eb5e448c5114f68fcab2911354dc64a8caf382a Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 13 Oct 2023 23:15:03 +0200 Subject: [PATCH 074/454] Update RAJA to latest develop reference --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 9e4093fbe..974b93c34 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 9e4093fbed314c773efd8bf11c98c5ccd3169c0c +Subproject commit 974b93c34fb0e88cf2e0fdd1192609cbd995c5d4 From ed996e7611ab0ff1f737eb72970767c6ab9f8a9a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 13 Oct 2023 15:22:59 -0700 Subject: [PATCH 075/454] Add main to test-raja-perf-suite --- test/test-raja-perf-suite.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index 26ebcbda5..f19d9cd89 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -8,6 +8,10 @@ #include "gtest/gtest.h" +#if defined(RUN_KOKKOS) +#include +#endif + #include "common/Executor.hpp" #include "common/KernelBase.hpp" @@ -16,6 +20,33 @@ #include #include +#if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include +#endif + +int main( int argc, char** argv ) +{ + testing::InitGoogleTest(&argc, argv); + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Init(&argc, &argv); +#endif +#if defined(RUN_KOKKOS) + Kokkos::initialize(argc, argv); +#endif + + int res = RUN_ALL_TESTS(); + +#if defined(RUN_KOKKOS) + Kokkos::finalize(); +#endif +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_Finalize(); +#endif + + return res; +} + TEST(ShortSuiteTest, Basic) { From 54a1eaa5f929c1c75abeb538148d9e15b5a0b1ba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 13 Oct 2023 15:23:12 -0700 Subject: [PATCH 076/454] Improve ref implementations --- src/comm/HALOEXCHANGE.hpp | 12 ++++++------ src/comm/HALOEXCHANGE_FUSED.hpp | 12 ++++++------ src/comm/HALOEXCHANGE_base.hpp | 12 ++++++------ src/comm/MPI_HALOEXCHANGE.hpp | 12 ++++++------ src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 12 ++++++------ 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp index df260d31e..7f2d63991 
100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -9,7 +9,7 @@ /// /// HALOEXCHANGE kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -18,16 +18,16 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor +/// // send buffer to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor +/// // receive buffer from neighbor /// Real_ptr buffer = buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp index 62d709599..156607092 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -9,7 +9,7 @@ /// /// HALOEXCHANGE_FUSED kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -18,16 +18,16 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor +/// // send buffer to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor +/// // receive buffer from neighbor /// Real_ptr buffer = buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } diff --git a/src/comm/HALOEXCHANGE_base.hpp b/src/comm/HALOEXCHANGE_base.hpp index 5864f2f1a..1f791a280 100644 --- a/src/comm/HALOEXCHANGE_base.hpp +++ b/src/comm/HALOEXCHANGE_base.hpp @@ -9,7 +9,7 @@ /// /// HALOEXCHANGE kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -18,16 +18,16 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor +/// // send buffer to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor +/// // receive buffer from 
neighbor /// Real_ptr buffer = buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp index bf7fef72a..66f04353d 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -9,7 +9,7 @@ /// /// MPI_HALOEXCHANGE kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -18,16 +18,16 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor +/// // send buffer to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor +/// // receive buffer from neighbor /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index 69117ecaa..18a89c4ab 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -9,7 +9,7 @@ /// /// MPI_HALOEXCHANGE_FUSED kernel reference implementation: /// -/// // pack message for each neighbor +/// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -18,16 +18,16 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_PACK_BODY; +/// buffer[i] = var[list[i]]; /// } /// buffer += len; /// } -/// // send message to neighbor +/// // send buffer to neighbor /// } /// -/// // unpack messages for each neighbor +/// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive message from neighbor +/// // receive buffer from neighbor /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -35,7 +35,7 @@ /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; /// for (Index_type i = 0; i < len; i++) { -/// HALOEXCHANGE_UNPACK_BODY; +/// var[list[i]] = buffer[i]; /// } /// buffer += len; /// } From 2bf933bbcdb380b14e1e4aa6ea182485c62e10e4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 13 Oct 2023 15:51:27 -0700 Subject: [PATCH 077/454] Improve ref implementations Add simple MPI calls --- src/comm/HALOEXCHANGE.hpp | 4 ++++ src/comm/HALOEXCHANGE_FUSED.hpp | 14 ++++++++++---- src/comm/MPI_HALOEXCHANGE.hpp | 14 ++++++++++++++ src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 25 +++++++++++++++++++++---- 4 files changed, 49 insertions(+), 8 
deletions(-) diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp index 7f2d63991..c67e77f8b 100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -9,6 +9,8 @@ /// /// HALOEXCHANGE kernel reference implementation: /// +/// // post a recv for each neighbor +/// /// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = buffers[l]; @@ -41,6 +43,8 @@ /// } /// } /// +/// // wait for all sends to complete +/// #ifndef RAJAPerf_Comm_HALOEXCHANGE_HPP #define RAJAPerf_Comm_HALOEXCHANGE_HPP diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp index 156607092..464373cfb 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -9,7 +9,9 @@ /// /// HALOEXCHANGE_FUSED kernel reference implementation: /// -/// // pack a buffer for each neighbor +/// // post a recv for each neighbor +/// +/// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -22,12 +24,14 @@ /// } /// buffer += len; /// } -/// // send buffer to neighbor /// } /// -/// // unpack a buffer for each neighbor +/// // send buffers to neighbors +/// +/// // receive buffers from neighbors +/// +/// // unpack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive buffer from neighbor /// Real_ptr buffer = buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -41,6 +45,8 @@ /// } /// } /// +/// // wait for all sends to complete +/// #ifndef RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP #define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp index 66f04353d..2bd18c288 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -9,6 +9,13 @@ /// /// MPI_HALOEXCHANGE kernel reference implementation: /// +/// // post a recv for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Index_type len = unpack_index_list_lengths[l]; +/// MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); +/// } +/// /// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; @@ -23,11 +30,14 @@ /// buffer += len; /// } /// // send buffer to neighbor +/// MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); /// } /// /// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { /// // receive buffer from neighbor +/// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -41,6 +51,10 @@ /// } /// } /// +/// // wait for all sends to complete +/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// + #ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP #define RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index 18a89c4ab..65bfa1a6e 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -9,7 +9,14 @@ /// /// MPI_HALOEXCHANGE_FUSED kernel reference implementation: /// -/// // pack a buffer for each neighbor +/// // post a recv 
for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Index_type len = unpack_index_list_lengths[l]; +/// MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); +/// } +/// +/// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; @@ -22,12 +29,19 @@ /// } /// buffer += len; /// } -/// // send buffer to neighbor /// } /// -/// // unpack a buffer for each neighbor +/// // send buffers to neighbors +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); +/// } +/// +/// // wait for all recvs to complete +/// MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// +/// // unpack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive buffer from neighbor /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; @@ -41,6 +55,9 @@ /// } /// } /// +/// // wait for all sends to complete +/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// #ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP #define RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP From ea2930dbf3087539c52196eeeb77a6cfd196604d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 13 Oct 2023 16:24:55 -0700 Subject: [PATCH 078/454] Move vars logic out of HALOEXCHANGE_base This allows us to use the same index list logic with different communication patterns. --- src/comm/HALOEXCHANGE.cpp | 28 ++++++++++++++ src/comm/HALOEXCHANGE.hpp | 9 +++++ src/comm/HALOEXCHANGE_FUSED.cpp | 28 ++++++++++++++ src/comm/HALOEXCHANGE_FUSED.hpp | 9 +++++ src/comm/HALOEXCHANGE_base.cpp | 57 +++++++---------------------- src/comm/HALOEXCHANGE_base.hpp | 20 +++------- src/comm/MPI_HALOEXCHANGE.cpp | 28 ++++++++++++++ src/comm/MPI_HALOEXCHANGE.hpp | 9 +++++ src/comm/MPI_HALOEXCHANGE_FUSED.cpp | 28 ++++++++++++++ src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 9 +++++ 10 files changed, 168 insertions(+), 57 deletions(-) diff --git a/src/comm/HALOEXCHANGE.cpp b/src/comm/HALOEXCHANGE.cpp index 5ef0c9ec8..5429620e7 100644 --- a/src/comm/HALOEXCHANGE.cpp +++ b/src/comm/HALOEXCHANGE.cpp @@ -22,6 +22,10 @@ HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) { setDefaultReps(200); + m_num_vars = s_num_vars_default; + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack @@ -59,6 +63,18 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) const int mpi_dims[3] = {1,1,1}; setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + m_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; @@ -66,6 +82,13 @@ void 
HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) } } +void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) { for (int l = 0; l < s_num_neighbors; ++l) { @@ -73,6 +96,11 @@ void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) } m_buffers.clear(); + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + tearDown_base(vid, tune_idx); } diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp index c67e77f8b..f7a93bd7f 100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -52,6 +52,9 @@ #define HALOEXCHANGE_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ std::vector buffers = m_buffers; @@ -73,6 +76,7 @@ class HALOEXCHANGE : public HALOEXCHANGE_base ~HALOEXCHANGE(); void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); void runSeqVariant(VariantID vid, size_t tune_idx); @@ -92,6 +96,11 @@ class HALOEXCHANGE : public HALOEXCHANGE_base static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + std::vector m_buffers; }; diff --git a/src/comm/HALOEXCHANGE_FUSED.cpp b/src/comm/HALOEXCHANGE_FUSED.cpp index 00000d468..3c4d5440f 100644 --- a/src/comm/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOEXCHANGE_FUSED.cpp @@ -22,6 +22,10 @@ HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) { setDefaultReps(200); + m_num_vars = s_num_vars_default; + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 ); setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack @@ -59,6 +63,18 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) const int mpi_dims[3] = {1,1,1}; setUp_base(my_mpi_rank, mpi_dims, vid, tune_idx); + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + m_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; @@ -66,6 +82,13 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) } } +void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) { for (int l = 0; l < s_num_neighbors; ++l) { @@ -73,6 +96,11 @@ void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) } m_buffers.clear(); + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + tearDown_base(vid, tune_idx); } diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp index 464373cfb..305ee147b 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -54,6 +54,9 @@ 
#define HALOEXCHANGE_FUSED_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ std::vector buffers = m_buffers; #define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ @@ -119,6 +122,7 @@ class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base ~HALOEXCHANGE_FUSED(); void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); void runSeqVariant(VariantID vid, size_t tune_idx); @@ -138,6 +142,11 @@ class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base static const size_t default_gpu_block_size = 1024; using gpu_block_sizes_type = gpu_block_size::make_list_type; + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + std::vector m_buffers; }; diff --git a/src/comm/HALOEXCHANGE_base.cpp b/src/comm/HALOEXCHANGE_base.cpp index f18faf0d9..41f3c89b7 100644 --- a/src/comm/HALOEXCHANGE_base.cpp +++ b/src/comm/HALOEXCHANGE_base.cpp @@ -19,37 +19,32 @@ namespace rajaperf namespace comm { +Index_type HALOEXCHANGE_base::s_grid_dims_default[3] {100, 100, 100}; +Index_type HALOEXCHANGE_base::s_halo_width_default = 1; +Index_type HALOEXCHANGE_base::s_num_vars_default = 3; + HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) : KernelBase(kid, params) { - m_grid_dims_default[0] = 100; - m_grid_dims_default[1] = 100; - m_grid_dims_default[2] = 100; - m_halo_width_default = 1; - m_num_vars_default = 3; - - setDefaultProblemSize( m_grid_dims_default[0] * - m_grid_dims_default[1] * - m_grid_dims_default[2] ); + setDefaultProblemSize( s_grid_dims_default[0] * + s_grid_dims_default[1] * + s_grid_dims_default[2] ); double cbrt_run_size = std::cbrt(getTargetProblemSize()); m_grid_dims[0] = cbrt_run_size; m_grid_dims[1] = cbrt_run_size; m_grid_dims[2] = cbrt_run_size; - m_halo_width = m_halo_width_default; - m_num_vars = m_num_vars_default; + m_halo_width = s_halo_width_default; m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; m_grid_plus_halo_dims[2] = m_grid_dims[2] + 2*m_halo_width; - m_var_size = m_grid_plus_halo_dims[0] * - m_grid_plus_halo_dims[1] * - m_grid_plus_halo_dims[2] ; + m_grid_plus_halo_size = m_grid_plus_halo_dims[0] * + m_grid_plus_halo_dims[1] * + m_grid_plus_halo_dims[2] ; setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); - - setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); } HALOEXCHANGE_base::~HALOEXCHANGE_base() @@ -59,18 +54,6 @@ HALOEXCHANGE_base::~HALOEXCHANGE_base() void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - m_vars.resize(m_num_vars, nullptr); - for (Index_type v = 0; v < m_num_vars; ++v) { - allocAndInitData(m_vars[v], m_var_size, vid); - auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); - - Real_ptr var = m_vars[v]; - - for (Index_type i = 0; i < m_var_size; i++) { - var[i] = i + v; - } - } - m_mpi_ranks.resize(s_num_neighbors, -1); m_send_tags.resize(s_num_neighbors, -1); m_pack_index_lists.resize(s_num_neighbors, nullptr); @@ -85,13 +68,6 @@ void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, s_num_neighbors, vid); } -void HALOEXCHANGE_base::updateChecksum(VariantID vid, size_t tune_idx) -{ - for (Real_ptr var : m_vars) { - checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); - } -} - void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) { destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid); @@ -102,15 +78,10 @@ void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG( m_pack_index_lists.clear(); m_send_tags.clear(); m_mpi_ranks.clear(); - - for (int v = 0; v < m_num_vars; ++v) { - deallocData(m_vars[v], vid); - } - m_vars.clear(); } -const int HALOEXCHANGE_base::boundary_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{ +const int HALOEXCHANGE_base::s_boundary_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{ // faces {-1, 0, 0}, @@ -222,7 +193,7 @@ void HALOEXCHANGE_base::create_lists( std::map boundary_idx_to_tag; for (Index_type l = 0; l < num_neighbors; ++l) { - boundary_idx_to_tag[get_boundary_idx(boundary_offsets[l])] = l; + boundary_idx_to_tag[get_boundary_idx(s_boundary_offsets[l])] = l; } const Index_type grid_i_stride = 1; @@ -231,7 +202,7 @@ void HALOEXCHANGE_base::create_lists( for (Index_type l = 0; l < num_neighbors; ++l) { - const int (&boundary_offset)[3] = boundary_offsets[l]; + const int (&boundary_offset)[3] = s_boundary_offsets[l]; int neighbor_boundary_offset[3]{-1, -1, -1}; for (int dim = 0; dim < 3; ++dim) { diff --git a/src/comm/HALOEXCHANGE_base.hpp b/src/comm/HALOEXCHANGE_base.hpp index 1f791a280..04fc01e33 100644 --- a/src/comm/HALOEXCHANGE_base.hpp +++ b/src/comm/HALOEXCHANGE_base.hpp @@ -46,10 +46,7 @@ #define RAJAPerf_Comm_HALOEXCHANGE_base_HPP #define HALOEXCHANGE_base_DATA_SETUP \ - std::vector vars = m_vars; \ - \ Index_type num_neighbors = s_num_neighbors; \ - Index_type num_vars = m_num_vars; \ std::vector send_tags = m_send_tags; \ std::vector pack_index_lists = m_pack_index_lists; \ std::vector pack_index_list_lengths = m_pack_index_list_lengths; \ @@ -87,7 +84,6 @@ class HALOEXCHANGE_base : public KernelBase void setUp_base(const int my_mpi_rank, const int* mpi_dims, VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); void tearDown_base(VariantID vid, size_t tune_idx); protected: @@ -108,21 +104,17 @@ class HALOEXCHANGE_base : public KernelBase }; static const int s_num_neighbors = 26; - static const int boundary_offsets[s_num_neighbors][3]; + static const int s_boundary_offsets[s_num_neighbors][3]; + + static Index_type s_grid_dims_default[3]; + static Index_type s_halo_width_default; + static Index_type s_num_vars_default; Index_type m_grid_dims[3]; Index_type m_halo_width; - Index_type m_num_vars; - - Index_type m_grid_dims_default[3]; - Index_type m_halo_width_default; - Index_type m_num_vars_default; Index_type m_grid_plus_halo_dims[3]; - Index_type m_var_size; - Index_type m_var_halo_size; - - std::vector m_vars; + Index_type m_grid_plus_halo_size; std::vector m_mpi_ranks; diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp index 5ec6df27d..006bc294c 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -28,6 +28,10 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) setDefaultReps(50); + m_num_vars = s_num_vars_default; + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack @@ -68,6 +72,18 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + m_vars.resize(m_num_vars, 
nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); m_pack_buffers.resize(s_num_neighbors, nullptr); @@ -97,6 +113,13 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) } } +void MPI_HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); @@ -123,6 +146,11 @@ void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) m_send_buffers.clear(); m_pack_buffers.clear(); + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + tearDown_base(vid, tune_idx); } diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp index 2bd18c288..83569304d 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -62,6 +62,9 @@ #define MPI_HALOEXCHANGE_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ std::vector mpi_ranks = m_mpi_ranks; \ \ std::vector pack_mpi_requests(num_neighbors); \ @@ -101,6 +104,7 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base ~MPI_HALOEXCHANGE(); void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); void runSeqVariant(VariantID vid, size_t tune_idx); @@ -124,6 +128,11 @@ class MPI_HALOEXCHANGE : public HALOEXCHANGE_base int m_my_mpi_rank = -1; std::array m_mpi_dims = {-1, -1, -1}; + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + std::vector m_pack_buffers; std::vector m_unpack_buffers; diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 3dc788cc9..36401da88 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -28,6 +28,10 @@ MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) setDefaultReps(50); + m_num_vars = s_num_vars_default; + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 ); setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack @@ -68,6 +72,18 @@ void MPI_HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + m_vars.resize(m_num_vars, nullptr); + for (Index_type v = 0; v < m_num_vars; ++v) { + allocAndInitData(m_vars[v], m_var_size, vid); + auto reset_var = scopedMoveData(m_vars[v], m_var_size, vid); + + Real_ptr var = m_vars[v]; + + for (Index_type i = 0; i < m_var_size; i++) { + var[i] = i + v; + } + } + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); m_pack_buffers.resize(s_num_neighbors, nullptr); @@ -97,6 +113,13 @@ void MPI_HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) } } +void MPI_HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +{ + for (Real_ptr var : m_vars) { + checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); + } +} + void 
MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); @@ -123,6 +146,11 @@ void MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) m_send_buffers.clear(); m_pack_buffers.clear(); + for (int v = 0; v < m_num_vars; ++v) { + deallocData(m_vars[v], vid); + } + m_vars.clear(); + tearDown_base(vid, tune_idx); } diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index 65bfa1a6e..fde967245 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -65,6 +65,9 @@ #define MPI_HALOEXCHANGE_FUSED_DATA_SETUP \ HALOEXCHANGE_base_DATA_SETUP \ \ + Index_type num_vars = m_num_vars; \ + std::vector vars = m_vars; \ + \ std::vector mpi_ranks = m_mpi_ranks; \ \ std::vector pack_mpi_requests(num_neighbors); \ @@ -145,6 +148,7 @@ class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base ~MPI_HALOEXCHANGE_FUSED(); void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); void runSeqVariant(VariantID vid, size_t tune_idx); @@ -168,6 +172,11 @@ class MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base int m_my_mpi_rank = -1; std::array m_mpi_dims = {-1, -1, -1}; + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_vars; + std::vector m_pack_buffers; std::vector m_unpack_buffers; From 51efadbf56559e9398103e791c71ba9795a371e4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 13 Oct 2023 16:42:17 -0700 Subject: [PATCH 079/454] Set HALOEXCHANGE default reps to same for MPI and non-MPI --- src/comm/MPI_HALOEXCHANGE.cpp | 2 +- src/comm/MPI_HALOEXCHANGE_FUSED.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp index 006bc294c..684fe1c48 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -26,7 +26,7 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); - setDefaultReps(50); + setDefaultReps(200); m_num_vars = s_num_vars_default; m_var_size = m_grid_plus_halo_size ; diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 36401da88..9706c0471 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -26,7 +26,7 @@ MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) m_my_mpi_rank = params.getMPIRank(); m_mpi_dims = params.getMPI3DDivision(); - setDefaultReps(50); + setDefaultReps(200); m_num_vars = s_num_vars_default; m_var_size = m_grid_plus_halo_size ; From bcfcc091f77fc4bac5b355782f75fd73b7180a81 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 25 Oct 2023 15:46:57 -0700 Subject: [PATCH 080/454] Add rdc to cuda builds Required for using function pointers. Not sure how we got away without this. 
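Background on why rdc matters here: relocatable device code lets a
kernel call a __device__ function through a pointer that the device
linker resolves, instead of requiring every call target to be visible
at compile time. A minimal sketch of the pattern, with illustrative
names that are not taken from the suite, built as nvcc -rdc=true ex.cu:

    #include <cuda_runtime.h>

    using work_fn = void (*)(double*, int);

    __device__ void scale(double* x, int i) { x[i] *= 2.0; }

    // Device-side variable holding the function's address; the device
    // linker resolves this, which is what relocatable device code
    // (separable compilation) provides.
    __device__ work_fn d_scale = scale;

    __global__ void apply(work_fn fn, double* x, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) { fn(x, i); }
    }

    int main() {
      int n = 256;
      double* x = nullptr;
      cudaMalloc(&x, n * sizeof(double));
      work_fn h_fn = nullptr;
      // Copy the device function pointer to the host so it can be
      // passed as an ordinary kernel argument.
      cudaMemcpyFromSymbol(&h_fn, d_scale, sizeof(work_fn));
      apply<<<(n + 255) / 256, 256>>>(h_fn, x, n);
      cudaDeviceSynchronize();
      cudaFree(x);
      return 0;
    }

The -DCUDA_SEPARABLE_COMPILATION=On entries added below switch on the
corresponding FindCUDA option so device objects are compiled as
relocatable device code.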
--- scripts/lc-builds/blueos_nvcc_clang.sh | 1 + scripts/lc-builds/blueos_nvcc_clang_caliper.sh | 1 + scripts/lc-builds/blueos_nvcc_gcc.sh | 1 + scripts/lc-builds/blueos_nvcc_xl.sh | 1 + scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 1 + 5 files changed, 5 insertions(+) diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 9801459b9..16364353e 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index b121d68c2..b7cf0f953 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -49,6 +49,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 200e86f9b..e8aac3058 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 9f2489694..2342a3837 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -45,6 +45,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 631f8ef5c..957b1eb2e 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -49,6 +49,7 @@ cmake \ -DENABLE_MPI=On \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ From 56eba4329fac8bb850af2ab964761a60fa390857 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 25 Oct 2023 15:47:52 -0700 Subject: [PATCH 081/454] make arg checking consistent --- scripts/lc-builds/blueos_clang.sh | 2 +- scripts/lc-builds/blueos_clang_omptarget.sh | 2 +- scripts/lc-builds/blueos_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang_caliper.sh | 2 +- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_xl.sh | 2 +- scripts/lc-builds/blueos_pgi.sh | 2 +- 
scripts/lc-builds/blueos_xl.sh | 2 +- scripts/lc-builds/blueos_xl_omptarget.sh | 2 +- scripts/lc-builds/toss3_clang.sh | 2 +- scripts/lc-builds/toss3_gcc.sh | 2 +- scripts/lc-builds/toss3_hipcc.sh | 2 +- scripts/lc-builds/toss3_icpc.sh | 2 +- scripts/lc-builds/toss3_mvapich2_gcc.sh | 2 +- scripts/lc-builds/toss3_pgi.sh | 2 +- scripts/lc-builds/toss4_clang_caliper.sh | 2 +- scripts/lc-builds/toss4_gcc_caliper.sh | 2 +- scripts/ubuntu-builds/ubuntu_clang.sh | 2 +- scripts/ubuntu-builds/ubuntu_gcc.sh | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index a6fc06451..658af1f53 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_clang.sh 11.0.1" diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 2f7fdf5e9..76c08af4c 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_clang_omptarget.sh 10.0.1-gcc-8.3.1" diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index b51ad749a..9f94fda0c 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_gcc.sh 8.3.1" diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 16364353e..a6332fa54 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index b7cf0f953..f36715c21 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 5 ]]; then +if [[ $# -lt 5 ]]; then echo echo "You must pass 5 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index e8aac3058..3ca718cb2 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 2342a3837..ead4d5a7c 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number for nvcc" diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index c715d1c25..7ccfc3bb5 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_pgi.sh 21.1" diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 5d30ab1ea..971015623 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_xl.sh 2021.03.31" diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 5f972f0dc..809c2fd5c 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " blueos_xl_omptarget.sh 2022.08.19" diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss3_clang.sh index 7406363bc..75fc28c67 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss3_clang.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss3_clang.sh 10.0.1" diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss3_gcc.sh index 4e7bf6bc1..cbc127945 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss3_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss3_gcc.sh 8.3.1" diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss3_hipcc.sh index b5d9b2760..9877ee99a 100755 --- a/scripts/lc-builds/toss3_hipcc.sh +++ b/scripts/lc-builds/toss3_hipcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 2 ]]; then +if [[ $# -lt 2 ]]; then echo echo "You must pass 2 arguments to the script (in this order): " echo " 1) compiler version number" diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh index a8b7de2b9..f5a10cfda 100755 --- a/scripts/lc-builds/toss3_icpc.sh +++ b/scripts/lc-builds/toss3_icpc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss3_icpc.sh 19.1.0" diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss3_mvapich2_gcc.sh index 8c9e0662c..a66a216ca 100755 --- a/scripts/lc-builds/toss3_mvapich2_gcc.sh +++ b/scripts/lc-builds/toss3_mvapich2_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 2 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh index 9967dd769..5207ae816 100755 --- a/scripts/lc-builds/toss3_pgi.sh +++ b/scripts/lc-builds/toss3_pgi.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " toss3_pgi.sh 20.1" diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 273390561..dcfcdb101 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 11fd22605..3499d6bfa 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -ne 3 ]]; then +if [[ $# -lt 3 ]]; then echo echo "You must pass 3 arguments to the script (in this order): " echo " 1) compiler version number" diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 68b722774..4b83ca173 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_clang.sh 10" diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 04c57fce7..663f856d4 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " ubuntu_gcc.sh 8" From 531a184fc2eae44f87bac50de2398c693f8dd4f7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 25 Oct 2023 15:55:30 -0700 Subject: [PATCH 082/454] fix typo --- scripts/lc-builds/blueos_clang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 658af1f53..011ac9522 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -11,7 +11,7 @@ if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " blueos_clang.sh 11.0.1" - echo " -or - " + echo " - or - " echo " blueos_clang.sh ibm-10.0.1-gcc-8.3.1" exit fi From 22fb83358d223ef3bf39a7f79159a5d9eb42c571 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 25 Oct 2023 15:55:43 -0700 Subject: [PATCH 083/454] update ubuntu scripts --- scripts/ubuntu-builds/ubuntu_clang.sh | 4 +++- scripts/ubuntu-builds/ubuntu_gcc.sh | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 4b83ca173..77e8100f1 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/clang_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 663f856d4..741b2fa22 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -22,6 +22,8 @@ RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/ubuntu-builds/gcc_X.cmake echo echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null @@ -39,5 +41,5 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory ${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" echo "***********************************************************************" From b53f6510c833a9b65a1892e7b236210941c24ce4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 25 Oct 2023 16:00:17 -0700 Subject: [PATCH 084/454] add note for libpgmath --- scripts/lc-builds/toss4_cray-mpich_amdclang.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 614f2caec..afd60389f 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -98,10 +98,11 @@ echo echo " module unload rocm" echo " srun -n1 make" echo -echo " Please note that cray-mpich requires libmodules.so.1 from cce to run." +echo " Please note that cray-mpich requires libmodules.so.1 from cce and" +echo " libpgmath.so from rocm/llvm to run." echo " Until this is handled transparently in the build system you may add " -echo " cce to your LD_LIBRARY_PATH." +echo " cce and rocm/llvm to your LD_LIBRARY_PATH." 
echo -echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/cce-tce/cce-13.0.2/cce/x86_64/lib/" +echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/cce-tce/cce-13.0.2/cce/x86_64/lib/:/usr/rocm-5.7.0/llvm/lib" echo echo "***********************************************************************" From eb5a65d450e43b62ad578023203e2f756c75128a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 27 Oct 2023 15:50:37 -0700 Subject: [PATCH 085/454] Use coarse grained hip pinned memory in reducers and fused kernels. This improves performance as it can be cached on device --- src/algorithm/REDUCE_SUM-Hip.cpp | 4 ++-- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 32 ++++++++++++++--------------- src/basic/INDEXLIST-Hip.cpp | 4 ++-- src/basic/INDEXLIST_3LOOP-Hip.cpp | 8 ++++---- src/basic/REDUCE3_INT-Hip.cpp | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 60f1af923..073e32a59 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -82,7 +82,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; Real_type* sum_storage; - allocData(DataSpace::HipPinned, sum_storage, 1); + allocData(DataSpace::HipPinnedCoarse, sum_storage, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -145,7 +145,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, sum_storage); + deallocData(DataSpace::HipPinnedCoarse, sum_storage); } else { diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 6be241d43..bdc168359 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -26,28 +26,28 @@ namespace apps Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_len_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); #define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ - 
deallocData(DataSpace::HipPinned, pack_list_ptrs); \ - deallocData(DataSpace::HipPinned, pack_var_ptrs); \ - deallocData(DataSpace::HipPinned, pack_len_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_len_ptrs); + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index f30bda0c9..def89b8c5 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -257,7 +257,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) const size_t shmem_size = 0; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); Index_type* block_counts; allocData(DataSpace::HipDevice, block_counts, grid_size); Index_type* grid_counts; @@ -282,7 +282,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); deallocData(DataSpace::HipDevice, block_counts); deallocData(DataSpace::HipDevice, grid_counts); deallocData(DataSpace::HipDevice, block_readys); diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 3defd94d1..7c2751bc0 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -74,7 +74,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); hipStream_t stream = res.get_stream(); @@ -147,7 +147,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stopTimer(); deallocData(DataSpace::HipDevice, temp_storage); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; @@ -156,7 +156,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) INDEXLIST_3LOOP_DATA_SETUP_HIP; Index_type* len; - allocData(DataSpace::HipPinned, len, 1); + allocData(DataSpace::HipPinnedCoarse, len, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -187,7 +187,7 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) } stopTimer(); - deallocData(DataSpace::HipPinned, len); + deallocData(DataSpace::HipPinnedCoarse, len); INDEXLIST_3LOOP_DATA_TEARDOWN_HIP; diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 9f429b8e6..528dd4b55 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -89,7 +89,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { Int_ptr vmem_init; - allocData(DataSpace::HipPinned, vmem_init, 3); + allocData(DataSpace::HipPinnedCoarse, vmem_init, 3); Int_ptr vmem; allocData(DataSpace::HipDevice, vmem, 3); @@ -125,7 +125,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) stopTimer(); deallocData(DataSpace::HipDevice, vmem); - 
deallocData(DataSpace::HipPinned, vmem_init); + deallocData(DataSpace::HipPinnedCoarse, vmem_init); } else if ( vid == RAJA_HIP ) { @@ -167,7 +167,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { Int_ptr vmem_init; - allocData(DataSpace::HipPinned, vmem_init, 3); + allocData(DataSpace::HipPinnedCoarse, vmem_init, 3); Int_ptr vmem; allocData(DataSpace::HipDevice, vmem, 3); @@ -208,7 +208,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) stopTimer(); deallocData(DataSpace::HipDevice, vmem); - deallocData(DataSpace::HipPinned, vmem_init); + deallocData(DataSpace::HipPinnedCoarse, vmem_init); } else if ( vid == RAJA_HIP ) { From 539082f924016a4d0b3d96a0e46a98e0e80a69b8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 Nov 2023 14:09:38 -0700 Subject: [PATCH 086/454] Update RAJA version to get hip pinned coarse --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index e78b1eb03..ac4d5e5cd 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit e78b1eb03cbcd9f954c9f54ea79b5f6f479bde45 +Subproject commit ac4d5e5cd00b18cd2b827055b25a904532ba25c0 From 7ee9138a713289b5d7dbbc312e9fcd20cab9019c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 Nov 2023 13:11:09 -0700 Subject: [PATCH 087/454] Update RAJA, BLT, and GitLab CI to match RAJA --- .gitlab/custom-jobs-and-variables.yml | 2 +- .../corona.yml} | 9 ++-- .../lassen.yml} | 21 +++++---- .gitlab/jobs/poodle.yml | 44 +++++++++++++++++++ .../ruby.yml} | 16 +++---- .../tioga.yml} | 6 +-- blt | 2 +- scripts/gitlab/build_and_test.sh | 3 +- tpl/RAJA | 2 +- 9 files changed, 79 insertions(+), 26 deletions(-) rename .gitlab/{corona-build-and-test-extra.yml => jobs/corona.yml} (78%) rename .gitlab/{lassen-build-and-test-extra.yml => jobs/lassen.yml} (77%) create mode 100644 .gitlab/jobs/poodle.yml rename .gitlab/{ruby-build-and-test-extra.yml => jobs/ruby.yml} (71%) rename .gitlab/{tioga-build-and-test-extra.yml => jobs/tioga.yml} (88%) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index a4081efe1..2a1cd57d3 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -50,7 +50,7 @@ variables: # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen - PROJECT_LASSEN_DEPS: "" + PROJECT_LASSEN_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" # Configuration shared by build and test jobs specific to this project. # Not all configuration can be shared. Here projects can fine tune the diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/jobs/corona.yml similarity index 78% rename from .gitlab/corona-build-and-test-extra.yml rename to .gitlab/jobs/corona.yml index 03d67218a..5f46e27c2 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/jobs/corona.yml @@ -13,7 +13,10 @@ # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -# No overridden jobs so far. +rocmcc_5_6_0_hip: + variables: + SPEC: " ~shared +rocm +openmp amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_corona ############ # Extra jobs @@ -23,5 +26,5 @@ # describe the spec here. # With GitLab CI, included files cannot be empty. 
-variables: - INCLUDED_FILE_CANNOT_BE_EMPTY: "True" +#variables: +# INCLUDED_FILE_CANNOT_BE_EMPTY: "True" diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/jobs/lassen.yml similarity index 77% rename from .gitlab/lassen-build-and-test-extra.yml rename to .gitlab/jobs/lassen.yml index f9610a1d1..d4133b4d7 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/jobs/lassen.yml @@ -16,10 +16,10 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" MODULE_LIST: "cuda/11.2.0" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" - extends: .build_and_test_on_lassen + extends: .job_on_lassen ############ @@ -29,10 +29,15 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. +gcc_8_3_1: + variables: + SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_lassen + gcc_8_3_1_cuda_11_5_0_ats_disabled: - extends: .build_and_test_on_lassen + extends: .job_on_lassen variables: - SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers" + SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" MODULE_LIST: "cuda/11.5.0" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" @@ -42,18 +47,18 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\"" - extends: .build_and_test_on_lassen + SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_lassen #clang_14_0_5_asan: # variables: # SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\"" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: # SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers" -# extends: .build_and_test_on_lassen +# extends: .job_on_lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml new file mode 100644 index 000000000..7f0883b32 --- /dev/null +++ b/.gitlab/jobs/poodle.yml @@ -0,0 +1,44 @@ +############################################################################## +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +clang_14_0_6: + variables: + SPEC: " ~shared +openmp +omptask %clang@14.0.6 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_poodle + +gcc_10_3_1: + variables: + SPEC: " ~shared +openmp +omptask %gcc@10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + POODLE_JOB_ALLOC: "--time=40 --nodes=1" + extends: .job_on_poodle + +intel_19_1_2_gcc_10_3_1: + variables: + SPEC: " ~shared +openmp %intel@19.1.2.gcc.10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + POODLE_JOB_ALLOC: "--time=60 --nodes=1" + extends: .job_on_poodle + +intel_2022_1_0: + variables: + SPEC: "${PROJECT_POODLE_VARIANTS} ~shared +openmp %intel@2022.1.0 ${PROJECT_POODLE_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + allow_failure: true + extends: .job_on_poodle + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/jobs/ruby.yml similarity index 71% rename from .gitlab/ruby-build-and-test-extra.yml rename to .gitlab/jobs/ruby.yml index 965142c5f..6b9196b85 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/jobs/ruby.yml @@ -16,26 +16,26 @@ # Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask %clang@14.0.6" - extends: .build_and_test_on_ruby + SPEC: " ~shared +openmp +omptask %clang@14.0.6 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask %gcc@10.3.1" + SPEC: " ~shared +openmp +omptask %gcc@10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby intel_19_1_2_gcc_10_3_1: variables: - SPEC: " +openmp %intel@19.1.2.gcc.10.3.1" + SPEC: " ~shared +openmp %intel@19.1.2.gcc.10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" - extends: .build_and_test_on_ruby + extends: .job_on_ruby intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" allow_failure: true - extends: .build_and_test_on_ruby + extends: .job_on_ruby ############ # Extra jobs diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/jobs/tioga.yml similarity index 88% rename from .gitlab/tioga-build-and-test-extra.yml rename to .gitlab/jobs/tioga.yml index d3d054b4a..1cf05e4e5 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/jobs/tioga.yml @@ -22,7 +22,7 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
-rocmcc_5_4_3_hip_openmp:
+rocmcc_5_6_0_hip_openmp:
   variables:
-    SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@develop"
-  extends: .build_and_test_on_tioga
+    SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+  extends: .job_on_tioga
diff --git a/blt b/blt
index 5a792c177..a7f0a6ecc 160000
--- a/blt
+++ b/blt
@@ -1 +1 @@
-Subproject commit 5a792c1775e7a7628d84dcde31652a689f1df7b5
+Subproject commit a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81
diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index 850ad80d7..c6eec55ac 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -163,7 +163,7 @@ then
   echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

   # Map CPU core allocations
-  declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32)
+  declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["poodle"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32)

   # If using Multi-project, set up the submodule
   if [[ -n ${raja_version} ]]
@@ -214,6 +214,7 @@ fi

 cd ${build_dir}

+date
 echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 echo "~~~~~ TESTING RAJAPERF SUITE"
 echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
diff --git a/tpl/RAJA b/tpl/RAJA
index e78b1eb03..ac4d5e5cd 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit e78b1eb03cbcd9f954c9f54ea79b5f6f479bde45
+Subproject commit ac4d5e5cd00b18cd2b827055b25a904532ba25c0

From 4dd8d2cc0da9dcccfc5aa215a838b931f72b374c Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 3 Nov 2023 13:31:48 -0700
Subject: [PATCH 088/454] Remove remaining loop_exec policy
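RAJA deprecated loop_exec in favor of seq_exec for strictly sequential
host execution, so the one remaining use in the Sequential variant is
switched over. A sketch of the replacement pattern, using an
illustrative daxpy lambda rather than the suite's edge3d_lam:

    // Assumes RAJA's seq_exec policy and RangeSegment; x, y, a, and
    // the loop bounds here are illustrative.
    RAJA::forall<RAJA::seq_exec>(
        RAJA::RangeSegment(ibegin, iend), [=](RAJA::Index_type i) {
      y[i] += a * x[i];
    });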
---
 src/apps/EDGE3D-Seq.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp
index 6658650b1..658064427 100644
--- a/src/apps/EDGE3D-Seq.cpp
+++ b/src/apps/EDGE3D-Seq.cpp
@@ -70,7 +70,7 @@ void EDGE3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::forall<RAJA::loop_exec>(
+        RAJA::forall<RAJA::seq_exec>(
           RAJA::RangeSegment(ibegin, iend), edge3d_lam);

       }

From 318c4ee48792a8ea6e236028f06b9edd3766d1c9 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 3 Nov 2023 14:16:50 -0700
Subject: [PATCH 089/454] Update and fix CI file

---
 .gitlab/subscribed-pipelines.yml | 41 ++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml
index 108e84a54..28cb44c31 100644
--- a/.gitlab/subscribed-pipelines.yml
+++ b/.gitlab/subscribed-pipelines.yml
@@ -30,6 +30,27 @@
 # Comment the jobs for machines you don’t need.
 ###

+# One job to generate the job list for all the subpipelines
+generate-job-lists:
+  stage: prerequisites
+  tags: [shell, oslic]
+  variables:
+    RADIUSS_JOBS_PATH: "scripts/radiuss-spack-configs/gitlab/radiuss-jobs"
+    LOCAL_JOBS_PATH: ".gitlab/jobs"
+  script:
+    - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml
+    - cat ${RADIUSS_JOBS_PATH}/poodle.yml ${LOCAL_JOBS_PATH}/poodle.yml > poodle-jobs.yml
+    - cat ${RADIUSS_JOBS_PATH}/lassen.yml ${LOCAL_JOBS_PATH}/lassen.yml > lassen-jobs.yml
+    - cat ${RADIUSS_JOBS_PATH}/corona.yml ${LOCAL_JOBS_PATH}/corona.yml > corona-jobs.yml
+    - cat ${RADIUSS_JOBS_PATH}/tioga.yml ${LOCAL_JOBS_PATH}/tioga.yml > tioga-jobs.yml
+  artifacts:
+    paths:
+      - ruby-jobs.yml
+      - poodle-jobs.yml
+      - lassen-jobs.yml
+      - corona-jobs.yml
+      - tioga-jobs.yml
+
 # RUBY
 ruby-up-check:
   variables:
@@ -39,7 +60,19 @@ ruby-up-check:
 ruby-build-and-test:
   variables:
     CI_MACHINE: "ruby"
-  needs: [ruby-up-check]
+  needs: [ruby-up-check, generate-job-lists]
   extends: [.build-and-test]

+# POODLE
+poodle-up-check:
+  variables:
+    CI_MACHINE: "poodle"
+  extends: [.machine-check]
+
+poodle-build-and-test:
+  variables:
+    CI_MACHINE: "poodle"
+  needs: [poodle-up-check, generate-job-lists]
+  extends: [.build-and-test]
+
 # CORONA
 corona-up-check:
@@ -51,7 +84,7 @@ corona-up-check:
 corona-build-and-test:
   variables:
     CI_MACHINE: "corona"
-  needs: [corona-up-check]
+  needs: [corona-up-check, generate-job-lists]
   extends: [.build-and-test]

 # TIOGA
@@ -63,7 +96,7 @@ tioga-up-check:
 tioga-build-and-test:
   variables:
     CI_MACHINE: "tioga"
-  needs: [tioga-up-check]
+  needs: [tioga-up-check, generate-job-lists]
   extends: [.build-and-test]

 # LASSEN
@@ -75,7 +108,7 @@ lassen-up-check:
 lassen-build-and-test:
   variables:
     CI_MACHINE: "lassen"
-  needs: [lassen-up-check]
+  needs: [lassen-up-check, generate-job-lists]
   extends: [.build-and-test]

From 330b467c0e88c21390d64b00898f0dea632f18 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 3 Nov 2023 14:48:58 -0700
Subject: [PATCH 090/454] One more fix

---
 .gitlab/subscribed-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml
index 28cb44c31..f7177c037 100644
--- a/.gitlab/subscribed-pipelines.yml
+++ b/.gitlab/subscribed-pipelines.yml
@@ -9,7 +9,7 @@
 # The template job to test whether a machine is up.
 # Expects CI_MACHINE defined to machine name.
 .machine-check:
-  stage: machine-checks
+  stage: prerequisites
   tags: [shell, oslic]
   variables:
     GIT_STRATEGY: none

From 7b11de656f05c65977db335a4bd01d7377bd4258 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com>
Date: Fri, 3 Nov 2023 23:13:55 +0100
Subject: [PATCH 091/454] Small fix

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e7997e0d1..f08da2522 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -51,7 +51,7 @@ variables:

 # High level stages
 stages:
-  - machine-checks
+  - prerequisites
   - build-and-test

From 95c70f5007d660bc4a4d7ddfeab408db1fbdf293 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 3 Nov 2023 15:23:09 -0700
Subject: [PATCH 092/454] Another fix...
--- .gitlab/subscribed-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index f7177c037..d024be859 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -35,7 +35,7 @@ generate-job-lists: stage: prerequisites tags: [shell, oslic] variables: - RADIUSS_JOBS_PATH: "scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" LOCAL_JOBS_PATH: ".gitlab/jobs" script: - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml From a903d2eebcdf6e0495badffc4061260650b50cf1 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 3 Nov 2023 23:23:52 +0100 Subject: [PATCH 093/454] Fix path to access shared jobs in radiuss-spack-configs submodule --- .gitlab/subscribed-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index f7177c037..d024be859 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -35,7 +35,7 @@ generate-job-lists: stage: prerequisites tags: [shell, oslic] variables: - RADIUSS_JOBS_PATH: "scripts/radiuss-spack-configs/gitlab/radiuss-jobs" + RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" LOCAL_JOBS_PATH: ".gitlab/jobs" script: - cat ${RADIUSS_JOBS_PATH}/ruby.yml ${LOCAL_JOBS_PATH}/ruby.yml > ruby-jobs.yml From 0aa42d17b685495e8683869b64675f94e1f35581 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 6 Nov 2023 09:40:17 +0100 Subject: [PATCH 094/454] Speed up clone with targeted submodules clones --- .gitlab/subscribed-pipelines.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index d024be859..38b481c97 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -35,6 +35,9 @@ generate-job-lists: stage: prerequisites tags: [shell, oslic] variables: + GIT_SUBMODULE_DEPTH: 2 + GIT_SUBMODULE_STRATEGY: recursive + GIT_SUBMODULE_PATHS: tpl/RAJA RADIUSS_JOBS_PATH: "tpl/RAJA/scripts/radiuss-spack-configs/gitlab/radiuss-jobs" LOCAL_JOBS_PATH: ".gitlab/jobs" script: From 4072e0c605bd288473dd19a033756003e8e21243 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 6 Nov 2023 10:45:01 +0100 Subject: [PATCH 095/454] Complete the update to radiuss-shared-ci 2023.10 --- .gitlab-ci.yml | 60 ++++++++++++++++----------- .gitlab/custom-jobs-and-variables.yml | 26 ++++++++---- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f08da2522..88bb385d7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,47 +6,56 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +# DESCRIPTION: ############################################################################### # General GitLab pipelines configurations for supercomputers and Linux clusters # at Lawrence Livermore National Laboratory (LLNL). -# # This entire pipeline is LLNL-specific # -# Important note: This file is a template provided by -# llnl/radiuss-shared-ci. 
Changes needed consists in setting variable values, -# change the reference to the radiuss-shared-ci repo, opt-in and out optional -# features. The project can then extend it with additional stages. +# Important note: This file is a template provided by llnl/radiuss-shared-ci. +# Remains to set variable values, change the reference to the radiuss-shared-ci +# repo, opt-in and out optional features. The project can then extend it with +# additional stages. # -# However, each project should provide: +# In addition, each project should copy over and complete: # - .gitlab/custom-jobs-and-variables.yml # - .gitlab/subscribed-pipelines.yml -# - .gitlab/${MACHINE}-build-and-test-extra.yml +# +# The jobs should be specified in a file local to the project, +# - .gitlab/jobs/${CI_MACHINE}.yml +# or generated (see LLNL/Umpire for an example). ############################################################################### # We define the following GitLab pipeline variables: variables: -# Required information about GitHub repository - GITHUB_PROJECT_NAME: "RAJAPerf" - GITHUB_PROJECT_ORG: "LLNL" -# Use the umdev service user to run CI. This prevents from running pipelines as -# an actual user. +##### LC GITLAB CONFIGURATION +# Use a LLNL service user to run CI. This prevents from running pipelines as an +# actual user. LLNL_SERVICE_USER: rajasa # Use the service user workspace. Solves permission issues, stores everything # at the same location whoever triggers a pipeline. # CUSTOM_CI_BUILDS_DIR: "" # Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive -# We build the projects in the CI clone directory. -# TODO: add a clean-up mechanism + +##### PROJECT VARIABLES +# We build the projects in the CI clone directory (used in +# script/gitlab/build_and_test.sh script). +# TODO: add a clean-up mechanism. BUILD_ROOT: ${CI_PROJECT_DIR} + +##### SHARED_CI CONFIGURATION +# Required information about GitHub repository + GITHUB_PROJECT_NAME: "RAJAPerf" + GITHUB_PROJECT_ORG: "LLNL" # Set the build-and-test command. - BUILD_AND_TEST_CMD: "./scripts/gitlab/build_and_test.sh" -# Override the pattern describing branches that will skip the "draft PR test". -# Add protected branches here. See default value in + JOB_CMD: "./scripts/gitlab/build_and_test.sh" +# Override the pattern describing branches that will skip the "draft PR filter +# test". Add protected branches here. See default value in # preliminary-ignore-draft-pr.yml. # ALWAYS_RUN_PATTERN: "^develop$|^main$|^v[0-9.]*-RC$" -# We organize the build-and-test stage in sub-pipelines. Each sub-pipeline +# We organize the build-and-test stage with sub-pipelines. Each sub-pipeline # corresponds to a test batch on a given machine. 
# High level stages @@ -54,24 +63,25 @@ stages: - prerequisites - build-and-test -# Template for jobs triggering a build-and-test sub-pipelines: +# Template for jobs triggering a build-and-test sub-pipeline: .build-and-test: stage: build-and-test trigger: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.08.0 - file: '${CI_MACHINE}-build-and-test.yml' - - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + ref: 'v2023.10.0' + file: 'pipelines/${CI_MACHINE}.yml' + - artifact: '${CI_MACHINE}-jobs.yml' + job: 'generate-job-lists' strategy: depend forward: pipeline_variables: true include: - # checks preliminary to running the actual CI test (optional) + # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: v2023.03.1 - # file: 'preliminary-ignore-draft-pr.yml' + # ref: 'v2023.10.0' + # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 2a1cd57d3..3cbdce259 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -15,19 +15,29 @@ variables: # Ruby # Arguments for top level allocation - RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" # Arguments for job level allocation - RUBY_BUILD_AND_TEST_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" + RUBY_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby PROJECT_RUBY_DEPS: "" +# Poodle +# Arguments for top level allocation + POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=30 --nodes=1" +# Arguments for job level allocation + POODLE_JOB_ALLOC: "--overlap --time=28 --nodes=1" +# Project specific variants for poodle + PROJECT_POODLE_VARIANTS: "~shared +openmp" +# Project specific deps for poodle + PROJECT_POODLE_DEPS: "" + # Corona # Arguments for top level allocation - CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + CORONA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation - CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" + CORONA_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -35,9 +45,9 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation - TIOGA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -46,7 +56,7 @@ variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. 
# Arguments for job level allocation - LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 30" + LASSEN_JOB_ALLOC: "1 -W 30" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen @@ -56,7 +66,7 @@ variables: # Not all configuration can be shared. Here projects can fine tune the # CI behavior. # See Umpire for an example (export junit test reports). -.custom_build_and_test: +.custom_job: artifacts: reports: junit: junit.xml From 5e36df2ca827d06bf599b0af14102289c865f1c2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 Nov 2023 11:31:28 -0800 Subject: [PATCH 096/454] Centralize BLT version for lassen per review comment. --- .gitlab/jobs/lassen.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index d4133b4d7..e5e4e8436 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -16,7 +16,7 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" extends: .job_on_lassen @@ -31,13 +31,13 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: gcc_8_3_1: variables: - SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp %gcc@8.3.1 ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" @@ -47,7 +47,7 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen #clang_14_0_5_asan: From c0de7aaf75b3090bdaf04e7c8a51f2b8da42c4c2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 Nov 2023 12:03:54 -0800 Subject: [PATCH 097/454] Clean up CI redundancies for poodle CI --- .gitlab/custom-jobs-and-variables.yml | 2 +- .gitlab/jobs/poodle.yml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 3cbdce259..0ac5d1df5 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -31,7 +31,7 @@ variables: # Project 
specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle - PROJECT_POODLE_DEPS: "" + PROJECT_POODLE_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" # Corona # Arguments for top level allocation diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 7f0883b32..97de241a8 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -15,24 +15,24 @@ clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask %clang@14.0.6 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask %gcc@10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_POODLE_DEPS}" POODLE_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_poodle intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp %intel@19.1.2.gcc.10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" POODLE_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_poodle intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} ~shared +openmp %intel@2022.1.0 ${PROJECT_POODLE_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS} ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle From ffc40094dc2359c07be5fe835cfd03b6fb06dd8f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 Nov 2023 13:23:33 -0800 Subject: [PATCH 098/454] Follow conventions in specs for overridden vs. extra jobs --- .gitlab/custom-jobs-and-variables.yml | 10 +++++----- .gitlab/jobs/corona.yml | 9 +++++---- .gitlab/jobs/lassen.yml | 6 +++--- .gitlab/jobs/poodle.yml | 2 +- .gitlab/jobs/ruby.yml | 8 ++++---- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 0ac5d1df5..ab8c360df 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -21,7 +21,7 @@ variables: # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" # Poodle # Arguments for top level allocation @@ -41,17 +41,17 @@ variables: # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona - PROJECT_CORONA_DEPS: "^blt@develop " + PROJECT_CORONA_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" # Tioga # Arguments for top level allocation TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" # Arguments for job level allocation TIOGA_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" -# Project specific variants for corona +# Project specific variants for tioga PROJECT_TIOGA_VARIANTS: "~shared ~openmp" -# Project specific deps for corona - PROJECT_TIOGA_DEPS: "^blt@develop " +# Project specific deps for tioga + PROJECT_TIOGA_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. 
diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index 5f46e27c2..c0c163494 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -15,7 +15,7 @@ rocmcc_5_6_0_hip: variables: - SPEC: " ~shared +rocm +openmp amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_CORONA_VARIANTS} +rocm amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ${PROJECT_CORONA_DEPS}" extends: .job_on_corona ############ @@ -25,6 +25,7 @@ rocmcc_5_6_0_hip: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -# With GitLab CI, included files cannot be empty. -#variables: -# INCLUDED_FILE_CANNOT_BE_EMPTY: "True" +rocmcc_5_6_0_hip_openmp: + variables: + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + extends: .job_on_corona diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index e5e4e8436..7d5c492ab 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -31,13 +31,13 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: gcc_8_3_1: variables: - SPEC: " ~shared +openmp %gcc@8.3.1 ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" MODULE_LIST: "cuda/11.5.0" LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 30" @@ -47,7 +47,7 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" extends: .job_on_lassen #clang_14_0_5_asan: diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 97de241a8..286677598 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -32,7 +32,7 @@ intel_19_1_2_gcc_10_3_1: intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS} ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 6b9196b85..c265fbf45 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -16,24 +16,24 @@ # Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask %clang@14.0.6 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask %gcc@10.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_ruby intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp %intel@19.1.2.gcc.10.3.1 
^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_ruby intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" allow_failure: true extends: .job_on_ruby From f493311a2c9b266504c06ee0ca9b0e999afca223 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 Nov 2023 15:13:05 -0800 Subject: [PATCH 099/454] Restore corona jobs to what was there before --- .gitlab/jobs/corona.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index c0c163494..b7c5fe5b7 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -25,7 +25,6 @@ rocmcc_5_6_0_hip: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -rocmcc_5_6_0_hip_openmp: - variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" - extends: .job_on_corona +# With GitLab CI, included files cannot be empty. +variables: + INCLUDED_FILE_CANNOT_BE_EMPTY: "True" From d03a5901f144caf25b4f8224e91d44516669572c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 7 Nov 2023 15:40:31 +0100 Subject: [PATCH 100/454] Fine tune allocation duration --- .gitlab/custom-jobs-and-variables.yml | 21 ++++++++++++--------- .gitlab/jobs/poodle.yml | 2 -- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index ab8c360df..e29dfc80f 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -15,9 +15,9 @@ variables: # Ruby # Arguments for top level allocation - RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=45 --nodes=1" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=14 --nodes=2" # Arguments for job level allocation - RUBY_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=30 --nodes=1" + RUBY_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=6 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby @@ -25,9 +25,12 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=30 --nodes=1" +# Optimization notes: We have 4 jobs lasting at max 5 minutes and using 28 +# cores out of 112 available (see -j in scripts/gitlab/build_and_test.sh). +# We allow allocation overlapping. 
+ POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=12 --nodes=1" # Arguments for job level allocation - POODLE_JOB_ALLOC: "--overlap --time=28 --nodes=1" + POODLE_JOB_ALLOC: "--overlap --time=5 --nodes=1" # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle @@ -35,9 +38,9 @@ variables: # Corona # Arguments for top level allocation - CORONA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + CORONA_SHARED_ALLOC: "--exclusive --time-limit=10m --nodes=1" # Arguments for job level allocation - CORONA_JOB_ALLOC: "--time-limit=30m --nodes=1 --begin-time=+5s" + CORONA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona @@ -45,9 +48,9 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1" + TIOGA_SHARED_ALLOC: "--exclusive --time-limit=26m --nodes=1" # Arguments for job level allocation - TIOGA_JOB_ALLOC: "--time-limit=45m --nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s" # Project specific variants for tioga PROJECT_TIOGA_VARIANTS: "~shared ~openmp" # Project specific deps for tioga @@ -56,7 +59,7 @@ variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 30" + LASSEN_JOB_ALLOC: "1 -W 15" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 286677598..9e56823e8 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -21,13 +21,11 @@ clang_14_0_6: gcc_10_3_1: variables: SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_POODLE_DEPS}" - POODLE_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_poodle intel_19_1_2_gcc_10_3_1: variables: SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" - POODLE_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_poodle intel_2022_1_0: From 1db07ffa729fd406a417da68412020f81073e745 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 7 Nov 2023 16:30:28 +0100 Subject: [PATCH 101/454] Give more time for each overlapping job --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index e29dfc80f..9ed913ec6 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -30,7 +30,7 @@ variables: # We allow allocation overlapping. 
POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=12 --nodes=1" # Arguments for job level allocation - POODLE_JOB_ALLOC: "--overlap --time=5 --nodes=1" + POODLE_JOB_ALLOC: "--overlap --time=10 --nodes=1" # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle From 2053ce4b6fc03a26ccbea0b1373ebe5a788c2f37 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 8 Nov 2023 10:34:41 -0800 Subject: [PATCH 102/454] Update compiler version for azure sycl checks --- Dockerfile | 2 +- azure-pipelines.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9d7f6b197..ab57aa378 100644 --- a/Dockerfile +++ b/Dockerfile @@ -111,7 +111,7 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ## make -j 6 && \ ## cd .. && rm -rf build -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2023.2.1 AS sycl ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build diff --git a/azure-pipelines.yml b/azure-pipelines.yml index da8637d19..673866920 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -56,8 +56,8 @@ jobs: ## docker_target: nvcc11.1.1-debug ## hip5.1.3: ## docker_target: hip5.1.3 -## sycl: -## docker_target: sycl + sycl: + docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: From 6ca1f22419e0b0eab2b8bc5737b12b9f0b9e82cb Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 9 Nov 2023 12:38:07 -0800 Subject: [PATCH 103/454] Fix indentation --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 673866920..41f9c0cd7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -56,8 +56,8 @@ jobs: ## docker_target: nvcc11.1.1-debug ## hip5.1.3: ## docker_target: hip5.1.3 - sycl: - docker_target: sycl + sycl: + docker_target: sycl pool: vmImage: 'ubuntu-latest' variables: From c3798be906b108d20506dcff741980e5b1fc7f64 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 9 Nov 2023 14:48:24 -0800 Subject: [PATCH 104/454] Add CudaManaged DataSpaces with advice --- src/common/CudaDataUtils.hpp | 90 +++++++++++++++++++++++++++++++++--- src/common/DataUtils.cpp | 40 ++++++++++++++++ src/common/RAJAPerfSuite.cpp | 8 ++++ src/common/RAJAPerfSuite.hpp | 4 ++ 4 files changed, 136 insertions(+), 6 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 4f5741e39..305937844 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -131,7 +131,7 @@ inline void copyCudaData(void* dst_ptr, const void* src_ptr, Size_type len) } /*! - * \brief Allocate CUDA device data array (dptr). + * \brief Allocate CUDA device data array. */ inline void* allocCudaDeviceData(Size_type len) { @@ -141,7 +141,7 @@ inline void* allocCudaDeviceData(Size_type len) } /*! - * \brief Allocate CUDA managed data array (dptr). + * \brief Allocate CUDA managed data array. */ inline void* allocCudaManagedData(Size_type len) { @@ -151,7 +151,53 @@ inline void* allocCudaManagedData(Size_type len) } /*! - * \brief Allocate CUDA pinned data array (pptr). + * \brief Allocate CUDA managed host preferred data array. + */ +inline void* allocCudaManagedHostPreferredData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + return mptr; +} + +/*! 
+ * \brief Allocate CUDA managed device preferred data array. + */ +inline void* allocCudaManagedDevicePreferredData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed host preferred device accessed data array. + */ +inline void* allocCudaManagedHostPreferredDeviceAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, getCudaDevice() ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA managed device preferred host accessed data array. + */ +inline void* allocCudaManagedDevicePreferredHostAccessedData(Size_type len) +{ + void* mptr = nullptr; + cudaErrchk( cudaMallocManaged( &mptr, len, cudaMemAttachGlobal ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetPreferredLocation, getCudaDevice() ) ); + cudaErrchk( cudaMemAdvise( mptr, len, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) ); + return mptr; +} + +/*! + * \brief Allocate CUDA pinned data array. */ inline void* allocCudaPinnedData(Size_type len) { @@ -162,7 +208,7 @@ /*! - * \brief Free device data array. + * \brief Free CUDA device data array. */ inline void deallocCudaDeviceData(void* dptr) { @@ -170,7 +216,7 @@ } /*! - * \brief Free managed data array. + * \brief Free CUDA managed data array. */ inline void deallocCudaManagedData(void* mptr) { @@ -178,7 +224,39 @@ } /*! - * \brief Free pinned data array. + * \brief Free CUDA managed host preferred data array. + */ +inline void deallocCudaManagedHostPreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred data array. + */ +inline void deallocCudaManagedDevicePreferredData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed host preferred device accessed data array. + */ +inline void deallocCudaManagedHostPreferredDeviceAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA managed device preferred host accessed data array. + */ +inline void deallocCudaManagedDevicePreferredHostAccessedData(void* mptr) +{ + cudaErrchk( cudaFree( mptr ) ); +} + +/*! + * \brief Free CUDA pinned data array.
*/ inline void deallocCudaPinnedData(void* pptr) { diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 7062f689c..adb757012 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -72,6 +72,10 @@ bool isCudaDataSpace(DataSpace dataSpace) switch (dataSpace) { case DataSpace::CudaPinned: case DataSpace::CudaManaged: + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + case DataSpace::CudaManagedDevicePreferredHostAccessed: case DataSpace::CudaDevice: return true; default: @@ -186,6 +190,22 @@ void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) { ptr = detail::allocCudaManagedData(nbytes); } break; + case DataSpace::CudaManagedHostPreferred: + { + ptr = detail::allocCudaManagedHostPreferredData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + ptr = detail::allocCudaManagedDevicePreferredData(nbytes); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + ptr = detail::allocCudaManagedHostPreferredDeviceAccessedData(nbytes); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + ptr = detail::allocCudaManagedDevicePreferredHostAccessedData(nbytes); + } break; case DataSpace::CudaDevice: { ptr = detail::allocCudaDeviceData(nbytes); @@ -329,6 +349,22 @@ void deallocData(DataSpace dataSpace, void* ptr) { detail::deallocCudaManagedData(ptr); } break; + case DataSpace::CudaManagedHostPreferred: + { + detail::deallocCudaManagedHostPreferredData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferred: + { + detail::deallocCudaManagedDevicePreferredData(ptr); + } break; + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + { + detail::deallocCudaManagedHostPreferredDeviceAccessedData(ptr); + } break; + case DataSpace::CudaManagedDevicePreferredHostAccessed: + { + detail::deallocCudaManagedDevicePreferredHostAccessedData(ptr); + } break; case DataSpace::CudaDevice: { detail::deallocCudaDeviceData(ptr); @@ -581,9 +617,13 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace) return DataSpace::Host; case DataSpace::CudaManaged: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedDevicePreferredHostAccessed: case DataSpace::CudaDevice: return DataSpace::CudaPinned; + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: case DataSpace::HipManaged: case DataSpace::HipManagedAdviseFine: case DataSpace::HipManagedAdviseCoarse: diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 085c058c4..359e7a439 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -348,6 +348,10 @@ static const std::string DataSpaceNames [] = std::string("CudaPinned"), std::string("CudaManaged"), + std::string("CudaManagedHostPreferred"), + std::string("CudaManagedDevicePreferred"), + std::string("CudaManagedHostPreferredDeviceAccessed"), + std::string("CudaManagedDevicePreferredHostAccessed"), std::string("CudaDevice"), std::string("HipHostAdviseFine"), @@ -595,6 +599,10 @@ bool isDataSpaceAvailable(DataSpace dataSpace) #if defined(RAJA_ENABLE_CUDA) case DataSpace::CudaPinned: case DataSpace::CudaManaged: + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: + case DataSpace::CudaManagedDevicePreferredHostAccessed: case DataSpace::CudaDevice: ret_val = true; break; #endif diff --git 
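
The four advice-based managed allocators added above differ only in the cudaMemAdvise calls issued after cudaMallocManaged. As a point of reference, here is a minimal standalone sketch of the most involved combination, host preferred plus device accessed. It is illustrative only, not code from the patch; the array size and the host loop are arbitrary, and error checking is elided.

#include <cuda_runtime.h>

int main()
{
  const size_t N = 1 << 20;                  // arbitrary element count
  const size_t bytes = N * sizeof(double);
  int dev = 0;
  cudaGetDevice(&dev);

  double* data = nullptr;
  cudaMallocManaged(&data, bytes, cudaMemAttachGlobal);
  // Keep the physical pages resident on the host...
  cudaMemAdvise(data, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  // ...but establish a device mapping so GPU accesses read/write the pages
  // remotely instead of faulting them back and forth.
  cudaMemAdvise(data, bytes, cudaMemAdviseSetAccessedBy, dev);

  for (size_t i = 0; i < N; ++i) { data[i] = 0.0; }  // host writes stay local

  cudaFree(data);
  return 0;
}
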
a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3270a4090..bd037bcc3 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -266,6 +266,10 @@ enum struct DataSpace { CudaPinned, CudaManaged, + CudaManagedHostPreferred, + CudaManagedDevicePreferred, + CudaManagedHostPreferredDeviceAccessed, + CudaManagedDevicePreferredHostAccessed, CudaDevice, HipHostAdviseFine, From c75f8ff8f7870b1e111d0f76d74dcb0b9192698e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 10 Nov 2023 15:05:33 -0800 Subject: [PATCH 105/454] Run SYCL tests on azure --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ab57aa378..05f177748 100644 --- a/Dockerfile +++ b/Dockerfile @@ -118,5 +118,5 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ./bin/raja-perf.exe --checkrun 5 -sp" && \ + ctest -T test --output-on-failure && \ cd .. && rm -rf build From 633e0ed33aee7bf377374d3ff086b88ebbddcaf9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 10 Nov 2023 15:19:10 -0800 Subject: [PATCH 106/454] Add missing quote --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 05f177748..60ddaeb98 100644 --- a/Dockerfile +++ b/Dockerfile @@ -118,5 +118,5 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ctest -T test --output-on-failure && \ + ctest -T test --output-on-failure" && \ cd .. && rm -rf build From 52d9be769f75e7fe11c02f2f514daad493cc05a2 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 13 Nov 2023 09:32:38 +0100 Subject: [PATCH 107/454] Comment alloc command choices --- .gitlab/custom-jobs-and-variables.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 9ed913ec6..9c845e6a6 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -17,6 +17,7 @@ variables: # Arguments for top level allocation RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=14 --nodes=2" # Arguments for job level allocation +# Note: We repeat the reservation, necessary when jobs are manually re-triggered. RUBY_JOB_ALLOC: "--reservation=ci --qos=ci_ruby --time=6 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" From 0158ae7a036b841fb21b9dd745c3af26f1cdc91d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 14 Nov 2023 13:33:28 -0800 Subject: [PATCH 108/454] Attempt to squash build warnings --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 60ddaeb98..eacec59a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. 
&& \ + cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-fsycl" -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ ctest -T test --output-on-failure" && \ cd .. && rm -rf build From b8b25c727f9669cb347ae6a5fec7c2ec810324c7 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 14 Nov 2023 15:00:54 -0800 Subject: [PATCH 109/454] Try enabling tests in sycl build --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index eacec59a3..265e4ecde 100644 --- a/Dockerfile +++ b/Dockerfile @@ -111,12 +111,12 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ## make -j 6 && \ ## cd .. && rm -rf build -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2023.2.1 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-23.04:intel-2023.2.1 AS sycl ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-fsycl" -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS .. && \ make -j 6 &&\ ctest -T test --output-on-failure" && \ cd .. && rm -rf build From b60c9d20cb443f2ec398857d5888fdd35601d298 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 14 Nov 2023 15:11:05 -0800 Subject: [PATCH 110/454] Fix syntax error --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 265e4ecde..b4150dc55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure" && \ cd .. && rm -rf build From 0983f14535c4b9e64426ddbf1eb10e3768f654e6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 14 Nov 2023 15:46:36 -0800 Subject: [PATCH 111/454] Change CMake option --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b4150dc55..ef0ca313f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure" && \ cd .. && rm -rf build From 07a0c543df77b513cd1728f93d9d2dd9d2b34e41 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:46:22 +0100 Subject: [PATCH 112/454] Adapt job alloc to overlap situation --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index ff06b9b43..e445a907a 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -18,7 +18,7 @@ variables: RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=14 --nodes=2" # Arguments for job level allocation # Note: We repeat the reservation, necessary when jobs are manually re-triggered. - RUBY_JOB_ALLOC: "--overlap --reservation=ci --qos=ci_ruby --time=6 --nodes=1" + RUBY_JOB_ALLOC: "--overlap --reservation=ci --qos=ci_ruby --time=12 --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby From cb4159eb617cfbfe8bcfa4e18251cafbac2ae555 Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Wed, 15 Nov 2023 17:11:41 +0000 Subject: [PATCH 113/454] Fix syntax for SYCL build. Initial work updating memory management for SYCL --- src/basic/DAXPY-Sycl.cpp | 22 +++++----- src/basic/IF_QUAD-Sycl.cpp | 2 +- src/basic/INIT3-Sycl.cpp | 2 +- src/basic/INIT_VIEW1D-Sycl.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 2 +- src/basic/MULADDSUB-Sycl.cpp | 2 +- src/basic/NESTED_INIT-Sycl.cpp | 2 +- src/basic/REDUCE3_INT-Sycl.cpp | 2 +- src/basic/REDUCE3_INT.hpp | 2 + src/basic/TRAP_INT-Sycl.cpp | 2 +- src/basic/TRAP_INT.hpp | 2 + src/common/DataUtils.cpp | 34 +++++++++++++++ src/common/KernelBase.hpp | 16 +++++++ src/common/RAJAPerfSuite.cpp | 10 +++++ src/common/RAJAPerfSuite.hpp | 3 ++ src/common/RunParams.cpp | 8 +++- src/common/RunParams.hpp | 2 + src/common/SyclDataUtils.hpp | 62 ++++++++++++++++++++++++++- 18 files changed, 157 insertions(+), 20 deletions(-) diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index f436d966e..880c7e455 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -28,11 +28,11 @@ namespace rajaperf namespace basic { -#define DAXPY_DATA_SETUP_SYCL \ +//#define DAXPY_DATA_SETUP_SYCL \ allocAndInitSyclDeviceData(x, m_x, iend, qu); \ allocAndInitSyclDeviceData(y, m_y, iend, qu); -#define DAXPY_DATA_TEARDOWN_SYCL \ +//#define DAXPY_DATA_TEARDOWN_SYCL \ getSyclDeviceData(m_y, y, iend, qu); \ deallocSyclDeviceData(x, qu); \ deallocSyclDeviceData(y, qu); @@ -44,12 +44,14 @@ void DAXPY::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + DAXPY_DATA_SETUP; if ( vid == Base_SYCL ) { if (work_group_size > 0) { - DAXPY_DATA_SETUP_SYCL; +// DAXPY_DATA_SETUP_SYCL; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -72,10 +74,10 @@ void DAXPY::runSyclVariantImpl(VariantID vid) stopTimer(); - DAXPY_DATA_TEARDOWN_SYCL; +// DAXPY_DATA_TEARDOWN_SYCL; } else { - DAXPY_DATA_SETUP_SYCL; +// DAXPY_DATA_SETUP_SYCL; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -94,7 +96,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) stopTimer(); - DAXPY_DATA_TEARDOWN_SYCL; +// DAXPY_DATA_TEARDOWN_SYCL; } } else if ( vid == RAJA_SYCL ) { @@ -104,7 +106,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) return; } - DAXPY_DATA_SETUP_SYCL; +// DAXPY_DATA_SETUP_SYCL; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,17 +120,17 @@ void 
DAXPY::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - DAXPY_DATA_TEARDOWN_SYCL; +// DAXPY_DATA_TEARDOWN_SYCL; } else { std::cout << "\n DAXPY : Unknown Sycl variant id = " << vid << std::endl; } } - -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(DAXPY, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DAXPY, Sycl) } // end namespace basic } // end namespace rajaperf + #endif // RAJA_ENABLE_SYCL diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 166a85bf9..e52e1714a 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -133,7 +133,7 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(IF_QUAD, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(IF_QUAD, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index ab2e59686..6ea52b4dd 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -131,7 +131,7 @@ void INIT3::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT3, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT3, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index 5ea1c5399..699255499 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -122,7 +122,7 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index e832ceb48..a3d2317bc 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -121,7 +121,7 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(INIT_VIEW1D_OFFSET, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INIT_VIEW1D_OFFSET, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index fdce13fb3..9fbea9d8f 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -131,7 +131,7 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MULADDSUB, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MULADDSUB, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index c999d76ea..d5028b403 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -142,7 +142,7 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(NESTED_INIT, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(NESTED_INIT, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index bc3fa55c6..6a108cd7f 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -191,7 +191,7 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(REDUCE3_INT, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE3_INT, Sycl) } // end namespace basic } // end 
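
All of the Base_SYCL variants in these files share one launch shape: pad the global range up to a multiple of the work-group size, submit an nd_range kernel, and bounds-check inside the body. The following is a self-contained sketch of that shape with a DAXPY-style body; the queue choice, problem size, and scalar are placeholders, not values from the suite.

#include <sycl/sycl.hpp>

int main()
{
  const size_t N = 1000;   // arbitrary problem size
  const size_t wg = 256;   // work-group size, analogous to the block_size tunings
  sycl::queue q;           // default device selection

  double* x = sycl::malloc_device<double>(N, q);
  double* y = sycl::malloc_device<double>(N, q);
  const double a = 2.0;
  q.fill(x, 1.0, N).wait();
  q.fill(y, 0.0, N).wait();

  // Round the global size up to a multiple of the work-group size; the
  // excess work-items are masked off by the bounds check in the kernel.
  const size_t global = wg * ((N + wg - 1) / wg);

  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(sycl::range<1>(global), sycl::range<1>(wg)),
                   [=](sycl::nd_item<1> item) {
      const size_t i = item.get_global_id(0);
      if (i < N) { y[i] += a * x[i]; }
    });
  }).wait();

  sycl::free(x, q);
  sycl::free(y, q);
  return 0;
}
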
namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 83fd78c40..b065233dc 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -84,6 +84,8 @@ class REDUCE3_INT : public KernelBase void runHipVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantOccGS(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index 8e401b419..08393b784 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -161,7 +161,7 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(TRAP_INT, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRAP_INT, Sycl) } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 213dbee9f..0f680b63a 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -81,6 +81,8 @@ class TRAP_INT : public KernelBase void runHipVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantOccGS(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index adb757012..b44a6c8f2 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -10,7 +10,9 @@ #include "CudaDataUtils.hpp" #include "HipDataUtils.hpp" #include "OpenMPTargetDataUtils.hpp" +#include "SyclDataUtils.hpp" +#include "KernelBase.hpp" #include "RAJA/internal/MemUtils_CPU.hpp" @@ -105,6 +107,21 @@ bool isHipDataSpace(DataSpace dataSpace) } } +/*! + * \brief Get if the data space is a sycl DataSpace. 
+ */ +bool isSyclDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { +// case DataSpace::SyclPinned: + case DataSpace::SyclManaged: + case DataSpace::SyclDevice: + return true; + default: + return false; + } +} + static int data_init_count = 0; @@ -262,6 +279,23 @@ void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) ptr = detail::allocHipDeviceFineData(nbytes); } break; #endif +#if defined(RAJA_ENABLE_SYCL) +// case DataSpace::SyclPinned: +// { +// ptr = detail::allocSyclPinnedData(nbytes); +// } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclManagedData(nbytes, qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclDeviceData(nbytes, qu); + } break; +#endif + default: { diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 22dc49203..aebecc45f 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -249,6 +249,15 @@ class KernelBase return camp::resources::Hip::get_default(); } #endif +#if defined(RAJA_ENABLE_SYCL) + camp::resources::Sycl getSyclResource() + { +/* if (run_params.getGPUStream() == 0) { + return camp::resources::Cuda::CudaFromStream(0); + }*/ + return camp::resources::Sycl::get_default(); + } +#endif void synchronize() { @@ -266,6 +275,13 @@ class KernelBase hipErrchk( hipDeviceSynchronize() ); } #endif +#if defined(RAJA_ENABLE_SYCL) + if ( running_variant == Base_SYCL || + running_variant == RAJA_SYCL ) { + getSyclResource().wait(); + } +#endif + } Size_type getDataAlignment() const; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index f070a23e8..8c06f541c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -368,6 +368,9 @@ static const std::string DataSpaceNames [] = std::string("HipDevice"), std::string("HipDeviceFine"), + std::string("SyclManaged"), + std::string("SyclDevice"), + std::string("Unknown Memory") // Keep this at the end and DO NOT remove.... }; // END VariantNames @@ -642,6 +645,13 @@ bool isDataSpaceAvailable(DataSpace dataSpace) ret_val = true; break; #endif +#if defined(RAJA_ENABLE_SYCL) +// case DataSpace::CudaPinned: + case DataSpace::SyclManaged: + case DataSpace::SyclDevice: + ret_val = true; break; +#endif + default: ret_val = false; break; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 9ea6f3624..ffaa1fc08 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -286,6 +286,9 @@ enum struct DataSpace { HipDevice, HipDeviceFine, + SyclManaged, + SyclDevice, + NumSpaces // Keep this one last and NEVER comment out (!!) 
}; diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b988cbded..8ec21192b 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -143,6 +143,7 @@ void RunParams::print(std::ostream& str) const str << "\n cuda data space = " << getDataSpaceName(cudaDataSpace); str << "\n hip data space = " << getDataSpaceName(hipDataSpace); str << "\n kokkos data space = " << getDataSpaceName(kokkosDataSpace); + str << "\n sycl data space = " << getDataSpaceName(syclDataSpace); str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { @@ -529,7 +530,9 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("--hip-data-space") || opt == std::string("-hds") || opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + opt == std::string("-kds") || + opt == std::string("--sycl-data-space") || + opt == std::string("-syds")) { bool got_someting = false; bool got_something_available = false; @@ -563,6 +566,9 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { kokkosDataSpace = ds; + } else if ( opt_name == std::string("--sycl-data-space") || + opt_name == std::string("-syds") ) { + cudaDataSpace = ds; } else { got_someting = false; } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 63c071069..57672161b 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -140,6 +140,7 @@ class RunParams { DataSpace getCudaDataSpace() const { return cudaDataSpace; } DataSpace getHipDataSpace() const { return hipDataSpace; } DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } + DataSpace getSyclDataSpace() const { return syclDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -232,6 +233,7 @@ class RunParams { DataSpace cudaDataSpace = DataSpace::CudaDevice; DataSpace hipDataSpace = DataSpace::HipDevice; DataSpace kokkosDataSpace = DataSpace::Host; + DataSpace syclDataSpace = DataSpace::SyclDevice; // // Arrays to hold input strings for valid/invalid input. Helpful for diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp index a444a8733..c8e63371e 100644 --- a/src/common/SyclDataUtils.hpp +++ b/src/common/SyclDataUtils.hpp @@ -39,7 +39,7 @@ void initSyclDeviceData(T& dptr, const T hptr, int len, sycl::queue* qu) len * sizeof(typename std::remove_pointer::type)); e.wait(); - incDataInitCount(); + detail::incDataInitCount(); } /*! @@ -78,6 +78,66 @@ void deallocSyclDeviceData(T& dptr, sycl::queue *qu) dptr = 0; } +namespace detail +{ +/* + * Copy memory len bytes from src to dst. + */ +inline void copySyclData(void* dst_ptr, const void* src_ptr, Size_type len, sycl::queue *qu) +{ + auto e = qu->memcpy( dst_ptr, src_ptr, len); +} + +/*! + * \brief Allocate SYCL device data array (dptr). + */ +inline void* allocSyclDeviceData(Size_type len, sycl::queue *qu) +{ + void* dptr = nullptr; + dptr = sycl::malloc_device(len, *qu); + return dptr; +} + +/*! + * \brief Allocate SYCL managed data array (dptr). + */ +inline void* allocSyclManagedData(Size_type len, sycl::queue *qu) +{ + void* mptr = nullptr; + mptr = sycl::malloc_shared(len, *qu); + return mptr; +} + +/*! + * \brief Allocate SYCL pinned data array (pptr). + *//* +inline void* allocSyclPinnedData(Size_type len) +{ + void* pptr = nullptr; + cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) ); + return pptr; +} +*/ + +/*! + * \brief Free device data array. 
+ */ +inline void deallocSyclDeviceData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +/*! + * \brief Free managed data array. + */ +inline void deallocSyclManagedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + +} // closing brace for detail namespac } // closing brace for rajaperf namespace From ebb118bb307b31231b7151bddc387854dfb2ae94 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 15 Nov 2023 13:32:48 -0800 Subject: [PATCH 114/454] Enable a basic run on Azure with SYCL variants turned off --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ef0ca313f..d150b7724 100644 --- a/Dockerfile +++ b/Dockerfile @@ -118,5 +118,5 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 -DENABLE_TESTS=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure" && \ + ./bin/raja-perf.exe --checkrun --exclude-variants Base_SYCL RAJA_SYCL -sp" && \ cd .. && rm -rf build From 68ca18ea99c43bd7bb7fee99464c039ca10c210d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 17 Nov 2023 13:46:33 -0800 Subject: [PATCH 115/454] Fix incorrect variable name --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 8ec21192b..2bb2ec382 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -568,7 +568,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) kokkosDataSpace = ds; } else if ( opt_name == std::string("--sycl-data-space") || opt_name == std::string("-syds") ) { - cudaDataSpace = ds; + syclDataSpace = ds; } else { got_someting = false; } From 13cf7bb02a66261d4b339a5336a1d92ffafd3f98 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 17 Nov 2023 13:53:42 -0800 Subject: [PATCH 116/454] Add missing entry for Sycl --- src/common/Executor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 6df13da92..ba61c7b08 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -413,6 +413,9 @@ void Executor::reportRunSummary(ostream& str) const if (isVariantAvailable(VariantID::Base_HIP)) { str << "\nHip - " << getDataSpaceName(run_params.getHipDataSpace()); } + if (isVariantAvailable(VariantID::Base_SYCL)) { + str << "\nSycl - " << getDataSpaceName(run_params.getSyclDataSpace()); + } if (isVariantAvailable(VariantID::Kokkos_Lambda)) { str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosDataSpace()); } From 54f0533c096bb3f7afa06122529abbbc91cd11f8 Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Tue, 21 Nov 2023 16:16:52 +0000 Subject: [PATCH 117/454] Working SYCL memory space framework, DAXPY updated to use memory space --- src/basic/DAXPY-Sycl.cpp | 23 ++----------------- src/common/DataUtils.cpp | 44 ++++++++++++++++++++++++++++++++---- src/common/DataUtils.hpp | 3 +++ src/common/KernelBase.cpp | 4 ++++ src/common/RAJAPerfSuite.cpp | 3 ++- src/common/RAJAPerfSuite.hpp | 1 + src/common/SyclDataUtils.hpp | 18 +++++++++++---- 7 files changed, 65 insertions(+), 31 deletions(-) diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index 880c7e455..dc7802d3b 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -28,15 +28,6 @@ namespace rajaperf namespace basic { -//#define DAXPY_DATA_SETUP_SYCL \ - 
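
The helpers assembled in SyclDataUtils.hpp map the suite's SyclDevice, SyclManaged, and SyclPinned data spaces onto the three SYCL 2020 USM allocation kinds. A compact standalone sketch of the distinction follows; the sizes and values are arbitrary and this is not suite code.

#include <sycl/sycl.hpp>

int main()
{
  sycl::queue q;
  const size_t N = 1024;

  // SyclDevice: device-resident, not dereferenceable on the host.
  double* dev = sycl::malloc_device<double>(N, q);
  // SyclManaged: shared allocation that migrates between host and device.
  double* man = sycl::malloc_shared<double>(N, q);
  // SyclPinned: host allocation that the device can access directly.
  double* pin = sycl::malloc_host<double>(N, q);

  for (size_t i = 0; i < N; ++i) { man[i] = 1.0; pin[i] = 2.0; }  // legal on host
  q.memcpy(dev, man, N * sizeof(double)).wait();  // device memory needs explicit staging

  sycl::free(dev, q);
  sycl::free(man, q);
  sycl::free(pin, q);
  return 0;
}
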
allocAndInitSyclDeviceData(x, m_x, iend, qu); \ - allocAndInitSyclDeviceData(y, m_y, iend, qu); - -//#define DAXPY_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_y, y, iend, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); - template void DAXPY::runSyclVariantImpl(VariantID vid) { @@ -51,8 +42,6 @@ void DAXPY::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { if (work_group_size > 0) { -// DAXPY_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -74,11 +63,8 @@ void DAXPY::runSyclVariantImpl(VariantID vid) stopTimer(); -// DAXPY_DATA_TEARDOWN_SYCL; } else { -// DAXPY_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -96,22 +82,19 @@ void DAXPY::runSyclVariantImpl(VariantID vid) stopTimer(); -// DAXPY_DATA_TEARDOWN_SYCL; } } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { - std::cout << "\n INIT3 : RAJA_SYCL does not support auto work group size" << std::endl; + std::cout << "\n DAXPY : RAJA_SYCL does not support auto work group size" << std::endl; return; } -// DAXPY_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DAXPY_BODY; }); @@ -120,8 +103,6 @@ void DAXPY::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); -// DAXPY_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n DAXPY : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index b44a6c8f2..cdc41d81b 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -113,7 +113,7 @@ bool isHipDataSpace(DataSpace dataSpace) bool isSyclDataSpace(DataSpace dataSpace) { switch (dataSpace) { -// case DataSpace::SyclPinned: + case DataSpace::SyclPinned: case DataSpace::SyclManaged: case DataSpace::SyclDevice: return true; @@ -280,10 +280,11 @@ void* allocData(DataSpace dataSpace, Size_type nbytes, Size_type align) } break; #endif #if defined(RAJA_ENABLE_SYCL) -// case DataSpace::SyclPinned: -// { -// ptr = detail::allocSyclPinnedData(nbytes); -// } break; + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + ptr = detail::allocSyclPinnedData(nbytes, qu); + } break; case DataSpace::SyclManaged: { auto qu = camp::resources::Sycl::get_default().get_queue(); @@ -344,6 +345,14 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, } #endif +#if defined(RAJA_ENABLE_SYCL) + else if (isSyclDataSpace(dst_dataSpace) || + isSyclDataSpace(src_dataSpace)) { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::copySyclData(dst_ptr, src_ptr, nbytes,qu); + } +#endif + else { throw std::invalid_argument("copyData : Unknown data space"); } @@ -427,6 +436,26 @@ void deallocData(DataSpace dataSpace, void* ptr) } break; #endif +#if defined(RAJA_ENABLE_SYCL) + case DataSpace::SyclPinned: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclPinnedData(ptr,qu); + } break; + case DataSpace::SyclManaged: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclManagedData(ptr,qu); + } break; + case DataSpace::SyclDevice: + { + auto qu = camp::resources::Sycl::get_default().get_queue(); + detail::deallocSyclDeviceData(ptr,qu); + } break; +#endif + + + default: { throw std::invalid_argument("deallocData : Unknown data space"); @@ -645,6 +674,7 @@ DataSpace 
hostAccessibleDataSpace(DataSpace dataSpace)
     case DataSpace::HipPinned:
     case DataSpace::HipPinnedFine:
     case DataSpace::HipPinnedCoarse:
+    case DataSpace::SyclPinned:
       return dataSpace;
 
     case DataSpace::OmpTarget:
@@ -667,6 +697,10 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace)
     case DataSpace::HipDeviceFine:
       return DataSpace::HipPinned;
 
+    case DataSpace::SyclManaged:
+    case DataSpace::SyclDevice:
+      return DataSpace::SyclPinned;
+
     default:
     {
       throw std::invalid_argument("hostAccessibleDataSpace : Unknown data space");
diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp
index 67a612c83..bba0674a7 100644
--- a/src/common/DataUtils.hpp
+++ b/src/common/DataUtils.hpp
@@ -27,6 +27,9 @@
 #if defined(RAJA_ENABLE_HIP)
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #endif
+#if defined(RAJA_ENABLE_SYCL)
+#include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
+#endif
 
 namespace rajaperf
 {
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 424796b26..8f3654c13 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -239,6 +239,10 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
     case Kokkos_Lambda :
       return run_params.getKokkosDataSpace();
 
+    case Base_SYCL :
+    case RAJA_SYCL :
+      return run_params.getSyclDataSpace();
+
     default:
       throw std::invalid_argument("getDataSpace : Unknown variant id");
   }
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index 8c06f541c..48d796013 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -368,6 +368,7 @@ static const std::string DataSpaceNames [] =
   std::string("HipDevice"),
   std::string("HipDeviceFine"),
 
+  std::string("SyclPinned"),
   std::string("SyclManaged"),
   std::string("SyclDevice"),
 
@@ -646,7 +647,7 @@ bool isDataSpaceAvailable(DataSpace dataSpace)
 #endif
 
 #if defined(RAJA_ENABLE_SYCL)
-//    case DataSpace::CudaPinned:
+    case DataSpace::SyclPinned:
     case DataSpace::SyclManaged:
     case DataSpace::SyclDevice:
       ret_val = true; break;
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index ffaa1fc08..aabaa58df 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -286,6 +286,7 @@ enum struct DataSpace {
   HipDevice,
   HipDeviceFine,
 
+  SyclPinned,
   SyclManaged,
   SyclDevice,
 
diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp
index c8e63371e..8301f5006 100644
--- a/src/common/SyclDataUtils.hpp
+++ b/src/common/SyclDataUtils.hpp
@@ -86,6 +86,7 @@ namespace detail
 inline void copySyclData(void* dst_ptr, const void* src_ptr, Size_type len, sycl::queue *qu)
 {
   auto e = qu->memcpy( dst_ptr, src_ptr, len);
+  e.wait();
 }
 
 /*!
@@ -110,14 +111,14 @@ inline void* allocSyclManagedData(Size_type len, sycl::queue *qu)
 
 /*!
  * \brief Allocate SYCL pinned data array (pptr).
- *//*
-inline void* allocSyclPinnedData(Size_type len)
+ */
+inline void* allocSyclPinnedData(Size_type len, sycl::queue *qu)
 {
   void* pptr = nullptr;
-  cudaErrchk( cudaHostAlloc( &pptr, len, cudaHostAllocMapped ) );
+  pptr = sycl::malloc_host(len, *qu);
   return pptr;
 }
-*/
+
 
 /*!
 * \brief Free device data array.
@@ -137,6 +138,15 @@ inline void deallocSyclManagedData(void* dptr, sycl::queue *qu)
   dptr = 0;
 }
 
+/*!
+ * \brief Free pinned data array.
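+ *        (frees memory obtained from sycl::malloc_host)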
+ */ +inline void deallocSyclPinnedData(void* dptr, sycl::queue *qu) +{ + sycl::free(dptr, *qu); + dptr = 0; +} + } // closing brace for detail namespac } // closing brace for rajaperf namespace From 848407ec237a3c195ef5fc9b831ff282abc470b5 Mon Sep 17 00:00:00 2001 From: Brian Homerding Date: Tue, 21 Nov 2023 16:33:42 +0000 Subject: [PATCH 118/454] Add help information for sycl-data-space --- src/common/RunParams.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 2bb2ec382..1249346f7 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1003,6 +1003,13 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + str << "\t --sycl-data-space, -syds [Default is SyclDevice]\n" + << "\t (names of data space to use for SYCL variants)\n" + << "\t Valid data space names are 'SyclDevice', 'SyclPinned', or 'SyclManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --sycl-data-space SyclManaged (run SYCL variants with Sycl Managed memory)\n" + << "\t\t -syds SyclPinned (run SYCL variants with Sycl Pinned memory)\n\n"; + #if defined(RAJA_PERFSUITE_USE_CALIPER) str << "\t --add-to-spot-config, -atsc [Default is none]\n" << "\t\t appends additional parameters to the built-in Caliper spot config\n"; From 69edb2f7e44ceaa2b668caa9c49200453d0c9154 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Nov 2023 13:53:31 -0800 Subject: [PATCH 119/454] Remove non-atomic reduction "implementations" --- src/algorithm/REDUCE_SUM-Cuda.cpp | 6 ------ src/algorithm/REDUCE_SUM-Hip.cpp | 6 ------ src/basic/PI_REDUCE-Cuda.cpp | 6 ------ src/basic/PI_REDUCE-Hip.cpp | 8 +------- src/basic/REDUCE3_INT-Cuda.cpp | 8 -------- src/basic/REDUCE3_INT-Hip.cpp | 8 -------- src/basic/REDUCE_STRUCT-Cuda.cpp | 1 - src/basic/REDUCE_STRUCT-Hip.cpp | 1 - src/basic/TRAP_INT-Cuda.cpp | 7 ------- src/basic/TRAP_INT-Hip.cpp | 7 ------- src/stream/DOT-Cuda.cpp | 7 ------- src/stream/DOT-Hip.cpp | 10 +--------- 12 files changed, 2 insertions(+), 73 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 822a33eaf..2c3787f4e 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -48,15 +48,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( dsum, psum[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; - } -#endif } diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 073e32a59..eb5204fbd 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -53,15 +53,9 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( dsum, psum[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dsum += psum[ 0 ]; - } -#endif } diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index e3cd15273..8acf4aaa1 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -47,15 +47,9 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x 
== 0 ) { RAJA::atomicAdd( dpi, ppi[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dpi += ppi[ 0 ]; - } -#endif } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 5a28adb85..7b3811e70 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -47,15 +47,9 @@ __global__ void pi_reduce(Real_type dx, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { - RAJA::atomicAdd(RAJA::hip_atomic{}, dpi, ppi[ 0 ] ); + RAJA::atomicAdd( dpi, ppi[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) i{ - *dpi += ppi[ 0 ]; - } -#endif } diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index d5572ddac..d97328819 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -58,19 +58,11 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 528dd4b55..801c193e6 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -58,19 +58,11 @@ __global__ void reduce3int(Int_ptr vec, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( vsum, psum[ 0 ] ); RAJA::atomicMin( vmin, pmin[ 0 ] ); RAJA::atomicMax( vmax, pmax[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif } diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 0025d0825..5b49b8176 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -85,7 +85,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 2b7213c8b..e7dbfb55e 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -85,7 +85,6 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, __syncthreads(); } -// serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( xsum, pxsum[ 0 ] ); RAJA::atomicMin( xmin, pxmin[ 0 ] ); diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 2b1d62851..16d9089ba 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -66,16 +66,9 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index a092ecba8..3f7d057f0 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -66,16 +66,9 @@ __global__ void trapint(Real_type x0, Real_type xp, __syncthreads(); } -#if 1 // serialized access to 
shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( sumx, psumx[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - } diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index ddbf36128..c48974ef7 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -46,16 +46,9 @@ __global__ void dot(Real_ptr a, Real_ptr b, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { RAJA::atomicAdd( dprod, pdot[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dprod += pdot[ 0 ]; - } -#endif - } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 340807cad..1cea2bd51 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -46,17 +46,9 @@ __global__ void dot(Real_ptr a, Real_ptr b, __syncthreads(); } -#if 1 // serialized access to shared data; if ( threadIdx.x == 0 ) { - //atomicAdd(dprod, pdot[ 0 ] ); - RAJA::atomicAdd(RAJA::hip_atomic{}, dprod, pdot[ 0 ] ); + RAJA::atomicAdd( dprod, pdot[ 0 ] ); } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dprod += pdot[ 0 ]; - } -#endif - } From ee64ce916c209a6a81e2ef8e94adbd18a2ea73a6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Nov 2023 15:04:15 -0800 Subject: [PATCH 120/454] Add hostBasedDataSpace function This separates the use cases of getting a hostBasedDataSpace which is usable on the host with high performance versus getting a hostAccessibleDataSpace which may not be performant. --- src/common/DataUtils.cpp | 63 ++++++++++++++++++++++++++++++++++----- src/common/DataUtils.hpp | 30 ++++++++++++++----- src/common/KernelBase.cpp | 5 ---- src/common/KernelBase.hpp | 8 ++--- 4 files changed, 82 insertions(+), 24 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index adb757012..f374b358a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -279,8 +279,8 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, Size_type nbytes) { - if (hostAccessibleDataSpace(dst_dataSpace) == dst_dataSpace && - hostAccessibleDataSpace(src_dataSpace) == src_dataSpace) { + if (hostBasedDataSpace(dst_dataSpace) == dst_dataSpace && + hostBasedDataSpace(src_dataSpace) == src_dataSpace) { detail::copyHostData(dst_ptr, src_ptr, nbytes); } @@ -598,19 +598,30 @@ long double calcChecksum(Complex_ptr ptr, Size_type len, /*! - * \brief Get an host accessible data space for this dataSpace. + * \brief Get an host based data space for the given dataSpace. + * + * A host based data space is one that is always stored on the host. + * + * The intention is to check if the performance (bandwidth) of the given data + * space is good on the host. If not then fall back on a space that performs + * well on the host and in explicit copy operations with the given space. 
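+ * (for example, CudaDevice data is staged through CudaPinned for host work)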
*/ -DataSpace hostAccessibleDataSpace(DataSpace dataSpace) +DataSpace hostBasedDataSpace(DataSpace dataSpace) { switch (dataSpace) { case DataSpace::Host: case DataSpace::Omp: case DataSpace::CudaPinned: + case DataSpace::CudaManagedHostPreferred: + case DataSpace::CudaManagedHostPreferredDeviceAccessed: case DataSpace::HipHostAdviseFine: case DataSpace::HipHostAdviseCoarse: case DataSpace::HipPinned: case DataSpace::HipPinnedFine: case DataSpace::HipPinnedCoarse: + case DataSpace::HipManaged: + case DataSpace::HipManagedAdviseFine: + case DataSpace::HipManagedAdviseCoarse: return dataSpace; case DataSpace::OmpTarget: @@ -622,16 +633,54 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace) case DataSpace::CudaDevice: return DataSpace::CudaPinned; + case DataSpace::HipDevice: + case DataSpace::HipDeviceFine: + return DataSpace::HipPinned; + + default: + { + throw std::invalid_argument("hostBasedDataSpace : Unknown data space"); + } break; + } +} + +/*! + * \brief Get an host accessible data space for the given dataSpace. + * + * A host accessible data space is one that can be accessed on the host. + * + * The intention is to check if the given memory space is accessible on the + * host. If not then fall back on a space that is host accessible and can be + * used with explicit copy operations with the given space. + */ +DataSpace hostAccessibleDataSpace(DataSpace dataSpace) +{ + switch (dataSpace) { + case DataSpace::Host: + case DataSpace::Omp: + case DataSpace::CudaPinned: + case DataSpace::CudaManaged: case DataSpace::CudaManagedHostPreferred: case DataSpace::CudaManagedHostPreferredDeviceAccessed: + case DataSpace::CudaManagedDevicePreferred: + case DataSpace::CudaManagedDevicePreferredHostAccessed: + case DataSpace::HipHostAdviseFine: + case DataSpace::HipHostAdviseCoarse: + case DataSpace::HipPinned: + case DataSpace::HipPinnedFine: + case DataSpace::HipPinnedCoarse: case DataSpace::HipManaged: case DataSpace::HipManagedAdviseFine: case DataSpace::HipManagedAdviseCoarse: - return dataSpace; - case DataSpace::HipDevice: case DataSpace::HipDeviceFine: - return DataSpace::HipPinned; + return dataSpace; + + case DataSpace::OmpTarget: + return DataSpace::Host; + + case DataSpace::CudaDevice: + return DataSpace::CudaPinned; default: { diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 67a612c83..710cddd3f 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -160,10 +160,24 @@ long double calcChecksum(Complex_ptr d, Size_type len, /*! - * \brief Get an host accessible data space for this dataSpace. + * \brief Get an host based data space for the given dataSpace. * - * Intended to be a space that is quick to copy to from the given space if - * the given space is not accessible on the Host. + * A host based data space is one that is always stored on the host. + * + * The intention is to check if the performance (bandwidth) of the given data + * space is good on the host. If not then fall back on a space that performs + * well on the host and in explicit copy operations with the given space. + */ +DataSpace hostBasedDataSpace(DataSpace dataSpace); + +/*! + * \brief Get an host accessible data space for the given dataSpace. + * + * A host accessible data space is one that can be accessed on the host. + * + * The intention is to check if the given memory space is accessible on the + * host. If not then fall back on a space that is host accessible and can be + * used with explicit copy operations with the given space. 
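+ * (OmpTarget data, for instance, falls back to a plain Host allocation)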
*/ DataSpace hostAccessibleDataSpace(DataSpace dataSpace); @@ -294,7 +308,7 @@ struct AutoDataMover template inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -313,7 +327,7 @@ template inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align, T val) { - DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -330,7 +344,7 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, S template inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -348,7 +362,7 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len template inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostAccessibleDataSpace(dataSpace); + DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -367,7 +381,7 @@ inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size T* check_ptr = ptr; T* copied_ptr = nullptr; - DataSpace check_dataSpace = hostAccessibleDataSpace(dataSpace); + DataSpace check_dataSpace = hostBasedDataSpace(dataSpace); if (check_dataSpace != dataSpace) { allocData(check_dataSpace, copied_ptr, len, align); diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index b08fa179e..2b29dc9d0 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -235,11 +235,6 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const } } -DataSpace KernelBase::getHostAccessibleDataSpace(VariantID vid) const -{ - return hostAccessibleDataSpace(getDataSpace(vid)); -} - void KernelBase::execute(VariantID vid, size_t tune_idx) { running_variant = vid; diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 08be3d71e..c48db73a8 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -261,7 +261,6 @@ class KernelBase Size_type getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; - DataSpace getHostAccessibleDataSpace(VariantID vid) const; template void allocData(DataSpace dataSpace, T& ptr, Size_type len) @@ -322,9 +321,10 @@ class KernelBase template rajaperf::AutoDataMover scopedMoveData(T*& ptr, Size_type len, VariantID vid) { - rajaperf::moveData(getHostAccessibleDataSpace(vid), getDataSpace(vid), - ptr, len, getDataAlignment()); - return {getDataSpace(vid), getHostAccessibleDataSpace(vid), ptr, len, getDataAlignment()}; + DataSpace ds = getDataSpace(vid); + DataSpace hds = rajaperf::hostBasedDataSpace(ds); + rajaperf::moveData(hds, ds, ptr, len, getDataAlignment()); + return {ds, hds, ptr, len, getDataAlignment()}; } template From 9f98e0a11abd407faa38f846712d11ed3f5cfad9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Nov 2023 15:54:38 -0800 Subject: [PATCH 121/454] Add reduction data space param --- src/common/Executor.cpp | 21 +++++++++++++++++++++ src/common/KernelBase.cpp | 36 ++++++++++++++++++++++++++++++++++++ src/common/KernelBase.hpp | 1 + 
src/common/RunParams.cpp | 28 +++++++++++++++++++++++++++- src/common/RunParams.hpp | 14 ++++++++++++++ 5 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index d730fb21d..c48459e7c 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -414,6 +414,27 @@ void Executor::reportRunSummary(ostream& str) const } str << endl; + str << "\nReduction Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqReductionDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipReductionDataSpace()); + } + if (isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosReductionDataSpace()); + } + str << endl; + + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 2b29dc9d0..0ea1b91c9 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -235,6 +235,42 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const } } +DataSpace KernelBase::getReductionDataSpace(VariantID vid) const +{ + switch ( vid ) { + + case Base_Seq : + case Lambda_Seq : + case RAJA_Seq : + return run_params.getSeqReductionDataSpace(); + + case Base_OpenMP : + case Lambda_OpenMP : + case RAJA_OpenMP : + return run_params.getOmpReductionDataSpace(); + + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + return run_params.getOmpTargetReductionDataSpace(); + + case Base_CUDA : + case Lambda_CUDA : + case RAJA_CUDA : + return run_params.getCudaReductionDataSpace(); + + case Base_HIP : + case Lambda_HIP : + case RAJA_HIP : + return run_params.getHipReductionDataSpace(); + + case Kokkos_Lambda : + return run_params.getKokkosReductionDataSpace(); + + default: + throw std::invalid_argument("getReductionDataSpace : Unknown variant id"); + } +} + void KernelBase::execute(VariantID vid, size_t tune_idx) { running_variant = vid; diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index c48db73a8..640710770 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -261,6 +261,7 @@ class KernelBase Size_type getDataAlignment() const; DataSpace getDataSpace(VariantID vid) const; + DataSpace getReductionDataSpace(VariantID vid) const; template void allocData(DataSpace dataSpace, T& ptr, Size_type len) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b988cbded..131b86b5a 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -144,6 +144,13 @@ void RunParams::print(std::ostream& str) const str << "\n hip data space = " << getDataSpaceName(hipDataSpace); str << "\n kokkos data space = " << getDataSpaceName(kokkosDataSpace); + str << "\n seq reduction data space = " << getDataSpaceName(seqReductionDataSpace); + str << "\n omp reduction data space = " << getDataSpaceName(ompReductionDataSpace); + str << "\n omp target reduction data space = " << getDataSpaceName(ompTargetReductionDataSpace); + str << 
"\n cuda reduction data space = " << getDataSpaceName(cudaReductionDataSpace); + str << "\n hip reduction data space = " << getDataSpaceName(hipReductionDataSpace); + str << "\n kokkos reduction data space = " << getDataSpaceName(kokkosReductionDataSpace); + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; @@ -529,7 +536,13 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("--hip-data-space") || opt == std::string("-hds") || opt == std::string("--kokkos-data-space") || - opt == std::string("-kds") ) { + opt == std::string("-kds") || + opt == std::string("--seq-reduction-data-space") || + opt == std::string("--omp-reduction-data-space") || + opt == std::string("--omptarget-reduction-data-space") || + opt == std::string("--cuda-reduction-data-space") || + opt == std::string("--hip-reduction-data-space") || + opt == std::string("--kokkos-reduction-data-space") ) { bool got_someting = false; bool got_something_available = false; @@ -563,6 +576,18 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } else if ( opt_name == std::string("--kokkos-data-space") || opt_name == std::string("-kds") ) { kokkosDataSpace = ds; + } else if ( opt_name == std::string("--seq-reduction-data-space") ) { + seqReductionDataSpace = ds; + } else if ( opt_name == std::string("--omp-reduction-data-space") ) { + ompReductionDataSpace = ds; + } else if ( opt_name == std::string("--omptarget-reduction-data-space") ) { + ompTargetReductionDataSpace = ds; + } else if ( opt_name == std::string("--cuda-reduction-data-space") ) { + cudaReductionDataSpace = ds; + } else if ( opt_name == std::string("--hip-reduction-data-space") ) { + hipReductionDataSpace = ds; + } else if ( opt_name == std::string("--kokkos-reduction-data-space") ) { + kokkosReductionDataSpace = ds; } else { got_someting = false; } @@ -583,6 +608,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } } } + } else if ( std::string(argv[i]) == std::string("--tunings") || std::string(argv[i]) == std::string("-t") ) { diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 63c071069..d8359b166 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -141,6 +141,13 @@ class RunParams { DataSpace getHipDataSpace() const { return hipDataSpace; } DataSpace getKokkosDataSpace() const { return kokkosDataSpace; } + DataSpace getSeqReductionDataSpace() const { return seqReductionDataSpace; } + DataSpace getOmpReductionDataSpace() const { return ompReductionDataSpace; } + DataSpace getOmpTargetReductionDataSpace() const { return ompTargetReductionDataSpace; } + DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; } + DataSpace getHipReductionDataSpace() const { return hipReductionDataSpace; } + DataSpace getKokkosReductionDataSpace() const { return kokkosReductionDataSpace; } + double getPFTolerance() const { return pf_tol; } int getCheckRunReps() const { return checkrun_reps; } @@ -233,6 +240,13 @@ class RunParams { DataSpace hipDataSpace = DataSpace::HipDevice; DataSpace kokkosDataSpace = DataSpace::Host; + DataSpace seqReductionDataSpace = DataSpace::Host; + DataSpace ompReductionDataSpace = DataSpace::Omp; + DataSpace ompTargetReductionDataSpace = DataSpace::OmpTarget; + DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed; + DataSpace hipReductionDataSpace = DataSpace::HipDevice; + DataSpace kokkosReductionDataSpace = DataSpace::Host; + // // Arrays to 
hold input strings for valid/invalid input. Helpful for // debugging command line args. From 2f277385dd1293362b60679d95ccc5113486d5d3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 21 Nov 2023 15:59:00 -0800 Subject: [PATCH 122/454] Use reduction data space in REDUCE_SUM --- src/algorithm/REDUCE_SUM-Cuda.cpp | 90 +++++++++++++++++++++++------ src/algorithm/REDUCE_SUM-Hip.cpp | 94 ++++++++++++++++++++++++------- 2 files changed, 146 insertions(+), 38 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 2c3787f4e..fbd3c41d5 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -70,8 +70,16 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::CudaPinned, sum_storage, 1); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + + Real_ptr dsum; + allocData(rds, dsum, 1); + Real_ptr hsum = dsum; + if (separate_buffers) { + allocData(hrds, hsum, 1); + } // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -79,7 +87,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, len, ::cub::Sum(), m_sum_init, @@ -98,21 +106,29 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, len, ::cub::Sum(), m_sum_init, stream)); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaMemcpyDeviceToHost, stream ) ); + } + cudaErrchk(cudaStreamSynchronize(stream)); - m_sum = *sum_storage; + m_sum = *hsum; } stopTimer(); // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); - deallocData(DataSpace::CudaPinned, sum_storage); + deallocData(rds, dsum); + if (separate_buffers) { + deallocData(hrds, hsum); + } } else { @@ -135,14 +151,27 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dsum; - allocData(DataSpace::CudaDevice, dsum, 1); + allocData(rds, dsum, 1); + Real_ptr hsum = dsum; + if (separate_buffers) { + allocData(hrds, hsum, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsum = m_sum_init; + cudaErrchk( cudaMemcpyAsync( dsum, hsum, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dsum = m_sum_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -152,14 +181,20 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_sum = *hsum; } stopTimer(); - deallocData(DataSpace::CudaDevice, dsum); + deallocData(rds, dsum); + if (separate_buffers) { + 
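+      // free the host-side staging buffer that was allocated alongside dsum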
deallocData(hrds, hsum); + } } else if ( vid == RAJA_CUDA ) { @@ -199,8 +234,16 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dsum; - allocData(DataSpace::CudaDevice, dsum, 1); + allocData(rds, dsum, 1); + Real_ptr hsum = dsum; + if (separate_buffers) { + allocData(hrds, hsum, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -209,8 +252,13 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsum = m_sum_init; + cudaErrchk( cudaMemcpyAsync( dsum, hsum, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dsum = m_sum_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -220,14 +268,20 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_sum, dsum, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); + m_sum = *hsum; } stopTimer(); - deallocData(DataSpace::CudaDevice, dsum); + deallocData(rds, dsum); + if (separate_buffers) { + deallocData(hrds, hsum); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index eb5204fbd..521cb79fa 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -75,8 +75,16 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; - Real_type* sum_storage; - allocData(DataSpace::HipPinnedCoarse, sum_storage, 1); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + + Real_ptr dsum; + allocData(rds, dsum, 1); + Real_ptr hsum = dsum; + if (separate_buffers) { + allocData(hrds, hsum, 1); + } // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -85,7 +93,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, m_sum_init, len, rocprim::plus(), @@ -94,7 +102,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, len, ::cub::Sum(), m_sum_init, @@ -115,7 +123,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, m_sum_init, len, rocprim::plus(), @@ -124,22 +132,30 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - sum_storage, + dsum, len, ::cub::Sum(), m_sum_init, stream)); #endif + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type), + hipMemcpyDeviceToHost, stream ) ); + } + hipErrchk(hipStreamSynchronize(stream)); - m_sum 
= *sum_storage;
+      m_sum = *hsum;
 
     }
     stopTimer();
 
     // Free temporary storage
     deallocData(DataSpace::HipDevice, temp_storage);
-    deallocData(DataSpace::HipPinnedCoarse, sum_storage);
+    deallocData(rds, dsum);
+    if (separate_buffers) {
+      deallocData(hrds, hsum);
+    }
 
   } else {
@@ -162,14 +178,27 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)
 
   if ( vid == Base_HIP ) {
 
+    DataSpace rds = getReductionDataSpace(vid);
+    DataSpace hrds = hostAccessibleDataSpace(rds);
+    const bool separate_buffers = hrds != rds;
+
     Real_ptr dsum;
-    allocData(DataSpace::HipDevice, dsum, 1);
+    allocData(rds, dsum, 1);
+    Real_ptr hsum = dsum;
+    if (separate_buffers) {
+      allocData(hrds, hsum, 1);
+    }
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      hipErrchk( hipMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type),
-                                 hipMemcpyHostToDevice, res.get_stream() ) );
+      if (separate_buffers) {
+        *hsum = m_sum_init;
+        hipErrchk( hipMemcpyAsync( dsum, hsum, sizeof(Real_type),
+                                   hipMemcpyHostToDevice, res.get_stream() ) );
+      } else {
+        *dsum = m_sum_init;
+      }
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -178,14 +207,20 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)
         x, dsum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );
 
-      hipErrchk( hipMemcpyAsync( &m_sum, dsum, sizeof(Real_type),
-                                 hipMemcpyDeviceToHost, res.get_stream() ) );
+      if (separate_buffers) {
+        hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type),
+                                   hipMemcpyDeviceToHost, res.get_stream() ) );
+      }
       hipErrchk( hipStreamSynchronize( res.get_stream() ) );
+      m_sum = *hsum;
 
     }
     stopTimer();
 
-    deallocData(DataSpace::HipDevice, dsum);
+    deallocData(rds, dsum);
+    if (separate_buffers) {
+      deallocData(hrds, hsum);
+    }
 
   } else if ( vid == RAJA_HIP ) {
 
@@ -225,8 +260,16 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
 
   if ( vid == Base_HIP ) {
 
+    DataSpace rds = getReductionDataSpace(vid);
+    DataSpace hrds = hostAccessibleDataSpace(rds);
+    const bool separate_buffers = hrds != rds;
+
     Real_ptr dsum;
-    allocData(DataSpace::HipDevice, dsum, 1);
+    allocData(rds, dsum, 1);
+    Real_ptr hsum = dsum;
+    if (separate_buffers) {
+      allocData(hrds, hsum, 1);
+    }
 
     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -235,8 +278,13 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      hipErrchk( hipMemcpyAsync( dsum, &m_sum_init, sizeof(Real_type),
-                                 hipMemcpyHostToDevice, res.get_stream() ) );
+      if (separate_buffers) {
+        *hsum = m_sum_init;
+        hipErrchk( hipMemcpyAsync( dsum, hsum, sizeof(Real_type),
+                                   hipMemcpyHostToDevice, res.get_stream() ) );
+      } else {
+        *dsum = m_sum_init;
+      }
 
       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
@@ -245,14 +293,20 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
         x, dsum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );
 
-      hipErrchk( hipMemcpyAsync( &m_sum, dsum, sizeof(Real_type),
-                                 hipMemcpyDeviceToHost, res.get_stream() ) );
+      if (separate_buffers) {
+        hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type),
+                                   hipMemcpyDeviceToHost, res.get_stream() ) );
+      }
       hipErrchk( hipStreamSynchronize( res.get_stream() ) );
+      m_sum = *hsum;
 
     }
     stopTimer();
 
-    deallocData(DataSpace::HipDevice, dsum);
+    deallocData(rds, dsum);
+    if (separate_buffers) {
+      deallocData(hrds, hsum);
+    }
 
   } else if ( vid == RAJA_HIP ) {
 
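This patch and the four that follow (122 through 126) apply one staging pattern to every reduction kernel: allocate the reduction scalar in the configured reduction data space, mirror it in a host-accessible space only when the two spaces differ, and move values through the mirror around the kernel launch. The sketch below distills that pattern for a single HIP reduction; reduce_kernel, stream, init, and result are hypothetical stand-ins for the per-kernel names, and error checking is trimmed to the essentials.

    // Pick the buffer placement once per variant.
    DataSpace rds  = getReductionDataSpace(vid);      // e.g. HipDevice
    DataSpace hrds = hostAccessibleDataSpace(rds);    // e.g. HipPinned
    const bool separate_buffers = (hrds != rds);

    Real_ptr dval;
    allocData(rds, dval, 1);     // where the kernel accumulates
    Real_ptr hval = dval;        // host view; aliases dval when rds is host accessible
    if (separate_buffers) {
      allocData(hrds, hval, 1);  // extra host mirror only when required
    }

    // Initialize through the mirror, or directly when no mirror exists.
    if (separate_buffers) {
      *hval = init;
      hipErrchk( hipMemcpyAsync( dval, hval, sizeof(Real_type),
                                 hipMemcpyHostToDevice, stream ) );
    } else {
      *dval = init;
    }

    // ... launch the reduction kernel so it accumulates into *dval ...

    // Read back through the mirror; the aliased case reads dval in place.
    if (separate_buffers) {
      hipErrchk( hipMemcpyAsync( hval, dval, sizeof(Real_type),
                                 hipMemcpyDeviceToHost, stream ) );
    }
    hipErrchk( hipStreamSynchronize( stream ) );
    result = *hval;

    deallocData(rds, dval);
    if (separate_buffers) {
      deallocData(hrds, hval);
    }

Keeping hval aliased to dval in the common case means the fast path adds no extra copies, while the separate_buffers path localizes every transfer to this one spot.

From 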
966a495349b2196a63a7e04782d39049ad5a2b52 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 13:06:36 -0800 Subject: [PATCH 123/454] Use reduction data space in PI_REDUCE --- src/basic/PI_REDUCE-Cuda.cpp | 64 ++++++++++++++++++++++++++++-------- src/basic/PI_REDUCE-Hip.cpp | 64 ++++++++++++++++++++++++++++-------- 2 files changed, 100 insertions(+), 28 deletions(-) diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 8acf4aaa1..fdda43dec 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -67,14 +67,27 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dpi; - allocData(DataSpace::CudaDevice, dpi, 1); + allocData(rds, dpi, 1); + Real_ptr hpi = dpi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + cudaErrchk( cudaMemcpyAsync( dpi, hpi, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dpi = m_pi_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -84,15 +97,20 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hpi, dpi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + m_pi = *hpi * 4.0; } stopTimer(); - deallocData(DataSpace::CudaDevice, dpi); + deallocData(rds, dpi); + if (separate_buffers) { + deallocData(hrds, hpi); + } } else if ( vid == RAJA_CUDA ) { @@ -129,8 +147,16 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dpi; - allocData(DataSpace::CudaDevice, dpi, 1); + allocData(rds, dpi, 1); + Real_ptr hpi = dpi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -139,8 +165,13 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + cudaErrchk( cudaMemcpyAsync( dpi, hpi, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dpi = m_pi_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -150,15 +181,20 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hpi, dpi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) 
); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + m_pi = *hpi * 4.0; } stopTimer(); - deallocData(DataSpace::CudaDevice, dpi); + deallocData(rds, dpi); + if (separate_buffers) { + deallocData(hrds, hpi); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 7b3811e70..5c2078df3 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -67,14 +67,27 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dpi; - allocData(DataSpace::HipDevice, dpi, 1); + allocData(rds, dpi, 1); + Real_ptr hpi = dpi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + hipErrchk( hipMemcpyAsync( dpi, hpi, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dpi = m_pi_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -83,15 +96,20 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) dx, dpi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hpi, dpi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + m_pi = *hpi * 4.0; } stopTimer(); - deallocData(DataSpace::HipDevice, dpi); + deallocData(rds, dpi); + if (separate_buffers) { + deallocData(hrds, hpi); + } } else if ( vid == RAJA_HIP ) { @@ -128,8 +146,16 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dpi; - allocData(DataSpace::HipDevice, dpi, 1); + allocData(rds, dpi, 1); + Real_ptr hpi = dpi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -138,8 +164,13 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dpi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + hipErrchk( hipMemcpyAsync( dpi, hpi, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dpi = m_pi_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -148,15 +179,20 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) dx, dpi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi, dpi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hpi, dpi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi *= 4.0; + m_pi = *hpi * 4.0; } stopTimer(); - 
deallocData(DataSpace::HipDevice, dpi); + deallocData(rds, dpi); + if (separate_buffers) { + deallocData(hrds, hpi); + } } else if ( vid == RAJA_HIP ) { From 65aab9e15ef33abf5d1e803abf01b9d9abfd90c9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 13:06:44 -0800 Subject: [PATCH 124/454] Use reduction data space in PI_ATOMIC --- src/basic/PI_ATOMIC-Cuda.cpp | 64 +++++++++++++++++++++++++++--------- src/basic/PI_ATOMIC-Hip.cpp | 64 +++++++++++++++++++++++++++--------- src/basic/PI_ATOMIC.cpp | 2 +- 3 files changed, 99 insertions(+), 31 deletions(-) diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 644d358dc..1ecdae2cd 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -47,23 +47,39 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) PI_ATOMIC_DATA_SETUP; + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + + Real_ptr hpi = pi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } + if ( vid == Base_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; pi_atomic<<>>( pi, dx, iend ); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -73,8 +89,13 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; @@ -85,10 +106,12 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) }); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -98,8 +121,13 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { 
@@ -107,10 +135,12 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - cudaErrchk( cudaMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -118,6 +148,10 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } + + if (separate_buffers) { + deallocData(hrds, hpi); + } } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Cuda) diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1db304a52..fa3d705b7 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -47,23 +47,39 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) PI_ATOMIC_DATA_SETUP; + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + + Real_ptr hpi = pi; + if (separate_buffers) { + allocData(hrds, hpi, 1); + } + if ( vid == Base_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; hipLaunchKernelGGL((atomic_pi),grid_size, block_size, shmem, res.get_stream(), pi, dx, iend ); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -73,8 +89,13 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } auto atomic_pi_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; @@ -87,10 +108,12 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -100,8 +123,13 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( pi, &m_pi_init, sizeof(Real_type), - hipMemcpyHostToDevice, 
res.get_stream() ) ); + if (separate_buffers) { + *hpi = m_pi_init; + hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *pi = m_pi_init; + } RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -109,10 +137,12 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - hipErrchk( hipMemcpyAsync( &m_pi_final, pi, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_pi_final *= 4.0; + m_pi_final = *hpi * 4.0; } stopTimer(); @@ -120,6 +150,10 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) } else { getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } + + if (separate_buffers) { + deallocData(hrds, hpi); + } } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Hip) diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index af33d01fc..6e08f5813 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -64,7 +64,7 @@ PI_ATOMIC::~PI_ATOMIC() void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); - allocAndInitDataConst(m_pi, 1, 0.0, vid); + allocData(getReductionDataSpace(vid), m_pi, 1); m_pi_init = 0.0; m_pi_final = -static_cast(vid); } From 1466488665e9e0af1089e5585c100aff927fdaa5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 13:49:30 -0800 Subject: [PATCH 125/454] Use reduction data space in REDUCE3_INT --- src/basic/REDUCE3_INT-Cuda.cpp | 92 ++++++++++++++++++++++------------ src/basic/REDUCE3_INT-Hip.cpp | 92 ++++++++++++++++++++++------------ 2 files changed, 120 insertions(+), 64 deletions(-) diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index d97328819..44accd32d 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -80,20 +80,31 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - Int_ptr vmem_init; - allocData(DataSpace::CudaPinned, vmem_init, 3); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; Int_ptr vmem; - allocData(DataSpace::CudaDevice, vmem, 3); + allocData(rds, vmem, 3); + Int_ptr hvmem = vmem; + if (separate_buffers) { + allocData(hrds, hvmem, 3); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - cudaErrchk( cudaMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + hvmem[0] = m_vsum_init; + hvmem[1] = m_vmin_init; + hvmem[2] = m_vmax_init; + cudaErrchk( cudaMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + vmem[0] = m_vsum_init; + vmem[1] = m_vmin_init; + vmem[2] = m_vmax_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; @@ -105,19 +116,22 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - Int_type lmem[3]; - cudaErrchk( cudaMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + 
if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::CudaDevice, vmem); - deallocData(DataSpace::CudaPinned, vmem_init); + deallocData(rds, vmem); + if (separate_buffers) { + deallocData(hrds, hvmem); + } } else if ( vid == RAJA_CUDA ) { @@ -158,11 +172,16 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { - Int_ptr vmem_init; - allocData(DataSpace::CudaPinned, vmem_init, 3); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; Int_ptr vmem; - allocData(DataSpace::CudaDevice, vmem, 3); + allocData(rds, vmem, 3); + Int_ptr hvmem = vmem; + if (separate_buffers) { + allocData(hrds, hvmem, 3); + } constexpr size_t shmem = 3*sizeof(Int_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -171,11 +190,17 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - cudaErrchk( cudaMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + hvmem[0] = m_vsum_init; + hvmem[1] = m_vmin_init; + hvmem[2] = m_vmax_init; + cudaErrchk( cudaMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + vmem[0] = m_vsum_init; + vmem[1] = m_vmin_init; + vmem[2] = m_vmax_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -187,19 +212,22 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) iend ); cudaErrchk( cudaGetLastError() ); - Int_type lmem[3]; - cudaErrchk( cudaMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::CudaDevice, vmem); - deallocData(DataSpace::CudaPinned, vmem_init); + deallocData(rds, vmem); + if (separate_buffers) { + deallocData(hrds, hvmem); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 801c193e6..361f947a4 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -80,20 +80,31 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - Int_ptr vmem_init; - allocData(DataSpace::HipPinnedCoarse, vmem_init, 3); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; Int_ptr vmem; - allocData(DataSpace::HipDevice, vmem, 3); + allocData(rds, vmem, 3); + Int_ptr hvmem = vmem; + if (separate_buffers) { + allocData(hrds, hvmem, 3); + } 
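+    // when separate_buffers is false, hvmem simply aliases vmem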
startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + hvmem[0] = m_vsum_init; + hvmem[1] = m_vmin_init; + hvmem[2] = m_vmax_init; + hipErrchk( hipMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + vmem[0] = m_vsum_init; + vmem[1] = m_vmin_init; + vmem[2] = m_vmax_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; @@ -105,19 +116,22 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - Int_type lmem[3]; - hipErrchk( hipMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); - deallocData(DataSpace::HipDevice, vmem); - deallocData(DataSpace::HipPinnedCoarse, vmem_init); + deallocData(rds, vmem); + if (separate_buffers) { + deallocData(hrds, hvmem); + } } else if ( vid == RAJA_HIP ) { @@ -158,11 +172,16 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { - Int_ptr vmem_init; - allocData(DataSpace::HipPinnedCoarse, vmem_init, 3); + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; Int_ptr vmem; - allocData(DataSpace::HipDevice, vmem, 3); + allocData(rds, vmem, 3); + Int_ptr hvmem = vmem; + if (separate_buffers) { + allocData(hrds, hvmem, 3); + } constexpr size_t shmem = 3*sizeof(Int_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -171,11 +190,17 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - vmem_init[0] = m_vsum_init; - vmem_init[1] = m_vmin_init; - vmem_init[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, vmem_init, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + hvmem[0] = m_vsum_init; + hvmem[1] = m_vmin_init; + hvmem[2] = m_vmax_init; + hipErrchk( hipMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + vmem[0] = m_vsum_init; + vmem[1] = m_vmin_init; + vmem[2] = m_vmax_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -188,19 +213,22 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - Int_type lmem[3]; - hipErrchk( hipMemcpyAsync( &lmem[0], vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += lmem[0]; - m_vmin = RAJA_MIN(m_vmin, lmem[1]); - m_vmax = RAJA_MAX(m_vmax, lmem[2]); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, 
       hvmem[1]);
+      m_vmax = RAJA_MAX(m_vmax, hvmem[2]);
 
     }
     stopTimer();
 
-    deallocData(DataSpace::HipDevice, vmem);
-    deallocData(DataSpace::HipPinnedCoarse, vmem_init);
+    deallocData(rds, vmem);
+    if (separate_buffers) {
+      deallocData(hrds, hvmem);
+    }
 
   } else if ( vid == RAJA_HIP ) {

From 654895b6a69df93761b773e2c1f22346931e4e65 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 22 Nov 2023 14:04:37 -0800
Subject: [PATCH 126/454] Use reduction data space in REDUCE_STRUCT

---
 src/basic/REDUCE_STRUCT-Cuda.cpp | 104 +++++++++++++++++++++++--------
 src/basic/REDUCE_STRUCT-Hip.cpp  | 102 +++++++++++++++++++++++-------
 2 files changed, 157 insertions(+), 49 deletions(-)

diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp
index 5b49b8176..1959104c2 100644
--- a/src/basic/REDUCE_STRUCT-Cuda.cpp
+++ b/src/basic/REDUCE_STRUCT-Cuda.cpp
@@ -109,16 +109,40 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid)
 
   if ( vid == Base_CUDA ) {
 
+    DataSpace rds = getReductionDataSpace(vid);
+    DataSpace hrds = hostAccessibleDataSpace(rds);
+    const bool separate_buffers = hrds != rds;
+
     Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(DataSpace::CudaDevice, mem,6);
+    allocData(rds, mem, 6);
+    Real_ptr hmem = mem;
+    if (separate_buffers) {
+      allocData(hrds, hmem, 6);
+    }
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream()));
+      if (separate_buffers) {
+        hmem[0] = m_init_sum; // xcenter
+        hmem[1] = m_init_min; // xmin
+        hmem[2] = m_init_max; // xmax
+        hmem[3] = m_init_sum; // ycenter
+        hmem[4] = m_init_min; // ymin
+        hmem[5] = m_init_max; // ymax
+        cudaErrchk( cudaMemcpyAsync( mem, hmem, 6*sizeof(Real_type),
+                                     cudaMemcpyHostToDevice, res.get_stream() ) );
+      } else {
+        mem[0] = m_init_sum; // xcenter
+        mem[1] = m_init_min; // xmin
+        mem[2] = m_init_max; // xmax
+        mem[3] = m_init_sum; // ycenter
+        mem[4] = m_init_min; // ymin
+        mem[5] = m_init_max; // ymax
+      }
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 6*sizeof(Real_type)*block_size;
-
       reduce_struct<block_size><<<grid_size, block_size,
                                   shmem, res.get_stream()>>>( points.x, points.y,
@@ -128,22 +152,25 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid)
         points.N);
       cudaErrchk( cudaGetLastError() );
 
-      Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-      cudaErrchk( cudaMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type),
-                                   cudaMemcpyDeviceToHost, res.get_stream() ) );
+      if (separate_buffers) {
+        cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type),
+                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
+      }
       cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-
-      points.SetCenter(lmem[0]/points.N, lmem[3]/points.N);
-      points.SetXMin(lmem[1]);
-      points.SetXMax(lmem[2]);
-      points.SetYMin(lmem[4]);
-      points.SetYMax(lmem[5]);
+      points.SetCenter(hmem[0]/points.N, hmem[3]/points.N);
+      points.SetXMin(hmem[1]);
+      points.SetXMax(hmem[2]);
+      points.SetYMin(hmem[4]);
+      points.SetYMax(hmem[5]);
       m_points=points;
 
     }
     stopTimer();
 
-    deallocData(DataSpace::CudaDevice, mem);
+    deallocData(rds, mem);
+    if (separate_buffers) {
+      deallocData(hrds, hmem);
+    }
 
   } else if ( vid == RAJA_CUDA ) {
 
@@ -192,8 +219,16 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid)
 
   if ( vid == Base_CUDA ) {
 
+    DataSpace rds = getReductionDataSpace(vid);
+    DataSpace hrds = hostAccessibleDataSpace(rds);
+    const bool separate_buffers = hrds != rds;
+
     Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(DataSpace::CudaDevice, mem,6);
+    allocData(rds, mem, 6);
+    Real_ptr
hmem = mem; + if (separate_buffers) { + allocData(hrds, hmem, 6); + } constexpr size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -202,7 +237,23 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk(cudaMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + if (separate_buffers) { + hmem[0] = m_init_sum; // xcenter + hmem[1] = m_init_min; // xmin + hmem[2] = m_init_max; // xmax + hmem[3] = m_init_sum; // ycenter + hmem[4] = m_init_min; // ymin + hmem[5] = m_init_max; // ymax + cudaErrchk( cudaMemcpyAsync( mem, hmem, 6*sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + mem[0] = m_init_sum; // xcenter + mem[1] = m_init_min; // xmin + mem[2] = m_init_max; // xmax + mem[3] = m_init_sum; // ycenter + mem[4] = m_init_min; // ymin + mem[5] = m_init_max; // ymax + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -215,22 +266,25 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) points.N); cudaErrchk( cudaGetLastError() ); - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - cudaErrchk( cudaMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } stopTimer(); - deallocData(DataSpace::CudaDevice, mem); + deallocData(rds, mem); + if (separate_buffers) { + deallocData(hrds, hmem); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index e7dbfb55e..f29dcada9 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -110,13 +110,37 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::HipDevice, mem,6); + allocData(rds, mem, 6); + Real_ptr hmem = mem; + if (separate_buffers) { + allocData(hrds, hmem, 6); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + if (separate_buffers) { + hmem[0] = m_init_sum; // xcenter + hmem[1] = m_init_min; // xmin + hmem[2] = m_init_max; // xmax + hmem[3] = m_init_sum; // ycenter + hmem[4] = m_init_min; // ymin + hmem[5] = m_init_max; // ymax + hipErrchk( hipMemcpyAsync( mem, hmem, 6*sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + mem[0] = m_init_sum; // xcenter + mem[1] = m_init_min; // xmin + mem[2] = m_init_max; // xmax + mem[3] = m_init_sum; // ycenter + mem[4] = m_init_min; // ymin + mem[5] = m_init_max; // ymax + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; 
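+      // Shared memory is sized for six Real_type slots per thread, one per
+      // partial result (xsum, xmin, xmax, ysum, ymin, ymax) that the
+      // reduce_struct kernel combines within each block before updating mem.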
@@ -131,22 +155,25 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) points.N); hipErrchk( hipGetLastError() ); - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - hipErrchk( hipMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } stopTimer(); - deallocData(DataSpace::HipDevice, mem); + deallocData(rds, mem); + if (separate_buffers) { + deallocData(hrds, hmem); + } } else if ( vid == RAJA_HIP ) { @@ -194,8 +221,16 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax - allocData(DataSpace::HipDevice, mem,6); + allocData(rds, mem, 6); + Real_ptr hmem = mem; + if (separate_buffers) { + allocData(hrds, hmem, 6); + } constexpr size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -204,7 +239,23 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk(hipMemsetAsync(mem, 0.0, 6*sizeof(Real_type), res.get_stream())); + if (separate_buffers) { + hmem[0] = m_init_sum; // xcenter + hmem[1] = m_init_min; // xmin + hmem[2] = m_init_max; // xmax + hmem[3] = m_init_sum; // ycenter + hmem[4] = m_init_min; // ymin + hmem[5] = m_init_max; // ymax + hipErrchk( hipMemcpyAsync( mem, hmem, 6*sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + mem[0] = m_init_sum; // xcenter + mem[1] = m_init_min; // xmin + mem[2] = m_init_max; // xmax + mem[3] = m_init_sum; // ycenter + mem[4] = m_init_min; // ymin + mem[5] = m_init_max; // ymax + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -218,22 +269,25 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) points.N); hipErrchk( hipGetLastError() ); - Real_type lmem[6]={0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - hipErrchk( hipMemcpyAsync( &lmem[0], mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - - points.SetCenter(lmem[0]/points.N, lmem[3]/points.N); - points.SetXMin(lmem[1]); - points.SetXMax(lmem[2]); - points.SetYMin(lmem[4]); - points.SetYMax(lmem[5]); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } stopTimer(); - deallocData(DataSpace::HipDevice, mem); + deallocData(rds, mem); + if (separate_buffers) { + deallocData(hrds, hmem); + } } else if ( vid == RAJA_HIP ) { From 9cf84a29ce9fe9ff5bd2a3104067709e7734ebab Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: 
Wed, 22 Nov 2023 14:17:35 -0800 Subject: [PATCH 127/454] Use reduction data space in TRAP_INT --- src/basic/TRAP_INT-Cuda.cpp | 66 ++++++++++++++++++++++++++++--------- src/basic/TRAP_INT-Hip.cpp | 66 ++++++++++++++++++++++++++++--------- 2 files changed, 100 insertions(+), 32 deletions(-) diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 16d9089ba..b03a5955f 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -86,14 +86,27 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr sumx; - allocData(DataSpace::CudaDevice, sumx, 1); + allocData(rds, sumx, 1); + Real_ptr hsumx = sumx; + if (separate_buffers) { + allocData(hrds, hsumx, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsumx = m_sumx_init; + cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *sumx = m_sumx_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -105,16 +118,20 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) iend); cudaErrchk( cudaGetLastError() ); - Real_type lsumx; - cudaErrchk( cudaMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + m_sumx += *hsumx * h; } stopTimer(); - deallocData(DataSpace::CudaDevice, sumx); + deallocData(rds, sumx); + if (separate_buffers) { + deallocData(hrds, hsumx); + } } else if ( vid == RAJA_CUDA ) { @@ -151,8 +168,16 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr sumx; - allocData(DataSpace::CudaDevice, sumx, 1); + allocData(rds, sumx, 1); + Real_ptr hsumx = sumx; + if (separate_buffers) { + allocData(hrds, hsumx, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -161,8 +186,13 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsumx = m_sumx_init; + cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *sumx = m_sumx_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -174,16 +204,20 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) iend); cudaErrchk( cudaGetLastError() ); - Real_type lsumx; - cudaErrchk( cudaMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() 
) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + m_sumx += *hsumx * h; } stopTimer(); - deallocData(DataSpace::CudaDevice, sumx); + deallocData(rds, sumx); + if (separate_buffers) { + deallocData(hrds, hsumx); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 3f7d057f0..a04374a41 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -86,14 +86,27 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr sumx; - allocData(DataSpace::HipDevice, sumx, 1); + allocData(rds, sumx, 1); + Real_ptr hsumx = sumx; + if (separate_buffers) { + allocData(hrds, hsumx, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsumx = m_sumx_init; + hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *sumx = m_sumx_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -104,16 +117,20 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) iend); hipErrchk( hipGetLastError() ); - Real_type lsumx; - hipErrchk( hipMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_sumx += lsumx * h; + m_sumx += *hsumx * h; } stopTimer(); - deallocData(DataSpace::HipDevice, sumx); + deallocData(rds, sumx); + if (separate_buffers) { + deallocData(hrds, hsumx); + } } else if ( vid == RAJA_HIP ) { @@ -150,8 +167,16 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr sumx; - allocData(DataSpace::HipDevice, sumx, 1); + allocData(rds, sumx, 1); + Real_ptr hsumx = sumx; + if (separate_buffers) { + allocData(hrds, hsumx, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -160,8 +185,13 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( sumx, &m_sumx_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hsumx = m_sumx_init; + hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *sumx = m_sumx_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -173,16 +203,20 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) iend); hipErrchk( hipGetLastError() ); - Real_type lsumx; - hipErrchk( hipMemcpyAsync( &lsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( 
res.get_stream() ) ); - m_sumx += lsumx * h; + m_sumx += *hsumx * h; } stopTimer(); - deallocData(DataSpace::HipDevice, sumx); + deallocData(rds, sumx); + if (separate_buffers) { + deallocData(hrds, hsumx); + } } else if ( vid == RAJA_HIP ) { From da1e69ac18bce7c3ad3f33b300e8cbfe82b301f8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 14:25:56 -0800 Subject: [PATCH 128/454] Use reduction data space in DOT --- src/stream/DOT-Cuda.cpp | 66 +++++++++++++++++++++++++++++++---------- src/stream/DOT-Hip.cpp | 66 +++++++++++++++++++++++++++++++---------- 2 files changed, 100 insertions(+), 32 deletions(-) diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index c48974ef7..805ddba90 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -65,14 +65,27 @@ void DOT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dprod; - allocData(DataSpace::CudaDevice, dprod, 1); + allocData(rds, dprod, 1); + Real_ptr hprod = dprod; + if (separate_buffers) { + allocData(hrds, hprod, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hprod = m_dot_init; + cudaErrchk( cudaMemcpyAsync( dprod, hprod, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dprod = m_dot_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -80,16 +93,20 @@ void DOT::runCudaVariantBlock(VariantID vid) a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); - Real_type lprod; - cudaErrchk( cudaMemcpyAsync( &lprod, dprod, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hprod, dprod, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_dot += lprod; + m_dot += *hprod; } stopTimer(); - deallocData(DataSpace::CudaDevice, dprod); + deallocData(rds, dprod); + if (separate_buffers) { + deallocData(hrds, hprod); + } } else if ( vid == RAJA_CUDA ) { @@ -126,8 +143,16 @@ void DOT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dprod; - allocData(DataSpace::CudaDevice, dprod, 1); + allocData(rds, dprod, 1); + Real_ptr hprod = dprod; + if (separate_buffers) { + allocData(hrds, hprod, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -136,8 +161,13 @@ void DOT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hprod = m_dot_init; + cudaErrchk( cudaMemcpyAsync( dprod, hprod, sizeof(Real_type), + cudaMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dprod = m_dot_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -145,16 +175,20 @@ void 
DOT::runCudaVariantOccGS(VariantID vid) a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); - Real_type lprod; - cudaErrchk( cudaMemcpyAsync( &lprod, dprod, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + cudaErrchk( cudaMemcpyAsync( hprod, dprod, sizeof(Real_type), + cudaMemcpyDeviceToHost, res.get_stream() ) ); + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_dot += lprod; + m_dot += *hprod; } stopTimer(); - deallocData(DataSpace::CudaDevice, dprod); + deallocData(rds, dprod); + if (separate_buffers) { + deallocData(hrds, hprod); + } } else if ( vid == RAJA_CUDA ) { diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 1cea2bd51..f5fa41083 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -65,14 +65,27 @@ void DOT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dprod; - allocData(DataSpace::HipDevice, dprod, 1); + allocData(rds, dprod, 1); + Real_ptr hprod = dprod; + if (separate_buffers) { + allocData(hrds, hprod, 1); + } startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hprod = m_dot_init; + hipErrchk( hipMemcpyAsync( dprod, hprod, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dprod = m_dot_init; + } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -81,16 +94,20 @@ void DOT::runHipVariantBlock(VariantID vid) a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); - Real_type lprod; - hipErrchk( hipMemcpyAsync( &lprod, dprod, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hprod, dprod, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_dot += lprod; + m_dot += *hprod; } stopTimer(); - deallocData(DataSpace::HipDevice, dprod); + deallocData(rds, dprod); + if (separate_buffers) { + deallocData(hrds, hprod); + } } else if ( vid == RAJA_HIP ) { @@ -127,8 +144,16 @@ void DOT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { + DataSpace rds = getReductionDataSpace(vid); + DataSpace hrds = hostAccessibleDataSpace(rds); + const bool separate_buffers = hrds != rds; + Real_ptr dprod; - allocData(DataSpace::HipDevice, dprod, 1); + allocData(rds, dprod, 1); + Real_ptr hprod = dprod; + if (separate_buffers) { + allocData(hrds, hprod, 1); + } constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -137,8 +162,13 @@ void DOT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync( dprod, &m_dot_init, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); + if (separate_buffers) { + *hprod = m_dot_init; + hipErrchk( hipMemcpyAsync( dprod, hprod, sizeof(Real_type), + hipMemcpyHostToDevice, res.get_stream() ) ); + } else { + *dprod = m_dot_init; + } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -147,16 +177,20 @@ void DOT::runHipVariantOccGS(VariantID 
vid) a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); - Real_type lprod; - hipErrchk( hipMemcpyAsync( &lprod, dprod, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); + if (separate_buffers) { + hipErrchk( hipMemcpyAsync( hprod, dprod, sizeof(Real_type), + hipMemcpyDeviceToHost, res.get_stream() ) ); + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_dot += lprod; + m_dot += *hprod; } stopTimer(); - deallocData(DataSpace::HipDevice, dprod); + deallocData(rds, dprod); + if (separate_buffers) { + deallocData(hrds, hprod); + } } else if ( vid == RAJA_HIP ) { From b31f650918bab47b6775a6663aadda9216d2539d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:22:21 -0800 Subject: [PATCH 129/454] Add blkatm RAJA GPU tunings in REDUCE_SUM --- src/algorithm/REDUCE_SUM-Cuda.cpp | 114 +++++++++++++++++++++++++++-- src/algorithm/REDUCE_SUM-Hip.cpp | 115 ++++++++++++++++++++++++++++-- src/algorithm/REDUCE_SUM.hpp | 14 +++- 3 files changed, 226 insertions(+), 17 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 822a33eaf..52782f86d 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -129,7 +129,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +void REDUCE_SUM::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -172,7 +172,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -193,7 +193,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) +void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -237,6 +237,78 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = 
getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -286,7 +358,7 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -295,12 +367,33 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } } }); @@ -328,10 +421,17 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } }); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 073e32a59..9a39156cd 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -156,7 +156,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runHipVariantBlock(VariantID vid) +void REDUCE_SUM::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -198,7 +198,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -219,7 +219,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runHipVariantOccGS(VariantID vid) +void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -262,6 +262,78 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] 
__device__ (Index_type i) { + REDUCE_SUM_BODY; + }); + + m_sum = sum.get(); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +template < size_t block_size > +void REDUCE_SUM::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -311,7 +383,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -320,12 +392,33 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } } }); @@ -357,9 +450,17 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 9174b2170..63a77898d 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -64,13 +64,21 @@ class REDUCE_SUM : public KernelBase void runCudaVariantCub(VariantID vid); void runHipVariantRocprim(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 0a04dd2b959b3dae81da2d160cdc09d3dddfe368 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:29:08 -0800 Subject: [PATCH 130/454] Add blkatm RAJA GPU tunings in PI_REDUCE --- src/basic/PI_REDUCE-Cuda.cpp | 111 ++++++++++++++++++++++++++++++++--- src/basic/PI_REDUCE-Hip.cpp | 110 +++++++++++++++++++++++++++++++--- src/basic/PI_REDUCE.hpp | 14 ++++- 3 files changed, 218 insertions(+), 17 deletions(-) diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index e3cd15273..0e594adbc 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp 
+++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -61,7 +61,7 @@ __global__ void pi_reduce(Real_type dx, template < size_t block_size > -void PI_REDUCE::runCudaVariantBlock(VariantID vid) +void PI_REDUCE::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -105,7 +105,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -123,7 +123,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void PI_REDUCE::runCudaVariantOccGS(VariantID vid) +void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -168,6 +168,73 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + + +template < size_t block_size > +void PI_REDUCE::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void PI_REDUCE::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -202,7 +269,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -211,12 +278,34 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -238,9 +327,17 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + 
addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 5a28adb85..33c385ce2 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -61,7 +61,7 @@ __global__ void pi_reduce(Real_type dx, template < size_t block_size > -void PI_REDUCE::runHipVariantBlock(VariantID vid) +void PI_REDUCE::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -104,7 +104,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -122,7 +122,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void PI_REDUCE::runHipVariantOccGS(VariantID vid) +void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -166,6 +166,73 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + + +template < size_t block_size > +void PI_REDUCE::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum pi(m_pi_init); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PI_REDUCE_BODY; + }); + + m_pi = 4.0 * static_cast(pi.get()); + + } + stopTimer(); + + } else { + getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void PI_REDUCE::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + PI_REDUCE_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -200,7 +267,7 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -209,12 +276,34 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -236,10 +325,17 @@ 
void PI_REDUCE::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } }); diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 4efdf6d21..303d0b6fd 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -60,13 +60,21 @@ class PI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From a83f0607883c4db2514407e7279fa94888060c1c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:38:02 -0800 Subject: [PATCH 131/454] Add blkatm RAJA GPU tunings in REDUCE3_INT --- src/basic/REDUCE3_INT-Cuda.cpp | 122 ++++++++++++++++++++++++++++++--- src/basic/REDUCE3_INT-Hip.cpp | 122 ++++++++++++++++++++++++++++++--- src/basic/REDUCE3_INT.hpp | 14 +++- 3 files changed, 237 insertions(+), 21 deletions(-) diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index d5572ddac..56afdd824 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -76,7 +76,7 @@ __global__ void reduce3int(Int_ptr vec, template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlock(VariantID vid) +void REDUCE3_INT::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -132,9 +132,9 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -154,7 +154,7 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) +void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -211,6 +211,80 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void REDUCE3_INT::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void REDUCE3_INT::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -249,7 +323,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -258,12 +332,34 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -285,9 +381,17 @@ void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 528dd4b55..637a5420a 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -76,7 +76,7 @@ __global__ void reduce3int(Int_ptr vec, template < size_t block_size > -void REDUCE3_INT::runHipVariantBlock(VariantID vid) +void REDUCE3_INT::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const 
Index_type ibegin = 0; @@ -132,9 +132,9 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -154,7 +154,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runHipVariantOccGS(VariantID vid) +void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -212,6 +212,80 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void REDUCE3_INT::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void REDUCE3_INT::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -250,7 +324,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -259,12 +333,34 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -286,9 +382,17 @@ void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || 
run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 01b7f226e..c3fccd588 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -75,13 +75,21 @@ class REDUCE3_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 45984ad0b5394be9634704e3da422498eb76aa1b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:57:25 -0800 Subject: [PATCH 132/454] Add blkatm RAJA GPU tunings in REDUCE_STRUCT --- src/basic/REDUCE_STRUCT-Cuda.cpp | 146 ++++++++++++++++++++++++++++--- src/basic/REDUCE_STRUCT-Hip.cpp | 145 +++++++++++++++++++++++++++--- src/basic/REDUCE_STRUCT.hpp | 14 ++- 3 files changed, 278 insertions(+), 27 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 0025d0825..bf26471e6 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -97,8 +97,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } + + template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) +void REDUCE_STRUCT::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -151,12 +153,12 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -181,7 +183,7 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) +void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ 
-235,6 +237,96 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +template < size_t block_size > +void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + +template < size_t block_size > +void REDUCE_STRUCT::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -281,7 +373,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -290,12 +382,34 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -317,9 +431,17 @@ void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, 
"block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 2b7213c8b..596ae6136 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -98,8 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } + + template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) +void REDUCE_STRUCT::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -154,12 +156,12 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -183,7 +185,7 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) +void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -238,6 +240,95 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; + } + +} + +template < size_t block_size > +void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points=points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid 
<< std::endl; + } + +} +template < size_t block_size > +void REDUCE_STRUCT::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -284,7 +375,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -293,12 +384,34 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -320,9 +433,17 @@ void REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 063acd5b3..2bfa10412 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -90,13 +90,21 @@ class REDUCE_STRUCT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); struct PointsType { Int_type N; From a08bf64216cb149753ac184f7bd9001bf7a3a5d7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:58:07 -0800 Subject: [PATCH 133/454] Add blkatm RAJA GPU tunings in TRAP_INT --- src/basic/TRAP_INT-Cuda.cpp | 110 +++++++++++++++++++++++++++++++++--- src/basic/TRAP_INT-Hip.cpp | 110 +++++++++++++++++++++++++++++++++--- src/basic/TRAP_INT.hpp | 14 ++++- 3 files changed, 217 insertions(+), 17 deletions(-) diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 2b1d62851..18c9f8cee 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -81,7 +81,7 @@ __global__ void trapint(Real_type x0, Real_type xp, template < size_t block_size > -void TRAP_INT::runCudaVariantBlock(VariantID vid) 
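// ---------------------------------------------------------------------------
// Note: the "blkatm" and "block" tunings introduced in this and the
// surrounding patches differ only in the reduce policy handed to the RAJA
// reduction objects. A minimal sketch of the distinction, assuming RAJA built
// with CUDA support and its cuda_reduce / cuda_reduce_atomic policies; the
// function and variable names below are illustrative only, not patch code:

#include "RAJA/RAJA.hpp"

template < size_t block_size >
double sum_blkatm(const double* x, RAJA::Index_type n)
{
  // "blkatm_*" tuning: per-block partial sums are combined with device atomics
  RAJA::ReduceSum< RAJA::cuda_reduce_atomic, double > sum(0.0);

  RAJA::forall< RAJA::cuda_exec<block_size> >(
      RAJA::RangeSegment(0, n),
      [=] __device__ (RAJA::Index_type i) { sum += x[i]; });

  return sum.get();
}

// The "block_*" tuning is the same kernel with RAJA::cuda_reduce, which
// combines per-block results without atomics; it applies only to the
// RAJA_CUDA/RAJA_HIP variants, matching the tune_idx logic in these patches.
// ---------------------------------------------------------------------------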
+void TRAP_INT::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -128,7 +128,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -146,7 +146,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void TRAP_INT::runCudaVariantOccGS(VariantID vid) +void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -194,6 +194,72 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRAP_INT::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRAP_INT::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -228,7 +294,7 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -237,12 +303,34 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -264,9 +352,17 @@ void TRAP_INT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + 
addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index a092ecba8..47a08f8a8 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -81,7 +81,7 @@ __global__ void trapint(Real_type x0, Real_type xp, template < size_t block_size > -void TRAP_INT::runHipVariantBlock(VariantID vid) +void TRAP_INT::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -127,7 +127,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -145,7 +145,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void TRAP_INT::runHipVariantOccGS(VariantID vid) +void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -193,6 +193,72 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRAP_INT::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void TRAP_INT::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -227,7 +293,7 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -236,12 +302,34 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -263,9 +351,17 @@ void TRAP_INT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - 
addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 4f13c9eca..1a77131f5 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -72,13 +72,21 @@ class TRAP_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 3410085ac293be63bb0b18b47a40260e7d3d75a9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 22 Nov 2023 15:58:42 -0800 Subject: [PATCH 134/454] Add blkatm RAJA GPU tunings in DOT --- src/stream/DOT-Cuda.cpp | 111 +++++++++++++++++++++++++++++++++++++--- src/stream/DOT-Hip.cpp | 111 +++++++++++++++++++++++++++++++++++++--- src/stream/DOT.hpp | 14 +++-- 3 files changed, 219 insertions(+), 17 deletions(-) diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index ddbf36128..d0f5f688d 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -59,8 +59,9 @@ __global__ void dot(Real_ptr a, Real_ptr b, } + template < size_t block_size > -void DOT::runCudaVariantBlock(VariantID vid) +void DOT::runCudaVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -103,7 +104,7 @@ void DOT::runCudaVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum dot(m_dot_init); + RAJA::ReduceSum dot(m_dot_init); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -121,7 +122,7 @@ void DOT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void DOT::runCudaVariantOccGS(VariantID vid) +void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -165,6 +166,72 @@ void DOT::runCudaVariantOccGS(VariantID vid) } else if ( vid == RAJA_CUDA ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void 
DOT::runCudaVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void DOT::runCudaVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -199,7 +266,7 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockAtomic(vid); } @@ -208,12 +275,34 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantOccGS(vid); + runCudaVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -235,9 +324,17 @@ void DOT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 340807cad..6125e611f 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -60,8 +60,9 @@ __global__ void dot(Real_ptr a, Real_ptr b, } + template < size_t block_size > -void DOT::runHipVariantBlock(VariantID vid) +void DOT::runHipVariantBlockAtomic(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -105,7 +106,7 @@ void DOT::runHipVariantBlock(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum dot(m_dot_init); + RAJA::ReduceSum dot(m_dot_init); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -123,7 +124,7 @@ void DOT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void DOT::runHipVariantOccGS(VariantID vid) +void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -168,6 +169,72 @@ void DOT::runHipVariantOccGS(VariantID vid) } else if ( vid == RAJA_HIP ) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::RangeSegment(ibegin, 
iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void DOT::runHipVariantBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void DOT::runHipVariantBlockOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -202,7 +269,7 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockAtomic(vid); } @@ -211,12 +278,34 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantOccGS(vid); + runHipVariantBlockAtomicOccGS(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlock(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockOccGS(vid); + + } + + t += 1; + + } + } }); @@ -238,9 +327,17 @@ void DOT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 856caef14..3247b1f5a 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -56,13 +56,21 @@ class DOT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runHipVariantBlockAtomic(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockAtomicOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantBlock(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runCudaVariantBlockOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockOccGS(VariantID vid); private: static 
const size_t default_gpu_block_size = 256; From b369c497228ff00d53234e38a920ce5cc84fc9bd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 13:47:45 -0800 Subject: [PATCH 135/454] Rename HALOEXCHANGE_base to HALO_base --- src/CMakeLists.txt | 2 +- src/comm/CMakeLists.txt | 2 +- src/comm/HALOEXCHANGE.cpp | 2 +- src/comm/HALOEXCHANGE.hpp | 6 ++--- src/comm/HALOEXCHANGE_FUSED.cpp | 2 +- src/comm/HALOEXCHANGE_FUSED.hpp | 6 ++--- .../{HALOEXCHANGE_base.cpp => HALO_base.cpp} | 26 +++++++++---------- .../{HALOEXCHANGE_base.hpp => HALO_base.hpp} | 12 ++++----- src/comm/MPI_HALOEXCHANGE.cpp | 2 +- src/comm/MPI_HALOEXCHANGE.hpp | 6 ++--- src/comm/MPI_HALOEXCHANGE_FUSED.cpp | 2 +- src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 6 ++--- 12 files changed, 37 insertions(+), 37 deletions(-) rename src/comm/{HALOEXCHANGE_base.cpp => HALO_base.cpp} (92%) rename src/comm/{HALOEXCHANGE_base.hpp => HALO_base.hpp} (94%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bbfbef48e..7d6a94844 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -244,7 +244,7 @@ blt_add_executable( algorithm/MEMCPY.cpp algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp - comm/HALOEXCHANGE_base.cpp + comm/HALO_base.cpp comm/HALOEXCHANGE.cpp comm/HALOEXCHANGE-Seq.cpp comm/HALOEXCHANGE-OMPTarget.cpp diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index 3b99decdf..8e6abfa46 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -8,7 +8,7 @@ blt_add_library( NAME comm - SOURCES HALOEXCHANGE_base.cpp + SOURCES HALO_base.cpp HALOEXCHANGE.cpp HALOEXCHANGE-Seq.cpp HALOEXCHANGE-Hip.cpp diff --git a/src/comm/HALOEXCHANGE.cpp b/src/comm/HALOEXCHANGE.cpp index 5429620e7..355a22fbf 100644 --- a/src/comm/HALOEXCHANGE.cpp +++ b/src/comm/HALOEXCHANGE.cpp @@ -18,7 +18,7 @@ namespace comm { HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE, params) + : HALO_base(rajaperf::Comm_HALOEXCHANGE, params) { setDefaultReps(200); diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp index f7a93bd7f..5618449c7 100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -50,7 +50,7 @@ #define RAJAPerf_Comm_HALOEXCHANGE_HPP #define HALOEXCHANGE_DATA_SETUP \ - HALOEXCHANGE_base_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ @@ -58,7 +58,7 @@ std::vector buffers = m_buffers; -#include "HALOEXCHANGE_base.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -67,7 +67,7 @@ namespace rajaperf namespace comm { -class HALOEXCHANGE : public HALOEXCHANGE_base +class HALOEXCHANGE : public HALO_base { public: diff --git a/src/comm/HALOEXCHANGE_FUSED.cpp b/src/comm/HALOEXCHANGE_FUSED.cpp index 3c4d5440f..4aa14b544 100644 --- a/src/comm/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOEXCHANGE_FUSED.cpp @@ -18,7 +18,7 @@ namespace comm { HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) + : HALO_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) { setDefaultReps(200); diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp index 305ee147b..1053c8517 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -52,7 +52,7 @@ #define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP #define HALOEXCHANGE_FUSED_DATA_SETUP \ - HALOEXCHANGE_base_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ @@ -104,7 +104,7 @@ delete[]
unpack_lens; -#include "HALOEXCHANGE_base.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -113,7 +113,7 @@ namespace rajaperf namespace comm { -class HALOEXCHANGE_FUSED : public HALOEXCHANGE_base +class HALOEXCHANGE_FUSED : public HALO_base { public: diff --git a/src/comm/HALOEXCHANGE_base.cpp b/src/comm/HALO_base.cpp similarity index 92% rename from src/comm/HALOEXCHANGE_base.cpp rename to src/comm/HALO_base.cpp index 41f3c89b7..904fbfc69 100644 --- a/src/comm/HALOEXCHANGE_base.cpp +++ b/src/comm/HALO_base.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_base.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -19,11 +19,11 @@ namespace rajaperf namespace comm { -Index_type HALOEXCHANGE_base::s_grid_dims_default[3] {100, 100, 100}; -Index_type HALOEXCHANGE_base::s_halo_width_default = 1; -Index_type HALOEXCHANGE_base::s_num_vars_default = 3; +Index_type HALO_base::s_grid_dims_default[3] {100, 100, 100}; +Index_type HALO_base::s_halo_width_default = 1; +Index_type HALO_base::s_num_vars_default = 3; -HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) +HALO_base::HALO_base(KernelID kid, const RunParams& params) : KernelBase(kid, params) { setDefaultProblemSize( s_grid_dims_default[0] * @@ -47,11 +47,11 @@ HALOEXCHANGE_base::HALOEXCHANGE_base(KernelID kid, const RunParams& params) setActualProblemSize( m_grid_dims[0] * m_grid_dims[1] * m_grid_dims[1] ); } -HALOEXCHANGE_base::~HALOEXCHANGE_base() +HALO_base::~HALO_base() { } -void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, +void HALO_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_mpi_ranks.resize(s_num_neighbors, -1); @@ -68,7 +68,7 @@ void HALOEXCHANGE_base::setUp_base(const int my_mpi_rank, const int* mpi_dims, s_num_neighbors, vid); } -void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { destroy_lists(m_pack_index_lists, m_unpack_index_lists, s_num_neighbors, vid); m_unpack_index_list_lengths.clear(); @@ -81,7 +81,7 @@ void HALOEXCHANGE_base::tearDown_base(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } -const int HALOEXCHANGE_base::s_boundary_offsets[HALOEXCHANGE_base::s_num_neighbors][3]{ +const int HALO_base::s_boundary_offsets[HALO_base::s_num_neighbors][3]{ // faces {-1, 0, 0}, @@ -117,8 +117,8 @@ const int HALOEXCHANGE_base::s_boundary_offsets[HALOEXCHANGE_base::s_num_neighbo }; -HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( - const HALOEXCHANGE_base::message_type msg_type, +HALO_base::Extent HALO_base::make_boundary_extent( + const HALO_base::message_type msg_type, const int (&boundary_offset)[3], const Index_type halo_width, const Index_type* grid_dims) { @@ -168,7 +168,7 @@ HALOEXCHANGE_base::Extent HALOEXCHANGE_base::make_boundary_extent( // // Function to generate mpi decomposition and index lists for packing and unpacking. // -void HALOEXCHANGE_base::create_lists( +void HALO_base::create_lists( int my_mpi_rank, const int* mpi_dims, std::vector& mpi_ranks, @@ -295,7 +295,7 @@ void HALOEXCHANGE_base::create_lists( // // Function to destroy packing and unpacking index lists. 
// -void HALOEXCHANGE_base::destroy_lists( +void HALO_base::destroy_lists( std::vector& pack_index_lists, std::vector& unpack_index_lists, const Index_type num_neighbors, diff --git a/src/comm/HALOEXCHANGE_base.hpp b/src/comm/HALO_base.hpp similarity index 94% rename from src/comm/HALOEXCHANGE_base.hpp rename to src/comm/HALO_base.hpp index 04fc01e33..f64be131d 100644 --- a/src/comm/HALOEXCHANGE_base.hpp +++ b/src/comm/HALO_base.hpp @@ -42,10 +42,10 @@ /// } /// -#ifndef RAJAPerf_Comm_HALOEXCHANGE_base_HPP -#define RAJAPerf_Comm_HALOEXCHANGE_base_HPP +#ifndef RAJAPerf_Comm_HALO_BASE_HPP +#define RAJAPerf_Comm_HALO_BASE_HPP -#define HALOEXCHANGE_base_DATA_SETUP \ +#define HALO_BASE_DATA_SETUP \ Index_type num_neighbors = s_num_neighbors; \ std::vector send_tags = m_send_tags; \ std::vector pack_index_lists = m_pack_index_lists; \ @@ -74,13 +74,13 @@ class RunParams; namespace comm { -class HALOEXCHANGE_base : public KernelBase +class HALO_base : public KernelBase { public: - HALOEXCHANGE_base(KernelID kid, const RunParams& params); + HALO_base(KernelID kid, const RunParams& params); - ~HALOEXCHANGE_base(); + ~HALO_base(); void setUp_base(const int my_mpi_rank, const int* mpi_dims, VariantID vid, size_t tune_idx); diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp index 684fe1c48..a8ec6fcbd 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -20,7 +20,7 @@ namespace comm { MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Comm_MPI_HALOEXCHANGE, params) + : HALO_base(rajaperf::Comm_MPI_HALOEXCHANGE, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/MPI_HALOEXCHANGE.hpp index 83569304d..51f8e0da0 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/MPI_HALOEXCHANGE.hpp @@ -60,7 +60,7 @@ #define RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP #define MPI_HALOEXCHANGE_DATA_SETUP \ - HALOEXCHANGE_base_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ @@ -81,7 +81,7 @@ std::vector recv_buffers = m_recv_buffers; -#include "HALOEXCHANGE_base.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -95,7 +95,7 @@ namespace rajaperf namespace comm { -class MPI_HALOEXCHANGE : public HALOEXCHANGE_base +class MPI_HALOEXCHANGE : public HALO_base { public: diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 9706c0471..406e5cc7c 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -20,7 +20,7 @@ namespace comm { MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) - : HALOEXCHANGE_base(rajaperf::Comm_MPI_HALOEXCHANGE_FUSED, params) + : HALO_base(rajaperf::Comm_MPI_HALOEXCHANGE_FUSED, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index fde967245..e21b040f6 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -63,7 +63,7 @@ #define RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP #define MPI_HALOEXCHANGE_FUSED_DATA_SETUP \ - HALOEXCHANGE_base_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ @@ -128,7 +128,7 @@ delete[] unpack_lens; -#include "HALOEXCHANGE_base.hpp" +#include "HALO_base.hpp" #include "RAJA/RAJA.hpp" @@ -139,7 +139,7 @@ namespace rajaperf namespace comm { -class 
MPI_HALOEXCHANGE_FUSED : public HALOEXCHANGE_base +class MPI_HALOEXCHANGE_FUSED : public HALO_base { public: From 99bf055ff97a579f44f1c6c437ee1bffd99b0cbd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 14:01:07 -0800 Subject: [PATCH 136/454] Rename HALOEXCHANGE to HALOPACKING Differentiate the HALO kernels based on what they actually do. Change the non-MPI kernels to HALOPACKING as they only do packing/unpacking and don't actually exchange the data via MPI. --- src/CMakeLists.txt | 12 +++---- src/comm/CMakeLists.txt | 24 +++++++------- ...EXCHANGE-Cuda.cpp => HALOPACKING-Cuda.cpp} | 18 +++++------ ...LOEXCHANGE-Hip.cpp => HALOPACKING-Hip.cpp} | 18 +++++------ ...LOEXCHANGE-OMP.cpp => HALOPACKING-OMP.cpp} | 20 ++++++------ ...MPTarget.cpp => HALOPACKING-OMPTarget.cpp} | 16 +++++----- ...LOEXCHANGE-Seq.cpp => HALOPACKING-Seq.cpp} | 20 ++++++------ .../{HALOEXCHANGE.cpp => HALOPACKING.cpp} | 14 ++++---- .../{HALOEXCHANGE.hpp => HALOPACKING.hpp} | 14 ++++---- ...ED-Cuda.cpp => HALOPACKING_FUSED-Cuda.cpp} | 26 +++++++-------- ...USED-Hip.cpp => HALOPACKING_FUSED-Hip.cpp} | 26 +++++++-------- ...USED-OMP.cpp => HALOPACKING_FUSED-OMP.cpp} | 28 ++++++++-------- ...et.cpp => HALOPACKING_FUSED-OMPTarget.cpp} | 32 +++++++++---------- ...USED-Seq.cpp => HALOPACKING_FUSED-Seq.cpp} | 24 +++++++------- ...CHANGE_FUSED.cpp => HALOPACKING_FUSED.cpp} | 14 ++++---- ...CHANGE_FUSED.hpp => HALOPACKING_FUSED.hpp} | 26 +++++++-------- src/comm/HALO_base.hpp | 6 ++-- src/comm/MPI_HALOEXCHANGE-Cuda.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE-Hip.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE-OMP.cpp | 12 +++---- src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE-Seq.cpp | 12 +++---- src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp | 12 +++---- src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp | 8 ++--- src/comm/MPI_HALOEXCHANGE_FUSED.hpp | 4 +-- src/common/Executor.cpp | 4 +-- src/common/RAJAPerfSuite.cpp | 16 +++++----- src/common/RAJAPerfSuite.hpp | 4 +-- test/test-raja-perf-suite.cpp | 2 +- 32 files changed, 230 insertions(+), 230 deletions(-) rename src/comm/{HALOEXCHANGE-Cuda.cpp => HALOPACKING-Cuda.cpp} (91%) rename src/comm/{HALOEXCHANGE-Hip.cpp => HALOPACKING-Hip.cpp} (91%) rename src/comm/{HALOEXCHANGE-OMP.cpp => HALOPACKING-OMP.cpp} (90%) rename src/comm/{HALOEXCHANGE-OMPTarget.cpp => HALOPACKING-OMPTarget.cpp} (89%) rename src/comm/{HALOEXCHANGE-Seq.cpp => HALOPACKING-Seq.cpp} (90%) rename src/comm/{HALOEXCHANGE.cpp => HALOPACKING.cpp} (88%) rename src/comm/{HALOEXCHANGE.hpp => HALOPACKING.hpp} (91%) rename src/comm/{HALOEXCHANGE_FUSED-Cuda.cpp => HALOPACKING_FUSED-Cuda.cpp} (93%) rename src/comm/{HALOEXCHANGE_FUSED-Hip.cpp => HALOPACKING_FUSED-Hip.cpp} (93%) rename src/comm/{HALOEXCHANGE_FUSED-OMP.cpp => HALOPACKING_FUSED-OMP.cpp} (93%) rename src/comm/{HALOEXCHANGE_FUSED-OMPTarget.cpp => HALOPACKING_FUSED-OMPTarget.cpp} (90%) rename src/comm/{HALOEXCHANGE_FUSED-Seq.cpp => HALOPACKING_FUSED-Seq.cpp} (92%) rename src/comm/{HALOEXCHANGE_FUSED.cpp => HALOPACKING_FUSED.cpp} (86%) rename src/comm/{HALOEXCHANGE_FUSED.hpp => HALOPACKING_FUSED.hpp} (88%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d6a94844..f1d2c61c6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -245,12 +245,12 @@ blt_add_executable( algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp comm/HALO_base.cpp - comm/HALOEXCHANGE.cpp -
comm/HALOEXCHANGE-Seq.cpp - comm/HALOEXCHANGE-OMPTarget.cpp - comm/HALOEXCHANGE_FUSED.cpp - comm/HALOEXCHANGE_FUSED-Seq.cpp - comm/HALOEXCHANGE_FUSED-OMPTarget.cpp + comm/HALOPACKING.cpp + comm/HALOPACKING-Seq.cpp + comm/HALOPACKING-OMPTarget.cpp + comm/HALOPACKING_FUSED.cpp + comm/HALOPACKING_FUSED-Seq.cpp + comm/HALOPACKING_FUSED-OMPTarget.cpp comm/MPI_HALOEXCHANGE.cpp comm/MPI_HALOEXCHANGE-Seq.cpp comm/MPI_HALOEXCHANGE-OMPTarget.cpp diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index 8e6abfa46..6731135da 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -9,18 +9,18 @@ blt_add_library( NAME comm SOURCES HALO_base.cpp - HALOEXCHANGE.cpp - HALOEXCHANGE-Seq.cpp - HALOEXCHANGE-Hip.cpp - HALOEXCHANGE-Cuda.cpp - HALOEXCHANGE-OMP.cpp - HALOEXCHANGE-OMPTarget.cpp - HALOEXCHANGE_FUSED.cpp - HALOEXCHANGE_FUSED-Seq.cpp - HALOEXCHANGE_FUSED-Hip.cpp - HALOEXCHANGE_FUSED-Cuda.cpp - HALOEXCHANGE_FUSED-OMP.cpp - HALOEXCHANGE_FUSED-OMPTarget.cpp + HALOPACKING.cpp + HALOPACKING-Seq.cpp + HALOPACKING-Hip.cpp + HALOPACKING-Cuda.cpp + HALOPACKING-OMP.cpp + HALOPACKING-OMPTarget.cpp + HALOPACKING_FUSED.cpp + HALOPACKING_FUSED-Seq.cpp + HALOPACKING_FUSED-Hip.cpp + HALOPACKING_FUSED-Cuda.cpp + HALOPACKING_FUSED-OMP.cpp + HALOPACKING_FUSED-OMPTarget.cpp MPI_HALOEXCHANGE.cpp MPI_HALOEXCHANGE-Seq.cpp MPI_HALOEXCHANGE-Hip.cpp diff --git a/src/comm/HALOEXCHANGE-Cuda.cpp b/src/comm/HALOPACKING-Cuda.cpp similarity index 91% rename from src/comm/HALOEXCHANGE-Cuda.cpp rename to src/comm/HALOPACKING-Cuda.cpp index 1158cb4cc..960eeca02 100644 --- a/src/comm/HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALOPACKING-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -29,7 +29,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -41,19 +41,19 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +void HALOPACKING::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_DATA_SETUP; + HALOPACKING_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -109,7 +109,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), @@ -126,7 +126,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), @@ -140,11 +140,11 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE-Hip.cpp b/src/comm/HALOPACKING-Hip.cpp similarity index 91% rename from src/comm/HALOEXCHANGE-Hip.cpp rename to src/comm/HALOPACKING-Hip.cpp index c190c262e..ba12a1acf 100644 --- a/src/comm/HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALOPACKING-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -29,7 +29,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -41,19 +41,19 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE::runHipVariantImpl(VariantID vid) +void HALOPACKING::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_DATA_SETUP; + HALOPACKING_DATA_SETUP; if ( vid == Base_HIP ) { @@ -111,7 +111,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), @@ -128,7 +128,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), @@ -142,11 +142,11 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE-OMP.cpp b/src/comm/HALOPACKING-OMP.cpp similarity index 90% rename from src/comm/HALOEXCHANGE-OMP.cpp rename to src/comm/HALOPACKING-OMP.cpp index 823ad5940..fb0b02502 100644 --- a/src/comm/HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALOPACKING-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,13 +18,13 @@ namespace comm { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOPACKING_DATA_SETUP; switch ( vid ) { @@ -41,7 +41,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } 
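// ---------------------------------------------------------------------------
// Note: HALO_PACK_BODY and HALO_UNPACK_BODY (renamed from the HALOEXCHANGE_*
// bodies above) gather and scatter variable values through a per-neighbor
// index list. A minimal sketch of the intended semantics, written as plain
// functions rather than the suite's macros, with types simplified to
// double/int (an assumption for illustration):

// pack: gather the var entries named by list into a contiguous send buffer
inline void halo_pack(double* buffer, const int* list, const double* var,
                      int len)
{
  for (int i = 0; i < len; ++i) {
    buffer[i] = var[list[i]];   // HALO_PACK_BODY
  }
}

// unpack: scatter received buffer entries back into var
inline void halo_unpack(const double* buffer, const int* list, double* var,
                        int len)
{
  for (int i = 0; i < len; ++i) {
    var[list[i]] = buffer[i];   // HALO_UNPACK_BODY
  }
}
// ---------------------------------------------------------------------------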
@@ -55,7 +55,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -79,7 +79,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { @@ -96,7 +96,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { @@ -126,7 +126,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -142,7 +142,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -158,7 +158,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALOPACKING-OMPTarget.cpp similarity index 89% rename from src/comm/HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALOPACKING-OMPTarget.cpp index 007a09f1c..f50cb7edb 100644 --- a/src/comm/HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALOPACKING-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOPACKING_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -47,7 +47,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } @@ -62,7 +62,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -85,7 +85,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; 
v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -101,7 +101,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -114,7 +114,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE-Seq.cpp b/src/comm/HALOPACKING-Seq.cpp similarity index 90% rename from src/comm/HALOEXCHANGE-Seq.cpp rename to src/comm/HALOPACKING-Seq.cpp index aa444af74..3dbc1978c 100644 --- a/src/comm/HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALOPACKING-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,11 +18,11 @@ namespace comm { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALOPACKING_DATA_SETUP; switch ( vid ) { @@ -38,7 +38,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } @@ -51,7 +51,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -76,7 +76,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { haloexchange_pack_base_lam(i); @@ -92,7 +92,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { haloexchange_unpack_base_lam(i); @@ -121,7 +121,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -137,7 +137,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -154,7 +154,7 @@ void 
HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE.cpp b/src/comm/HALOPACKING.cpp similarity index 88% rename from src/comm/HALOEXCHANGE.cpp rename to src/comm/HALOPACKING.cpp index 355a22fbf..9229d2446 100644 --- a/src/comm/HALOEXCHANGE.cpp +++ b/src/comm/HALOPACKING.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALOPACKING.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOEXCHANGE, params) +HALOPACKING::HALOPACKING(const RunParams& params) + : HALO_base(rajaperf::Comm_HALOPACKING, params) { setDefaultReps(200); @@ -53,11 +53,11 @@ HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) setVariantDefined( RAJA_HIP ); } -HALOEXCHANGE::~HALOEXCHANGE() +HALOPACKING::~HALOPACKING() { } -void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) +void HALOPACKING::setUp(VariantID vid, size_t tune_idx) { int my_mpi_rank = 0; const int mpi_dims[3] = {1,1,1}; @@ -82,14 +82,14 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +void HALOPACKING::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) +void HALOPACKING::tearDown(VariantID vid, size_t tune_idx) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l], vid); diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALOPACKING.hpp similarity index 91% rename from src/comm/HALOEXCHANGE.hpp rename to src/comm/HALOPACKING.hpp index 5618449c7..e7c6c9480 100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALOPACKING.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE kernel reference implementation: +/// HALOPACKING kernel reference implementation: /// /// // post a recv for each neighbor /// @@ -46,10 +46,10 @@ /// // wait for all sends to complete /// -#ifndef RAJAPerf_Comm_HALOEXCHANGE_HPP -#define RAJAPerf_Comm_HALOEXCHANGE_HPP +#ifndef RAJAPerf_Comm_HALOPACKING_HPP +#define RAJAPerf_Comm_HALOPACKING_HPP -#define HALOEXCHANGE_DATA_SETUP \ +#define HALOPACKING_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -67,13 +67,13 @@ namespace rajaperf namespace comm { -class HALOEXCHANGE : public HALO_base +class HALOPACKING : public HALO_base { public: - HALOEXCHANGE(const RunParams& params); + HALOPACKING(const RunParams& params); - ~HALOEXCHANGE(); + ~HALOPACKING(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALOPACKING_FUSED-Cuda.cpp similarity index 93% rename from src/comm/HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALOPACKING_FUSED-Cuda.cpp index ffa28ae35..d50000666 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALOPACKING_FUSED-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 
-#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -83,23 +83,23 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOPACKING_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALOPACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -161,7 +161,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; } else if ( vid == RAJA_CUDA ) { @@ -205,7 +205,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -224,7 +224,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -240,11 +240,11 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING_FUSED, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/HALOPACKING_FUSED-Hip.cpp similarity index 93% rename from src/comm/HALOEXCHANGE_FUSED-Hip.cpp rename to src/comm/HALOPACKING_FUSED-Hip.cpp index fa9a1b0ba..77e96bfdd 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALOPACKING_FUSED-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ +#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -83,23 +83,23 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) +void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOPACKING_FUSED_DATA_SETUP; if ( vid == Base_HIP ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + HALOPACKING_FUSED_MANUAL_FUSER_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -161,7 +161,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; } else if ( vid == RAJA_HIP ) { @@ -209,7 +209,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -228,7 +228,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -244,11 +244,11 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING_FUSED, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALOPACKING_FUSED-OMP.cpp similarity index 93% rename from src/comm/HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALOPACKING_FUSED-OMP.cpp index b5562ad2a..61a3186e5 100644 --- a/src/comm/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALOPACKING_FUSED-OMP.cpp @@ -6,7 +6,7 @@ // 
SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,19 +18,19 @@ namespace comm { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOPACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALOPACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -61,7 +61,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } } @@ -73,7 +73,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } #endif @@ -104,7 +104,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -116,7 +116,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } #endif @@ -124,14 +124,14 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -217,7 +217,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -265,7 +265,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -283,7 +283,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -301,7 +301,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALOPACKING_FUSED-OMPTarget.cpp similarity index 
90% rename from src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALOPACKING_FUSED-OMPTarget.cpp index 85141315a..6707258e2 100644 --- a/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALOPACKING_FUSED-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -26,7 +26,7 @@ namespace comm // //const size_t threads_per_team = 256; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace comm Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALOPACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALOPACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOPACKING_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALOPACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -94,7 +94,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALOPACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -107,7 +107,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = pack_len_ptrs[j]; for (Index_type i = ii; i < len; i += pack_len_ave) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } } @@ -130,7 +130,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALOPACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -143,7 +143,7 @@ void 
HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = unpack_len_ptrs[j]; for (Index_type i = ii; i < len; i += unpack_len_ave) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -151,7 +151,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -196,7 +196,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -214,7 +214,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -229,7 +229,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALOPACKING_FUSED-Seq.cpp similarity index 92% rename from src/comm/HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/HALOPACKING_FUSED-Seq.cpp index b6b8c2e27..d4abf1a7a 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALOPACKING_FUSED-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,17 +18,17 @@ namespace comm { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALOPACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALOPACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -53,7 +53,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -77,14 +77,14 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -92,7 +92,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -144,7 +144,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -192,7 +192,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -210,7 +210,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -229,7 +229,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOPACKING_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED.cpp b/src/comm/HALOPACKING_FUSED.cpp similarity index 86% rename from src/comm/HALOEXCHANGE_FUSED.cpp rename to src/comm/HALOPACKING_FUSED.cpp index 4aa14b544..50c54270e 100644 --- a/src/comm/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOPACKING_FUSED.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALOPACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) +HALOPACKING_FUSED::HALOPACKING_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALOPACKING_FUSED, params) { setDefaultReps(200); @@ -53,11 +53,11 @@ HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) setVariantDefined( RAJA_HIP ); } -HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() +HALOPACKING_FUSED::~HALOPACKING_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +void HALOPACKING_FUSED::setUp(VariantID vid, size_t tune_idx) { int my_mpi_rank = 0; const int mpi_dims[3] = {1,1,1}; @@ -82,14 +82,14 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +void HALOPACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +void HALOPACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) { for (int l = 0; l < s_num_neighbors; ++l) { deallocData(m_buffers[l], vid); diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALOPACKING_FUSED.hpp similarity index 88% rename from src/comm/HALOEXCHANGE_FUSED.hpp rename to src/comm/HALOPACKING_FUSED.hpp index 1053c8517..387872527 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOPACKING_FUSED.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE_FUSED kernel reference implementation: +/// HALOPACKING_FUSED kernel reference implementation: /// /// // post a recv for each neighbor /// @@ -48,10 +48,10 @@ /// // wait for all sends to complete /// -#ifndef 
RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALOPACKING_FUSED_HPP +#define RAJAPerf_Comm_HALOPACKING_FUSED_HPP -#define HALOEXCHANGE_FUSED_DATA_SETUP \ +#define HALOPACKING_FUSED_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -59,7 +59,7 @@ \ std::vector buffers = m_buffers; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ +#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -70,17 +70,17 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_PACK_BODY; \ + HALO_PACK_BODY; \ }; \ }; \ using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -89,7 +89,7 @@ Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_UNPACK_BODY; \ + HALO_UNPACK_BODY; \ }; \ }; \ using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -97,7 +97,7 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ @@ -113,13 +113,13 @@ namespace rajaperf namespace comm { -class HALOEXCHANGE_FUSED : public HALO_base +class HALOPACKING_FUSED : public HALO_base { public: - HALOEXCHANGE_FUSED(const RunParams& params); + HALOPACKING_FUSED(const RunParams& params); - ~HALOEXCHANGE_FUSED(); + ~HALOPACKING_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index f64be131d..f7cd2cd26 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE kernel reference implementation: +/// HALOPACKING kernel reference implementation: /// /// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -54,10 +54,10 @@ std::vector unpack_index_lists = m_unpack_index_lists; \ std::vector unpack_index_list_lengths = m_unpack_index_list_lengths; -#define HALOEXCHANGE_PACK_BODY \ +#define HALO_PACK_BODY \ buffer[i] = var[list[i]]; -#define HALOEXCHANGE_UNPACK_BODY \ +#define HALO_UNPACK_BODY \ var[list[i]] = buffer[i]; diff --git a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp index 33cc728cf..a712b1854 100644 --- a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp @@ -29,7 +29,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -41,7 +41,7 @@ 
__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } @@ -141,7 +141,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall(res, RAJA::TypedRangeSegment(0, len), @@ -176,7 +176,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall(res, RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE-Hip.cpp b/src/comm/MPI_HALOEXCHANGE-Hip.cpp index 7a46f0977..2c5b2c174 100644 --- a/src/comm/MPI_HALOEXCHANGE-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Hip.cpp @@ -29,7 +29,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -41,7 +41,7 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type i = threadIdx.x + blockIdx.x * block_size; if (i < len) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } @@ -143,7 +143,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), @@ -178,7 +178,7 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE-OMP.cpp b/src/comm/MPI_HALOEXCHANGE-OMP.cpp index 636a57f8a..fa29518b3 100644 --- a/src/comm/MPI_HALOEXCHANGE-OMP.cpp +++ b/src/comm/MPI_HALOEXCHANGE-OMP.cpp @@ -49,7 +49,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } @@ -81,7 +81,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR Real_ptr var = vars[v]; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -113,7 +113,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { @@ -148,7 +148,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { @@ -186,7 +186,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t 
RAJAPERF_UNUSED_AR for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -220,7 +220,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp index 9b47d1c64..8da8400f3 100644 --- a/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp @@ -53,7 +53,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } @@ -86,7 +86,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU #pragma omp target is_device_ptr(buffer, list, var) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -117,7 +117,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -151,7 +151,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE-Seq.cpp b/src/comm/MPI_HALOEXCHANGE-Seq.cpp index 1346e6d92..36415aa92 100644 --- a/src/comm/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Seq.cpp @@ -46,7 +46,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } buffer += len; } @@ -77,7 +77,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } buffer += len; } @@ -110,7 +110,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { haloexchange_pack_base_lam(i); @@ -144,7 +144,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { haloexchange_unpack_base_lam(i); @@ -181,7 +181,7 @@ void 
MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), @@ -215,7 +215,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp index 0e3c9b1ea..8b1e91e01 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -83,7 +83,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } @@ -240,7 +240,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -280,7 +280,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp index ae47dea02..ca2f24f67 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -64,7 +64,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } @@ -83,7 +83,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* for (Index_type i = threadIdx.x + blockIdx.x * block_size; i < len; i += block_size * gridDim.x) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } @@ -244,7 +244,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -284,7 +284,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp index 23722763a..5b0a41a64 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ 
b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -68,7 +68,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } } @@ -80,7 +80,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU Real_ptr var = pack_ptr_holders[j].var; Index_type len = pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } #endif @@ -131,7 +131,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -143,7 +143,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } #endif @@ -327,7 +327,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -366,7 +366,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp index 9dc1e3ec0..61b9536d0 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -113,7 +113,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE Index_type len = pack_len_ptrs[j]; for (Index_type i = ii; i < len; i += pack_len_ave) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } } @@ -170,7 +170,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE Index_type len = unpack_len_ptrs[j]; for (Index_type i = ii; i < len; i += unpack_len_ave) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } } @@ -231,7 +231,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -270,7 +270,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp index 84ddbaa0e..01beb7b00 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp @@ -61,7 +61,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED Real_ptr var = pack_ptr_holders[j].var; Index_type len = 
pack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; } } if (separate_buffers) { @@ -106,7 +106,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED Real_ptr var = unpack_ptr_holders[j].var; Index_type len = unpack_lens[j]; for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; } } @@ -258,7 +258,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; + HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), @@ -297,7 +297,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp index e21b040f6..09a5e76dd 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.hpp @@ -104,7 +104,7 @@ #define MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_PACK_BODY; \ + HALO_PACK_BODY; \ }; \ }; \ using pack_lambda_type = decltype(make_pack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ @@ -113,7 +113,7 @@ Index_type* pack_lens = new Index_type[num_neighbors * num_vars]; \ auto make_unpack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ - HALOEXCHANGE_UNPACK_BODY; \ + HALO_UNPACK_BODY; \ }; \ }; \ using unpack_lambda_type = decltype(make_unpack_lambda(Real_ptr(), Int_ptr(), Real_ptr())); \ diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 75a4d3f1d..346145da8 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -24,7 +24,7 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" -#include "comm/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALOPACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) #include "comm/MPI_HALOEXCHANGE_FUSED.hpp" #endif @@ -677,7 +677,7 @@ void Executor::runWarmupKernels() kernel_ids.insert(Basic_INDEXLIST_3LOOP); break; case Workgroup: - kernel_ids.insert(Comm_HALOEXCHANGE_FUSED); break; + kernel_ids.insert(Comm_HALOPACKING_FUSED); break; case Reduction: kernel_ids.insert(Basic_REDUCE3_INT); break; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 364bb3607..812f4442e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -108,8 +108,8 @@ // // Comm kernels... // -#include "comm/HALOEXCHANGE.hpp" -#include "comm/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALOPACKING.hpp" +#include "comm/HALOPACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) #include "comm/MPI_HALOEXCHANGE.hpp" #include "comm/MPI_HALOEXCHANGE_FUSED.hpp" @@ -257,8 +257,8 @@ static const std::string KernelNames [] = // // Comm kernels... 
// - std::string("Comm_HALOEXCHANGE"), - std::string("Comm_HALOEXCHANGE_FUSED"), + std::string("Comm_HALOPACKING"), + std::string("Comm_HALOPACKING_FUSED"), #if defined(RAJA_PERFSUITE_ENABLE_MPI) std::string("Comm_MPI_HALOEXCHANGE"), std::string("Comm_MPI_HALOEXCHANGE_FUSED"), @@ -988,12 +988,12 @@ KernelBase* getKernelObject(KernelID kid, // // Comm kernels... // - case Comm_HALOEXCHANGE : { - kernel = new comm::HALOEXCHANGE(run_params); + case Comm_HALOPACKING : { + kernel = new comm::HALOPACKING(run_params); break; } - case Comm_HALOEXCHANGE_FUSED : { - kernel = new comm::HALOEXCHANGE_FUSED(run_params); + case Comm_HALOPACKING_FUSED : { + kernel = new comm::HALOPACKING_FUSED(run_params); break; } #if defined(RAJA_PERFSUITE_ENABLE_MPI) diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index d1cd269fe..b388601b9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -168,8 +168,8 @@ enum KernelID { // // Comm kernels... // - Comm_HALOEXCHANGE, - Comm_HALOEXCHANGE_FUSED, + Comm_HALOPACKING, + Comm_HALOPACKING_FUSED, #if defined(RAJA_PERFSUITE_ENABLE_MPI) Comm_MPI_HALOEXCHANGE, Comm_MPI_HALOEXCHANGE_FUSED, diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index f19d9cd89..3b11e0326 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -63,7 +63,7 @@ TEST(ShortSuiteTest, Basic) (HIP_VERSION_MAJOR < 5 || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1)) sargv.emplace_back(std::string("--exclude-kernels")); - sargv.emplace_back(std::string("HALOEXCHANGE_FUSED")); + sargv.emplace_back(std::string("HALOPACKING_FUSED")); #endif #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11) From ad2cb05025396ff4716479b27c3b4e14bad7ccba Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 15:08:10 -0800 Subject: [PATCH 137/454] Add MPI_HALOSENDRECV kernel The intent is to measure just the MPI message passing time in MPI_HALOEXCHANGE. This time may be overlapped with packing and unpacking in the full MPI_HALOEXCHANGE kernel. 
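In rough terms, the timed section reduces to the following send/recv
pattern (a condensed sketch of the Base_Seq variant added below; the
send and receive buffers are pre-filled in setUp, so no packing or
unpacking work happens inside the timer):

  // post a recv for each neighbor
  for (Index_type l = 0; l < num_neighbors; ++l) {
    Index_type len = unpack_index_list_lengths[l];
    MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type,
              mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]);
  }

  // post a send for each neighbor
  for (Index_type l = 0; l < num_neighbors; ++l) {
    Index_type len = pack_index_list_lengths[l];
    MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type,
              mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]);
  }

  // wait for all recvs, then all sends, to complete
  MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE);
  MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE);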
--- src/CMakeLists.txt | 3 + src/comm/CMakeLists.txt | 6 ++ src/comm/MPI_HALOSENDRECV-Cuda.cpp | 63 +++++++++++ src/comm/MPI_HALOSENDRECV-Hip.cpp | 63 +++++++++++ src/comm/MPI_HALOSENDRECV-OMP.cpp | 74 +++++++++++++ src/comm/MPI_HALOSENDRECV-OMPTarget.cpp | 68 ++++++++++++ src/comm/MPI_HALOSENDRECV-Seq.cpp | 69 ++++++++++++ src/comm/MPI_HALOSENDRECV.cpp | 133 ++++++++++++++++++++++++ src/comm/MPI_HALOSENDRECV.hpp | 124 ++++++++++++++++++++++ src/common/KernelBase.hpp | 7 ++ src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + 12 files changed, 617 insertions(+) create mode 100644 src/comm/MPI_HALOSENDRECV-Cuda.cpp create mode 100644 src/comm/MPI_HALOSENDRECV-Hip.cpp create mode 100644 src/comm/MPI_HALOSENDRECV-OMP.cpp create mode 100644 src/comm/MPI_HALOSENDRECV-OMPTarget.cpp create mode 100644 src/comm/MPI_HALOSENDRECV-Seq.cpp create mode 100644 src/comm/MPI_HALOSENDRECV.cpp create mode 100644 src/comm/MPI_HALOSENDRECV.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f1d2c61c6..6266194e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -251,6 +251,9 @@ blt_add_executable( comm/HALOPACKING_FUSED.cpp comm/HALOPACKING_FUSED-Seq.cpp comm/HALOPACKING_FUSED-OMPTarget.cpp + comm/MPI_HALOSENDRECV.cpp + comm/MPI_HALOSENDRECV-Seq.cpp + comm/MPI_HALOSENDRECV-OMPTarget.cpp comm/MPI_HALOEXCHANGE.cpp comm/MPI_HALOEXCHANGE-Seq.cpp comm/MPI_HALOEXCHANGE-OMPTarget.cpp diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index 6731135da..91d4f6d3d 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -21,6 +21,12 @@ blt_add_library( HALOPACKING_FUSED-Cuda.cpp HALOPACKING_FUSED-OMP.cpp HALOPACKING_FUSED-OMPTarget.cpp + MPI_HALOSENDRECV.cpp + MPI_HALOSENDRECV-Seq.cpp + MPI_HALOSENDRECV-Hip.cpp + MPI_HALOSENDRECV-Cuda.cpp + MPI_HALOSENDRECV-OMP.cpp + MPI_HALOSENDRECV-OMPTarget.cpp MPI_HALOEXCHANGE.cpp MPI_HALOEXCHANGE-Seq.cpp MPI_HALOEXCHANGE-Hip.cpp diff --git a/src/comm/MPI_HALOSENDRECV-Cuda.cpp b/src/comm/MPI_HALOSENDRECV-Cuda.cpp new file mode 100644 index 000000000..9893de266 --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV-Cuda.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void MPI_HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOSENDRECV_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOSENDRECV : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/comm/MPI_HALOSENDRECV-Hip.cpp b/src/comm/MPI_HALOSENDRECV-Hip.cpp new file mode 100644 index 000000000..28775c15e --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV-Hip.cpp @@ -0,0 +1,63 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void MPI_HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOSENDRECV_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOSENDRECV : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/comm/MPI_HALOSENDRECV-OMP.cpp b/src/comm/MPI_HALOSENDRECV-OMP.cpp new file mode 100644 index 000000000..7b5e3bed4 --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV-OMP.cpp @@ -0,0 +1,74 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void MPI_HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + MPI_HALOSENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MPI_HALOSENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/MPI_HALOSENDRECV-OMPTarget.cpp b/src/comm/MPI_HALOSENDRECV-OMPTarget.cpp new file mode 100644 index 000000000..87a1d69d7 --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV-OMPTarget.cpp @@ -0,0 +1,68 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace comm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MPI_HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOSENDRECV_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + } else { + getCout() << "\n MPI_HALOSENDRECV : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace comm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/comm/MPI_HALOSENDRECV-Seq.cpp b/src/comm/MPI_HALOSENDRECV-Seq.cpp new file mode 100644 index 000000000..28c4938b6 --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV-Seq.cpp @@ -0,0 +1,69 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + + +void MPI_HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + MPI_HALOSENDRECV_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = unpack_index_list_lengths[l]; + MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, + mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); + } + + MPI_Waitall(num_neighbors, unpack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MPI_HALOSENDRECV : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/MPI_HALOSENDRECV.cpp b/src/comm/MPI_HALOSENDRECV.cpp new file mode 100644 index 000000000..328bd3b0c --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV.cpp @@ -0,0 +1,133 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MPI_HALOSENDRECV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include + +namespace rajaperf +{ +namespace comm +{ + +MPI_HALOSENDRECV::MPI_HALOSENDRECV(const RunParams& params) + : HALO_base(rajaperf::Comm_MPI_HALOSENDRECV, params) +{ + m_mpi_size = params.getMPISize(); + m_my_mpi_rank = params.getMPIRank(); + m_mpi_dims = params.getMPI3DDivision(); + + setDefaultReps(200); + + m_num_vars = s_num_vars_default; + m_var_size = m_grid_plus_halo_size ; + + setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); + setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); + setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack + (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send + (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv + (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack + (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setFLOPsPerRep(0); + + setUsesFeature(Forall); + setUsesFeature(MPI); + + if (params.validMPI3DDivision()) { + setVariantDefined( Base_Seq ); + + setVariantDefined( Base_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + + setVariantDefined( Base_HIP ); + } +} + +MPI_HALOSENDRECV::~MPI_HALOSENDRECV() +{ +} + +void MPI_HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) +{ + setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_send_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_send_buffers[l], buffer_len); + } + } + + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len); + } + } +} + +void MPI_HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_recv_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_recv_buffers[l], buffer_len, vid); + } + } +} + +void MPI_HALOSENDRECV::tearDown(VariantID vid, size_t tune_idx) +{ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_recv_buffers[l]); + } + } + m_recv_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + } else { + 
deallocData(getMPIDataSpace(vid), m_send_buffers[l]); + } + } + m_send_buffers.clear(); + + tearDown_base(vid, tune_idx); +} + +} // end namespace comm +} // end namespace rajaperf + +#endif diff --git a/src/comm/MPI_HALOSENDRECV.hpp b/src/comm/MPI_HALOSENDRECV.hpp new file mode 100644 index 000000000..db1a1e00e --- /dev/null +++ b/src/comm/MPI_HALOSENDRECV.hpp @@ -0,0 +1,124 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MPI_HALOSENDRECV kernel reference implementation: +/// +/// // post a recv for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Index_type len = unpack_index_list_lengths[l]; +/// MPI_Irecv(recv_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], recv_tags[l], MPI_COMM_WORLD, &unpack_mpi_requests[l]); +/// } +/// +/// // pack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// Real_ptr buffer = pack_buffers[l]; +/// Int_ptr list = pack_index_lists[l]; +/// Index_type len = pack_index_list_lengths[l]; +/// // pack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// buffer[i] = var[list[i]]; +/// } +/// buffer += len; +/// } +/// // send buffer to neighbor +/// MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, +/// mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); +/// } +/// +/// // unpack a buffer for each neighbor +/// for (Index_type l = 0; l < num_neighbors; ++l) { +/// // receive buffer from neighbor +/// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); +/// Real_ptr buffer = unpack_buffers[l]; +/// Int_ptr list = unpack_index_lists[l]; +/// Index_type len = unpack_index_list_lengths[l]; +/// // unpack part of each variable +/// for (Index_type v = 0; v < num_vars; ++v) { +/// Real_ptr var = vars[v]; +/// for (Index_type i = 0; i < len; i++) { +/// var[list[i]] = buffer[i]; +/// } +/// buffer += len; +/// } +/// } +/// +/// // wait for all sends to complete +/// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); +/// + + +#ifndef RAJAPerf_Comm_MPI_HALOSENDRECV_HPP +#define RAJAPerf_Comm_MPI_HALOSENDRECV_HPP + +#define MPI_HALOSENDRECV_DATA_SETUP \ + HALO_BASE_DATA_SETUP \ + \ + Index_type num_vars = m_num_vars; \ + \ + std::vector mpi_ranks = m_mpi_ranks; \ + \ + std::vector pack_mpi_requests(num_neighbors); \ + std::vector unpack_mpi_requests(num_neighbors); \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; + + +#include "HALO_base.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_PERFSUITE_ENABLE_MPI) + +#include +#include + +namespace rajaperf +{ +namespace comm +{ + +class MPI_HALOSENDRECV : public HALO_base +{ +public: + + MPI_HALOSENDRECV(const RunParams& params); + + ~MPI_HALOSENDRECV(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + 
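The MPI_HALOSENDRECV_DATA_SETUP macro above hoists members into locals so each variant body (and any lambda it builds) works with plain values. Roughly what it expands to; the element types were inferred from how the locals are used (MPI_Waitall takes the request vectors, MPI_Isend takes the buffer pointers), so treat them as assumptions rather than verbatim suite code:

  Index_type num_vars = m_num_vars;

  std::vector<int> mpi_ranks = m_mpi_ranks;

  std::vector<MPI_Request> pack_mpi_requests(num_neighbors);
  std::vector<MPI_Request> unpack_mpi_requests(num_neighbors);

  std::vector<Real_ptr> send_buffers = m_send_buffers;
  std::vector<Real_ptr> recv_buffers = m_recv_buffers;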
void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + +private: + int m_mpi_size = -1; + int m_my_mpi_rank = -1; + std::array m_mpi_dims = {-1, -1, -1}; + + Index_type m_num_vars; + Index_type m_var_size; + + std::vector m_send_buffers; + std::vector m_recv_buffers; +}; + +} // end namespace comm +} // end namespace rajaperf + +#endif +#endif // closing endif for header file include guard diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index c130bdbaa..a9d8f6935 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -348,6 +348,13 @@ class KernelBase rajaperf::detail::initData(d); } + template + long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, VariantID RAJAPERF_UNUSED_ARG(vid)) + { + return rajaperf::calcChecksum(dataSpace, + ptr, len, getDataAlignment(), 1.0); + } + template long double calcChecksum(T* ptr, Size_type len, VariantID vid) { diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 812f4442e..c2fb6bae1 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -111,6 +111,7 @@ #include "comm/HALOPACKING.hpp" #include "comm/HALOPACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) +#include "comm/MPI_HALOSENDRECV.hpp" #include "comm/MPI_HALOEXCHANGE.hpp" #include "comm/MPI_HALOEXCHANGE_FUSED.hpp" #endif @@ -260,6 +261,7 @@ static const std::string KernelNames [] = std::string("Comm_HALOPACKING"), std::string("Comm_HALOPACKING_FUSED"), #if defined(RAJA_PERFSUITE_ENABLE_MPI) + std::string("Comm_MPI_HALOSENDRECV"), std::string("Comm_MPI_HALOEXCHANGE"), std::string("Comm_MPI_HALOEXCHANGE_FUSED"), #endif @@ -997,6 +999,10 @@ KernelBase* getKernelObject(KernelID kid, break; } #if defined(RAJA_PERFSUITE_ENABLE_MPI) + case Comm_MPI_HALOSENDRECV : { + kernel = new comm::MPI_HALOSENDRECV(run_params); + break; + } case Comm_MPI_HALOEXCHANGE : { kernel = new comm::MPI_HALOEXCHANGE(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index b388601b9..b5b95f9f2 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -171,6 +171,7 @@ enum KernelID { Comm_HALOPACKING, Comm_HALOPACKING_FUSED, #if defined(RAJA_PERFSUITE_ENABLE_MPI) + Comm_MPI_HALOSENDRECV, Comm_MPI_HALOEXCHANGE, Comm_MPI_HALOEXCHANGE_FUSED, #endif From 8671933e9edf71eed99bae35a7f656803cfa0558 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 15:36:44 -0800 Subject: [PATCH 138/454] Add MPI data spaces to output --- src/common/Executor.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 346145da8..a629548cb 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -424,6 +424,26 @@ void Executor::reportRunSummary(ostream& str) const } str << endl; + str << "\nMPI Data Spaces" + << "\n--------"; + str << "\nSeq - " << getDataSpaceName(run_params.getSeqMPIDataSpace()); + if (isVariantAvailable(VariantID::Base_OpenMP)) { + str << "\nOpenMP - " << getDataSpaceName(run_params.getOmpMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_OpenMPTarget)) { + str << "\nOpenMP Target - " << getDataSpaceName(run_params.getOmpTargetMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_CUDA)) { + str << "\nCuda - " << getDataSpaceName(run_params.getCudaMPIDataSpace()); + } + if (isVariantAvailable(VariantID::Base_HIP)) { + str << "\nHip - " << getDataSpaceName(run_params.getHipMPIDataSpace()); + } + if 
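Registering the new kernel, as the RAJAPerfSuite.cpp/.hpp hunks above show, touches three lists that must stay in step: the KernelID enum, the KernelNames table (index-matched to the enum), and the getKernelObject factory switch. A self-contained miniature of that pattern (one kernel and stub types only; the real tables hold every kernel in the suite):

  #include <string>

  enum KernelID { Comm_MPI_HALOSENDRECV, NumKernels };

  static const std::string KernelNames[] = {
    std::string("Comm_MPI_HALOSENDRECV")  // order must match KernelID
  };

  struct KernelBase { virtual ~KernelBase() = default; };
  struct MPI_HALOSENDRECV_Stub : KernelBase {};

  KernelBase* getKernelObject(KernelID kid) {
    switch (kid) {
      case Comm_MPI_HALOSENDRECV: return new MPI_HALOSENDRECV_Stub;
      default:                    return nullptr;
    }
  }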
(isVariantAvailable(VariantID::Kokkos_Lambda)) { + str << "\nKokkos - " << getDataSpaceName(run_params.getKokkosMPIDataSpace()); + } + str << endl; + str << "\nVariants and Tunings" << "\n--------\n"; for (size_t iv = 0; iv < variant_ids.size(); ++iv) { From 946695341a0078299693a80fa20e60a107cd3cf5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 16:29:51 -0800 Subject: [PATCH 139/454] Fix MPI_HALOSENDRECV bytesPerRep --- src/comm/MPI_HALOSENDRECV.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/comm/MPI_HALOSENDRECV.cpp b/src/comm/MPI_HALOSENDRECV.cpp index 328bd3b0c..a0bde59f8 100644 --- a/src/comm/MPI_HALOSENDRECV.cpp +++ b/src/comm/MPI_HALOSENDRECV.cpp @@ -32,13 +32,9 @@ MPI_HALOSENDRECV::MPI_HALOSENDRECV(const RunParams& params) m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); - setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setKernelsPerRep( 0 ); + setBytesPerRep( (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send + (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() ); // recv setFLOPsPerRep(0); setUsesFeature(Forall); From 4683e34b36eb3a88edf00ad5aacbefee77ddd9eb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 20:46:41 -0800 Subject: [PATCH 140/454] Fix ifdef in MPI_HALOEXCHANGE_FUSED-OMPTarget --- src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp index 61b9536d0..bf4bd9621 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -#if defined(RAJA_ENABLE_TARGET_OPENMP) && defined(RAJA_ENABLE_TARGET_OPENMP) +#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP) #include "common/OpenMPTargetDataUtils.hpp" From bd55cd80c89b4de356c3eab5051573e7549cc13c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 20:57:12 -0800 Subject: [PATCH 141/454] Fix sync locations in MPI_HALOEXCHANGE_FUSED GPU variants --- src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp | 4 ++-- src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp index 8b1e91e01..b3db3d473 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp @@ -136,7 +136,6 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) haloexchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); if (separate_buffers) { for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; @@ -145,6 +144,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) len*num_vars); } } + cudaErrchk( cudaStreamSynchronize( 
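The bytesPerRep fix above drops the pack/unpack traffic terms (that traffic belongs to the packing kernels, and kernelsPerRep is now 0 because this kernel launches none) and keeps only what MPI itself moves: each halo element is read once on send and written once on receive. In numbers, continuing the illustrative sizes from earlier (8-byte reals assumed):

  // bytes_per_rep = (1 read + 1 write) * sizeof(Real_type) * its_per_rep
  //               = 2 * 8 * its_per_rep
  // e.g. its_per_rep = 183,624  ->  2,937,984 bytes moved per rep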
res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, @@ -250,7 +250,6 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); - res.wait(); if (separate_buffers) { for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; @@ -259,6 +258,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) len*num_vars); } } + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp index ca2f24f67..980d303b3 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -136,7 +136,6 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); if (separate_buffers) { for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; @@ -145,6 +144,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) len*num_vars); } } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, @@ -254,7 +254,6 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); - res.wait(); if (separate_buffers) { for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; @@ -263,6 +262,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) len*num_vars); } } + res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, From a0c1e0216360451059a215326b587ceeca80dee6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:00:56 -0800 Subject: [PATCH 142/454] Remove unnecessary includes --- src/comm/HALOPACKING.cpp | 2 -- src/comm/HALOPACKING_FUSED.cpp | 2 -- src/comm/MPI_HALOEXCHANGE.cpp | 2 -- src/comm/MPI_HALOEXCHANGE_FUSED.cpp | 2 -- src/comm/MPI_HALOSENDRECV.cpp | 2 -- 5 files changed, 10 deletions(-) diff --git a/src/comm/HALOPACKING.cpp b/src/comm/HALOPACKING.cpp index 9229d2446..3caf54fc5 100644 --- a/src/comm/HALOPACKING.cpp +++ b/src/comm/HALOPACKING.cpp @@ -10,8 +10,6 @@ #include "RAJA/RAJA.hpp" -#include - namespace rajaperf { namespace comm diff --git a/src/comm/HALOPACKING_FUSED.cpp b/src/comm/HALOPACKING_FUSED.cpp index 50c54270e..96cf85ed4 100644 --- a/src/comm/HALOPACKING_FUSED.cpp +++ b/src/comm/HALOPACKING_FUSED.cpp @@ -10,8 +10,6 @@ #include "RAJA/RAJA.hpp" -#include - namespace rajaperf { namespace comm diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/MPI_HALOEXCHANGE.cpp index a8ec6fcbd..f9ad3f15d 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/MPI_HALOEXCHANGE.cpp @@ -12,8 +12,6 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include - namespace rajaperf { namespace comm diff --git 
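The sync-location change just above is the heart of patch 141: the stream synchronize must fence the device-to-host staging copies as well as the pack kernel, and it must still happen before any MPI_Isend reads a host buffer. A hedged sketch of the resulting order (CUDA shown; assumes the staging copy is enqueued on the same stream, as a stream-ordered copyData would be; names and launch shape are illustrative):

  #include <mpi.h>
  #include <cuda_runtime.h>

  __global__ void pack_kernel(double* buf, const int* list,
                              const double* var, int len) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < len) buf[i] = var[list[i]];  // gather halo cells into the buffer
  }

  // Pack, stage, fence once, then hand the host buffer to MPI.
  void pack_and_send(double* d_pack, double* h_send, const int* d_list,
                     const double* d_var, int len, int rank, int tag,
                     cudaStream_t stream, MPI_Request* req) {
    pack_kernel<<<(len + 255) / 256, 256, 0, stream>>>(d_pack, d_list, d_var, len);
    cudaMemcpyAsync(h_send, d_pack, len * sizeof(double),
                    cudaMemcpyDeviceToHost, stream);  // async staging copy
    cudaStreamSynchronize(stream);  // one sync covers the kernel *and* the copy
    MPI_Isend(h_send, len, MPI_DOUBLE, rank, tag, MPI_COMM_WORLD, req);
  }

Synchronizing before the copy, as the old code did, left the staging copy unfenced; synchronizing after it keeps a single sync point per rep while guaranteeing the host buffer is complete when MPI reads it.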
a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp index 406e5cc7c..196ea9cb2 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED.cpp @@ -12,8 +12,6 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include - namespace rajaperf { namespace comm diff --git a/src/comm/MPI_HALOSENDRECV.cpp b/src/comm/MPI_HALOSENDRECV.cpp index a0bde59f8..1631ea107 100644 --- a/src/comm/MPI_HALOSENDRECV.cpp +++ b/src/comm/MPI_HALOSENDRECV.cpp @@ -12,8 +12,6 @@ #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include - namespace rajaperf { namespace comm From 8c28c7382a4ef7b9df8c27ec513018caaa6ebd28 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:14:41 -0800 Subject: [PATCH 143/454] Change HALOPACKING to use multiple buffers This puts it in line with the HALOEXCHANGE implementation --- src/comm/HALOPACKING-Cuda.cpp | 38 +++++++++++--- src/comm/HALOPACKING-Hip.cpp | 38 +++++++++++--- src/comm/HALOPACKING-OMP.cpp | 54 ++++++++++++++++---- src/comm/HALOPACKING-OMPTarget.cpp | 36 ++++++++++--- src/comm/HALOPACKING-Seq.cpp | 59 ++++++++++++++++++---- src/comm/HALOPACKING.cpp | 60 ++++++++++++++++++++-- src/comm/HALOPACKING.hpp | 28 +++++++---- src/comm/HALOPACKING_FUSED-Cuda.cpp | 36 +++++++++++-- src/comm/HALOPACKING_FUSED-Hip.cpp | 38 ++++++++++++-- src/comm/HALOPACKING_FUSED-OMP.cpp | 56 ++++++++++++++++++--- src/comm/HALOPACKING_FUSED-OMPTarget.cpp | 36 +++++++++++-- src/comm/HALOPACKING_FUSED-Seq.cpp | 64 ++++++++++++++++++++---- src/comm/HALOPACKING_FUSED.cpp | 60 ++++++++++++++++++++-- src/comm/HALOPACKING_FUSED.hpp | 30 ++++++----- 14 files changed, 534 insertions(+), 99 deletions(-) diff --git a/src/comm/HALOPACKING-Cuda.cpp b/src/comm/HALOPACKING-Cuda.cpp index 960eeca02..89f04ef9a 100644 --- a/src/comm/HALOPACKING-Cuda.cpp +++ b/src/comm/HALOPACKING-Cuda.cpp @@ -61,7 +61,7 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -73,13 +73,26 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaGetLastError() ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -103,7 +116,7 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -116,13 +129,26 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) haloexchange_pack_base_lam ); buffer += len; } + + if 
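Per the kernels' reference comments, the pack and unpack bodies that these HALOPACKING hunks re-point at pack_buffers/unpack_buffers are an indexed gather and scatter; HALO_PACK_BODY and HALO_UNPACK_BODY reduce to loops like these:

  // Gather halo cells into a contiguous buffer, and scatter them back out.
  void pack(double* buffer, const int* list, const double* var, int len) {
    for (int i = 0; i < len; ++i) buffer[i] = var[list[i]];
  }
  void unpack(const double* buffer, const int* list, double* var, int len) {
    for (int i = 0; i < len; ++i) var[list[i]] = buffer[i];
  }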
(separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + res.wait(); } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALOPACKING-Hip.cpp b/src/comm/HALOPACKING-Hip.cpp index ba12a1acf..9579a5133 100644 --- a/src/comm/HALOPACKING-Hip.cpp +++ b/src/comm/HALOPACKING-Hip.cpp @@ -61,7 +61,7 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -74,13 +74,26 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) hipErrchk( hipGetLastError() ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + hipErrchk( hipStreamSynchronize( res.get_stream() ) ); } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -105,7 +118,7 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -118,13 +131,26 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + res.wait(); } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALOPACKING-OMP.cpp b/src/comm/HALOPACKING-OMP.cpp index fb0b02502..5fbb44133 100644 --- a/src/comm/HALOPACKING-OMP.cpp +++ b/src/comm/HALOPACKING-OMP.cpp @@ -34,7 +34,7 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < 
num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -45,12 +45,24 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for @@ -73,7 +85,7 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -87,12 +99,24 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -120,7 +144,7 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -133,12 +157,24 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING-OMPTarget.cpp b/src/comm/HALOPACKING-OMPTarget.cpp index f50cb7edb..752f22755 100644 --- a/src/comm/HALOPACKING-OMPTarget.cpp +++ b/src/comm/HALOPACKING-OMPTarget.cpp @@ -39,7 +39,7 @@ 
void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -51,12 +51,24 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) @@ -79,7 +91,7 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -92,12 +104,24 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING-Seq.cpp b/src/comm/HALOPACKING-Seq.cpp index 3dbc1978c..5db4c37e2 100644 --- a/src/comm/HALOPACKING-Seq.cpp +++ b/src/comm/HALOPACKING-Seq.cpp @@ -31,10 +31,11 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { @@ -42,12 +43,24 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + 
Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; for (Index_type i = 0; i < len; i++) { @@ -70,9 +83,9 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=](Index_type i) { @@ -83,12 +96,24 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i } buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { @@ -115,7 +140,7 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -128,12 +153,24 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i haloexchange_pack_base_lam ); buffer += len; } + + if (separate_buffers) { + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING.cpp b/src/comm/HALOPACKING.cpp index 3caf54fc5..fc335d1d9 100644 --- a/src/comm/HALOPACKING.cpp +++ b/src/comm/HALOPACKING.cpp @@ -73,10 +73,32 @@ void HALOPACKING::setUp(VariantID vid, size_t tune_idx) } } - m_buffers.resize(s_num_neighbors, nullptr); + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, 
m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } } } @@ -85,14 +107,44 @@ void HALOPACKING::updateChecksum(VariantID vid, size_t tune_idx) for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } } void HALOPACKING::tearDown(VariantID vid, size_t tune_idx) { + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } } - m_buffers.clear(); + m_send_buffers.clear(); + m_pack_buffers.clear(); for (int v = 0; v < m_num_vars; ++v) { deallocData(m_vars[v], vid); diff --git a/src/comm/HALOPACKING.hpp b/src/comm/HALOPACKING.hpp index e7c6c9480..507aaf622 100644 --- a/src/comm/HALOPACKING.hpp +++ b/src/comm/HALOPACKING.hpp @@ -9,11 +9,9 @@ /// /// HALOPACKING kernel reference implementation: /// -/// // post a recv for each neighbor -/// /// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; /// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable @@ -24,13 +22,11 @@ /// } /// buffer += len; /// } -/// // send buffer to neighbor /// } /// /// // unpack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive buffer from neighbor -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable @@ -43,8 +39,6 @@ /// } /// } /// -/// // wait for all sends to complete -/// #ifndef RAJAPerf_Comm_HALOPACKING_HPP #define RAJAPerf_Comm_HALOPACKING_HPP @@ -55,13 +49,23 @@ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ \ - std::vector buffers = m_buffers; + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers 
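The setUp logic above wires the buffers so the variant bodies can ignore where memory lives: with a unified MPI data space the send/recv pointers simply alias the pack/unpack buffers and the staging copies are skipped. A sketch of that decision (std::malloc stands in for the suite's allocAndInitData; "Copy" means MPI only ever sees host memory):

  #include <cstdlib>

  int main() {
    const int len = 1024;
    const bool separate_buffers = false;  // true when the MPI data space is Copy
    double* pack_buffer = static_cast<double*>(std::malloc(len * sizeof(double)));
    double* send_buffer = separate_buffers
        ? static_cast<double*>(std::malloc(len * sizeof(double)))  // host staging
        : pack_buffer;  // alias: MPI reads the packed data in place
    // ... pack into pack_buffer; if separate, copy it into send_buffer first ...
    if (send_buffer != pack_buffer) std::free(send_buffer);
    std::free(pack_buffer);
    return 0;
  }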
= (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector pack_buffers = m_pack_buffers; \ + std::vector unpack_buffers = m_unpack_buffers; \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; #include "HALO_base.hpp" #include "RAJA/RAJA.hpp" +#include + namespace rajaperf { namespace comm @@ -101,7 +105,11 @@ class HALOPACKING : public HALO_base std::vector m_vars; - std::vector m_buffers; + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; }; } // end namespace comm diff --git a/src/comm/HALOPACKING_FUSED-Cuda.cpp b/src/comm/HALOPACKING_FUSED-Cuda.cpp index d50000666..4c76b331a 100644 --- a/src/comm/HALOPACKING_FUSED-Cuda.cpp +++ b/src/comm/HALOPACKING_FUSED-Cuda.cpp @@ -110,7 +110,7 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -130,15 +130,29 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) haloexchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_buffer_ptrs[unpack_index] = buffer; @@ -199,7 +213,7 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -215,12 +229,26 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALOPACKING_FUSED-Hip.cpp b/src/comm/HALOPACKING_FUSED-Hip.cpp index 
77e96bfdd..0928fb6d7 100644 --- a/src/comm/HALOPACKING_FUSED-Hip.cpp +++ b/src/comm/HALOPACKING_FUSED-Hip.cpp @@ -110,7 +110,7 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -130,15 +130,29 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_buffer_ptrs[unpack_index] = buffer; @@ -203,7 +217,7 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -219,12 +233,26 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(res); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } res.wait(); for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALOPACKING_FUSED-OMP.cpp b/src/comm/HALOPACKING_FUSED-OMP.cpp index 61a3186e5..d9f224063 100644 --- a/src/comm/HALOPACKING_FUSED-OMP.cpp +++ b/src/comm/HALOPACKING_FUSED-OMP.cpp @@ -38,9 +38,9 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; 
pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -77,13 +77,27 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -139,7 +153,7 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -174,13 +188,27 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } } #endif + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -259,7 +287,7 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -275,11 +303,25 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING_FUSED-OMPTarget.cpp b/src/comm/HALOPACKING_FUSED-OMPTarget.cpp 
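The FUSED variants above avoid one launch (or parallel region) per (neighbor, variable) pair by flattening every pack loop into an array of jobs and sweeping them in one pass; the suite's ptr_holder plays the role of PackJob here (names simplified, serial sweep shown):

  struct PackJob { double* buffer; int* list; double* var; int len; };

  // One fused sweep over num_neighbors * num_vars jobs, instead of that
  // many separate kernel launches.
  void run_fused_pack(const PackJob* jobs, int num_jobs) {
    for (int j = 0; j < num_jobs; ++j) {
      for (int i = 0; i < jobs[j].len; ++i) {
        jobs[j].buffer[i] = jobs[j].var[jobs[j].list[i]];
      }
    }
  }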
index 6707258e2..7499563a2 100644 --- a/src/comm/HALOPACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALOPACKING_FUSED-OMPTarget.cpp @@ -80,7 +80,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN Index_type pack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -111,14 +111,28 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; Index_type unpack_len_sum = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; h_unpack_buffer_ptrs[unpack_index] = buffer; @@ -190,7 +204,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { @@ -206,11 +220,25 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING_FUSED-Seq.cpp b/src/comm/HALOPACKING_FUSED-Seq.cpp index d4abf1a7a..6331bdc6d 100644 --- a/src/comm/HALOPACKING_FUSED-Seq.cpp +++ b/src/comm/HALOPACKING_FUSED-Seq.cpp @@ -36,9 +36,9 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; @@ -56,13 +56,27 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( HALO_PACK_BODY; } } + if (separate_buffers) { + for (Index_type l = 0; l < 
num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; @@ -100,9 +114,9 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Index_type pack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -118,13 +132,27 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( pack_lambda(i); } } + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } Index_type unpack_index = 0; for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&unpack_lambdas[unpack_index]) unpack_lambda_type(make_unpack_lambda(buffer, list, var)); @@ -186,9 +214,9 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[send_tags[l]]; + Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_fused_pack_base_lam = [=](Index_type i) { @@ -202,11 +230,25 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } workgroup group_pack = pool_pack.instantiate(); worksite site_pack = group_pack.run(); + if (separate_buffers) { + for (Index_type l = 0; l < num_neighbors; ++l) { + Index_type len = pack_index_list_lengths[l]; + copyData(DataSpace::Host, send_buffers[l], + dataSpace, pack_buffers[l], + len*num_vars); + } + } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[recv_tags[l]]; + Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; Index_type len = unpack_index_list_lengths[l]; + if (separate_buffers) { + copyData(dataSpace, unpack_buffers[l], + DataSpace::Host, recv_buffers[l], + len*num_vars); + } + for (Index_type v = 0; v < num_vars; ++v) { Real_ptr 
var = vars[v]; auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALOPACKING_FUSED.cpp b/src/comm/HALOPACKING_FUSED.cpp index 96cf85ed4..257b5cc06 100644 --- a/src/comm/HALOPACKING_FUSED.cpp +++ b/src/comm/HALOPACKING_FUSED.cpp @@ -73,10 +73,32 @@ void HALOPACKING_FUSED::setUp(VariantID vid, size_t tune_idx) } } - m_buffers.resize(s_num_neighbors, nullptr); + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + m_pack_buffers.resize(s_num_neighbors, nullptr); + m_send_buffers.resize(s_num_neighbors, nullptr); for (Index_type l = 0; l < s_num_neighbors; ++l) { Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; - allocAndInitData(m_buffers[l], buffer_len, vid); + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_pack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_send_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_pack_buffers[l], buffer_len); + m_send_buffers[l] = m_pack_buffers[l]; + } + } + + m_unpack_buffers.resize(s_num_neighbors, nullptr); + m_recv_buffers.resize(s_num_neighbors, nullptr); + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_unpack_index_list_lengths[l]; + if (separate_buffers) { + allocAndInitData(getDataSpace(vid), m_unpack_buffers[l], buffer_len); + allocAndInitData(DataSpace::Host, m_recv_buffers[l], buffer_len); + } else { + allocAndInitData(getMPIDataSpace(vid), m_unpack_buffers[l], buffer_len); + m_recv_buffers[l] = m_unpack_buffers[l]; + } } } @@ -85,14 +107,44 @@ void HALOPACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } + + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + + for (Index_type l = 0; l < s_num_neighbors; ++l) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[l]; + if (separate_buffers) { + checksum[vid][tune_idx] += calcChecksum(DataSpace::Host, m_send_buffers[l], buffer_len, vid); + } else { + checksum[vid][tune_idx] += calcChecksum(getMPIDataSpace(vid), m_send_buffers[l], buffer_len, vid); + } + } } void HALOPACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) { + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); + for (int l = 0; l < s_num_neighbors; ++l) { - deallocData(m_buffers[l], vid); + if (separate_buffers) { + deallocData(DataSpace::Host, m_recv_buffers[l]); + deallocData(getDataSpace(vid), m_unpack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_unpack_buffers[l]); + } + } + m_recv_buffers.clear(); + m_unpack_buffers.clear(); + + for (int l = 0; l < s_num_neighbors; ++l) { + if (separate_buffers) { + deallocData(DataSpace::Host, m_send_buffers[l]); + deallocData(getDataSpace(vid), m_pack_buffers[l]); + } else { + deallocData(getMPIDataSpace(vid), m_pack_buffers[l]); + } } - m_buffers.clear(); + m_send_buffers.clear(); + m_pack_buffers.clear(); for (int v = 0; v < m_num_vars; ++v) { deallocData(m_vars[v], vid); diff --git a/src/comm/HALOPACKING_FUSED.hpp b/src/comm/HALOPACKING_FUSED.hpp index 387872527..804c4720e 100644 --- a/src/comm/HALOPACKING_FUSED.hpp +++ b/src/comm/HALOPACKING_FUSED.hpp @@ -9,11 +9,9 @@ /// /// HALOPACKING_FUSED kernel reference implementation: /// -/// // post a recv for each neighbor -/// /// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = pack_buffers[l]; /// 
Int_ptr list = pack_index_lists[l]; /// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable @@ -26,13 +24,9 @@ /// } /// } /// -/// // send buffers to neighbors -/// -/// // receive buffers from neighbors -/// /// // unpack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; +/// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; /// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable @@ -45,8 +39,6 @@ /// } /// } /// -/// // wait for all sends to complete -/// #ifndef RAJAPerf_Comm_HALOPACKING_FUSED_HPP #define RAJAPerf_Comm_HALOPACKING_FUSED_HPP @@ -57,7 +49,15 @@ Index_type num_vars = m_num_vars; \ std::vector vars = m_vars; \ \ - std::vector buffers = m_buffers; + const DataSpace dataSpace = getDataSpace(vid); \ + \ + const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); \ + \ + std::vector pack_buffers = m_pack_buffers; \ + std::vector unpack_buffers = m_unpack_buffers; \ + \ + std::vector send_buffers = m_send_buffers; \ + std::vector recv_buffers = m_recv_buffers; #define HALOPACKING_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ @@ -108,6 +108,8 @@ #include "RAJA/RAJA.hpp" +#include + namespace rajaperf { namespace comm @@ -147,7 +149,11 @@ class HALOPACKING_FUSED : public HALO_base std::vector m_vars; - std::vector m_buffers; + std::vector m_pack_buffers; + std::vector m_unpack_buffers; + + std::vector m_send_buffers; + std::vector m_recv_buffers; }; } // end namespace comm From 50a60c172f100d520f857abb39165973897becb7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:14:54 -0800 Subject: [PATCH 144/454] Fix spacing --- src/comm/MPI_HALOEXCHANGE-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp index a712b1854..796e68ec0 100644 --- a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/comm/MPI_HALOEXCHANGE-Cuda.cpp @@ -143,7 +143,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; - RAJA::forall(res, + RAJA::forall( res, RAJA::TypedRangeSegment(0, len), haloexchange_pack_base_lam ); buffer += len; @@ -178,7 +178,7 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; - RAJA::forall(res, + RAJA::forall( res, RAJA::TypedRangeSegment(0, len), haloexchange_unpack_base_lam ); buffer += len; From 8999d275aabeb051476196a50a2e15bfac93f304 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:15:23 -0800 Subject: [PATCH 145/454] Change HipPinned to HipPinnedCoarse --- src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp index 980d303b3..9ded5251e 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp @@ -26,28 +26,28 @@ namespace comm Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ Index_type* pack_len_ptrs; \ - allocData(DataSpace::HipPinned, pack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, pack_len_ptrs, 
num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \ Real_ptr* unpack_buffer_ptrs; \ Int_ptr* unpack_list_ptrs; \ Real_ptr* unpack_var_ptrs; \ Index_type* unpack_len_ptrs; \ - allocData(DataSpace::HipPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_list_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_var_ptrs, num_neighbors * num_vars); \ - allocData(DataSpace::HipPinned, unpack_len_ptrs, num_neighbors * num_vars); + allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ + allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); #define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ - deallocData(DataSpace::HipPinned, pack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, pack_list_ptrs); \ - deallocData(DataSpace::HipPinned, pack_var_ptrs); \ - deallocData(DataSpace::HipPinned, pack_len_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_buffer_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_list_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_var_ptrs); \ - deallocData(DataSpace::HipPinned, unpack_len_ptrs); + deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \ + deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs); template < size_t block_size > __launch_bounds__(block_size) From 197266bf28c3606d3d6c47dd9322f02215013b4a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:18:01 -0800 Subject: [PATCH 146/454] Make HALOEXCHANGE_FUSED OMP_TASK consistent --- src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp index 5b0a41a64..a54dbf3fa 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp @@ -57,7 +57,8 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU buffer += len; } } -#if _OPENMP >= 200805 + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) #pragma omp parallel #pragma omp single nowait for (Index_type j = 0; j < pack_index; j++) { @@ -120,7 +121,8 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU buffer += len; } } -#if _OPENMP >= 200805 + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) #pragma omp parallel #pragma omp single nowait for (Index_type j = 0; j < unpack_index; j++) { @@ -185,7 +187,8 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU buffer += len; } } -#if _OPENMP >= 200805 + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) #pragma omp parallel #pragma omp single nowait for 
(Index_type j = 0; j < pack_index; j++) { @@ -244,7 +247,8 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU buffer += len; } } -#if _OPENMP >= 200805 + +#if defined(RAJA_ENABLE_OMP_TASK_INTERNAL) #pragma omp parallel #pragma omp single nowait for (Index_type j = 0; j < unpack_index; j++) { From 91f1c8dee8da49d44cea7932ef8bef782ad24c29 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:23:28 -0800 Subject: [PATCH 147/454] Remove MPI_ from names --- src/CMakeLists.txt | 18 +++++----- src/comm/CMakeLists.txt | 36 +++++++++---------- ...XCHANGE-Cuda.cpp => HALOEXCHANGE-Cuda.cpp} | 10 +++--- ...OEXCHANGE-Hip.cpp => HALOEXCHANGE-Hip.cpp} | 10 +++--- ...OEXCHANGE-OMP.cpp => HALOEXCHANGE-OMP.cpp} | 8 ++--- ...PTarget.cpp => HALOEXCHANGE-OMPTarget.cpp} | 8 ++--- ...OEXCHANGE-Seq.cpp => HALOEXCHANGE-Seq.cpp} | 8 ++--- ...{MPI_HALOEXCHANGE.cpp => HALOEXCHANGE.cpp} | 14 ++++---- ...{MPI_HALOEXCHANGE.hpp => HALOEXCHANGE.hpp} | 14 ++++---- ...D-Cuda.cpp => HALOEXCHANGE_FUSED-Cuda.cpp} | 18 +++++----- ...SED-Hip.cpp => HALOEXCHANGE_FUSED-Hip.cpp} | 18 +++++----- ...SED-OMP.cpp => HALOEXCHANGE_FUSED-OMP.cpp} | 16 ++++----- ...t.cpp => HALOEXCHANGE_FUSED-OMPTarget.cpp} | 24 ++++++------- ...SED-Seq.cpp => HALOEXCHANGE_FUSED-Seq.cpp} | 16 ++++----- ...HANGE_FUSED.cpp => HALOEXCHANGE_FUSED.cpp} | 14 ++++---- ...HANGE_FUSED.hpp => HALOEXCHANGE_FUSED.hpp} | 22 ++++++------ ...ENDRECV-Cuda.cpp => HALOSENDRECV-Cuda.cpp} | 8 ++--- ...OSENDRECV-Hip.cpp => HALOSENDRECV-Hip.cpp} | 8 ++--- ...OSENDRECV-OMP.cpp => HALOSENDRECV-OMP.cpp} | 8 ++--- ...PTarget.cpp => HALOSENDRECV-OMPTarget.cpp} | 8 ++--- ...OSENDRECV-Seq.cpp => HALOSENDRECV-Seq.cpp} | 8 ++--- ...{MPI_HALOSENDRECV.cpp => HALOSENDRECV.cpp} | 14 ++++---- ...{MPI_HALOSENDRECV.hpp => HALOSENDRECV.hpp} | 14 ++++---- src/common/Executor.cpp | 4 +-- src/common/RAJAPerfSuite.cpp | 24 ++++++------- src/common/RAJAPerfSuite.hpp | 6 ++-- 26 files changed, 178 insertions(+), 178 deletions(-) rename src/comm/{MPI_HALOEXCHANGE-Cuda.cpp => HALOEXCHANGE-Cuda.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE-Hip.cpp => HALOEXCHANGE-Hip.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE-OMP.cpp => HALOEXCHANGE-OMP.cpp} (96%) rename src/comm/{MPI_HALOEXCHANGE-OMPTarget.cpp => HALOEXCHANGE-OMPTarget.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE-Seq.cpp => HALOEXCHANGE-Seq.cpp} (96%) rename src/comm/{MPI_HALOEXCHANGE.cpp => HALOEXCHANGE.cpp} (92%) rename src/comm/{MPI_HALOEXCHANGE.hpp => HALOEXCHANGE.hpp} (93%) rename src/comm/{MPI_HALOEXCHANGE_FUSED-Cuda.cpp => HALOEXCHANGE_FUSED-Cuda.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE_FUSED-Hip.cpp => HALOEXCHANGE_FUSED-Hip.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE_FUSED-OMP.cpp => HALOEXCHANGE_FUSED-OMP.cpp} (96%) rename src/comm/{MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp => HALOEXCHANGE_FUSED-OMPTarget.cpp} (93%) rename src/comm/{MPI_HALOEXCHANGE_FUSED-Seq.cpp => HALOEXCHANGE_FUSED-Seq.cpp} (95%) rename src/comm/{MPI_HALOEXCHANGE_FUSED.cpp => HALOEXCHANGE_FUSED.cpp} (91%) rename src/comm/{MPI_HALOEXCHANGE_FUSED.hpp => HALOEXCHANGE_FUSED.hpp} (91%) rename src/comm/{MPI_HALOSENDRECV-Cuda.cpp => HALOSENDRECV-Cuda.cpp} (86%) rename src/comm/{MPI_HALOSENDRECV-Hip.cpp => HALOSENDRECV-Hip.cpp} (86%) rename src/comm/{MPI_HALOSENDRECV-OMP.cpp => HALOSENDRECV-OMP.cpp} (87%) rename src/comm/{MPI_HALOSENDRECV-OMPTarget.cpp => HALOSENDRECV-OMPTarget.cpp} (86%) rename src/comm/{MPI_HALOSENDRECV-Seq.cpp => HALOSENDRECV-Seq.cpp} (86%) rename src/comm/{MPI_HALOSENDRECV.cpp => HALOSENDRECV.cpp} (89%) rename 
src/comm/{MPI_HALOSENDRECV.hpp => HALOSENDRECV.hpp} (92%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6266194e5..cec242d23 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -251,15 +251,15 @@ blt_add_executable( comm/HALOPACKING_FUSED.cpp comm/HALOPACKING_FUSED-Seq.cpp comm/HALOPACKING_FUSED-OMPTarget.cpp - comm/MPI_HALOSENDRECV.cpp - comm/MPI_HALOSENDRECV-Seq.cpp - comm/MPI_HALOSENDRECV-OMPTarget.cpp - comm/MPI_HALOEXCHANGE.cpp - comm/MPI_HALOEXCHANGE-Seq.cpp - comm/MPI_HALOEXCHANGE-OMPTarget.cpp - comm/MPI_HALOEXCHANGE_FUSED.cpp - comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp - comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp + comm/HALOSENDRECV.cpp + comm/HALOSENDRECV-Seq.cpp + comm/HALOSENDRECV-OMPTarget.cpp + comm/HALOEXCHANGE.cpp + comm/HALOEXCHANGE-Seq.cpp + comm/HALOEXCHANGE-OMPTarget.cpp + comm/HALOEXCHANGE_FUSED.cpp + comm/HALOEXCHANGE_FUSED-Seq.cpp + comm/HALOEXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) install( TARGETS raja-perf-omptarget.exe diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index 91d4f6d3d..c431d9221 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -21,23 +21,23 @@ blt_add_library( HALOPACKING_FUSED-Cuda.cpp HALOPACKING_FUSED-OMP.cpp HALOPACKING_FUSED-OMPTarget.cpp - MPI_HALOSENDRECV.cpp - MPI_HALOSENDRECV-Seq.cpp - MPI_HALOSENDRECV-Hip.cpp - MPI_HALOSENDRECV-Cuda.cpp - MPI_HALOSENDRECV-OMP.cpp - MPI_HALOSENDRECV-OMPTarget.cpp - MPI_HALOEXCHANGE.cpp - MPI_HALOEXCHANGE-Seq.cpp - MPI_HALOEXCHANGE-Hip.cpp - MPI_HALOEXCHANGE-Cuda.cpp - MPI_HALOEXCHANGE-OMP.cpp - MPI_HALOEXCHANGE-OMPTarget.cpp - MPI_HALOEXCHANGE_FUSED.cpp - MPI_HALOEXCHANGE_FUSED-Seq.cpp - MPI_HALOEXCHANGE_FUSED-Hip.cpp - MPI_HALOEXCHANGE_FUSED-Cuda.cpp - MPI_HALOEXCHANGE_FUSED-OMP.cpp - MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp + HALOSENDRECV.cpp + HALOSENDRECV-Seq.cpp + HALOSENDRECV-Hip.cpp + HALOSENDRECV-Cuda.cpp + HALOSENDRECV-OMP.cpp + HALOSENDRECV-OMPTarget.cpp + HALOEXCHANGE.cpp + HALOEXCHANGE-Seq.cpp + HALOEXCHANGE-Hip.cpp + HALOEXCHANGE-Cuda.cpp + HALOEXCHANGE-OMP.cpp + HALOEXCHANGE-OMPTarget.cpp + HALOEXCHANGE_FUSED.cpp + HALOEXCHANGE_FUSED-Seq.cpp + HALOEXCHANGE_FUSED-Hip.cpp + HALOEXCHANGE_FUSED-Cuda.cpp + HALOEXCHANGE_FUSED-OMP.cpp + HALOEXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp b/src/comm/HALOEXCHANGE-Cuda.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE-Cuda.cpp rename to src/comm/HALOEXCHANGE-Cuda.cpp index 796e68ec0..437e64572 100644 --- a/src/comm/MPI_HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALOEXCHANGE-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - MPI_HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -192,11 +192,11 @@ void MPI_HALOEXCHANGE::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; } } 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/MPI_HALOEXCHANGE-Hip.cpp b/src/comm/HALOEXCHANGE-Hip.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE-Hip.cpp rename to src/comm/HALOEXCHANGE-Hip.cpp index 2c5b2c174..56841c3e6 100644 --- a/src/comm/MPI_HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALOEXCHANGE-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) +void HALOEXCHANGE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - MPI_HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_HIP ) { @@ -194,11 +194,11 @@ void MPI_HALOEXCHANGE::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/MPI_HALOEXCHANGE-OMP.cpp b/src/comm/HALOEXCHANGE-OMP.cpp similarity index 96% rename from src/comm/MPI_HALOEXCHANGE-OMP.cpp rename to src/comm/HALOEXCHANGE-OMP.cpp index fa29518b3..ccad6969f 100644 --- a/src/comm/MPI_HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALOEXCHANGE-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -20,13 +20,13 @@ namespace comm { -void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - MPI_HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; switch ( vid ) { @@ -238,7 +238,7 @@ void MPI_HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } default : { - getCout() << "\n MPI_HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALOEXCHANGE-OMPTarget.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALOEXCHANGE-OMPTarget.cpp index 8da8400f3..c1e989a7d 100644 --- a/src/comm/MPI_HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALOEXCHANGE-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void 
HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -166,7 +166,7 @@ void MPI_HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE-Seq.cpp b/src/comm/HALOEXCHANGE-Seq.cpp similarity index 96% rename from src/comm/MPI_HALOEXCHANGE-Seq.cpp rename to src/comm/HALOEXCHANGE-Seq.cpp index 36415aa92..b94d9c459 100644 --- a/src/comm/MPI_HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALOEXCHANGE-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -20,11 +20,11 @@ namespace comm { -void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOEXCHANGE_DATA_SETUP; + HALOEXCHANGE_DATA_SETUP; switch ( vid ) { @@ -234,7 +234,7 @@ void MPI_HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t #endif // RUN_RAJA_SEQ default : { - getCout() << "\n MPI_HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE.cpp b/src/comm/HALOEXCHANGE.cpp similarity index 92% rename from src/comm/MPI_HALOEXCHANGE.cpp rename to src/comm/HALOEXCHANGE.cpp index f9ad3f15d..929f77dcc 100644 --- a/src/comm/MPI_HALOEXCHANGE.cpp +++ b/src/comm/HALOEXCHANGE.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE.hpp" +#include "HALOEXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) - : HALO_base(rajaperf::Comm_MPI_HALOEXCHANGE, params) +HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) + : HALO_base(rajaperf::Comm_HALOEXCHANGE, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -62,11 +62,11 @@ MPI_HALOEXCHANGE::MPI_HALOEXCHANGE(const RunParams& params) } } -MPI_HALOEXCHANGE::~MPI_HALOEXCHANGE() +HALOEXCHANGE::~HALOEXCHANGE() { } -void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) +void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -111,14 +111,14 @@ void MPI_HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) } } -void MPI_HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void MPI_HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) +void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/MPI_HALOEXCHANGE.hpp b/src/comm/HALOEXCHANGE.hpp similarity index 93% rename from src/comm/MPI_HALOEXCHANGE.hpp rename to 
src/comm/HALOEXCHANGE.hpp index 51f8e0da0..8bfc80ed6 100644 --- a/src/comm/MPI_HALOEXCHANGE.hpp +++ b/src/comm/HALOEXCHANGE.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// MPI_HALOEXCHANGE kernel reference implementation: +/// HALOEXCHANGE kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -56,10 +56,10 @@ /// -#ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP -#define RAJAPerf_Comm_MPI_HALOEXCHANGE_HPP +#ifndef RAJAPerf_Comm_HALOEXCHANGE_HPP +#define RAJAPerf_Comm_HALOEXCHANGE_HPP -#define MPI_HALOEXCHANGE_DATA_SETUP \ +#define HALOEXCHANGE_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -95,13 +95,13 @@ namespace rajaperf namespace comm { -class MPI_HALOEXCHANGE : public HALO_base +class HALOEXCHANGE : public HALO_base { public: - MPI_HALOEXCHANGE(const RunParams& params); + HALOEXCHANGE(const RunParams& params); - ~MPI_HALOEXCHANGE(); + ~HALOEXCHANGE(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALOEXCHANGE_FUSED-Cuda.cpp index b3db3d473..6a59be4fd 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -190,7 +190,7 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; } else if ( vid == RAJA_CUDA ) { @@ -298,11 +298,11 @@ void MPI_HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } 
-RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE_FUSED, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/HALOEXCHANGE_FUSED-Hip.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp rename to src/comm/HALOEXCHANGE_FUSED-Hip.cpp index 9ded5251e..fb196e63d 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) +void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_HIP ) { - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -190,7 +190,7 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; } else if ( vid == RAJA_HIP ) { @@ -302,11 +302,11 @@ void MPI_HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MPI_HALOEXCHANGE_FUSED, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALOEXCHANGE_FUSED-OMP.cpp similarity index 96% rename from src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALOEXCHANGE_FUSED-OMP.cpp index a54dbf3fa..95b03ccec 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -20,19 +20,19 @@ namespace comm { -void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void 
HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -155,14 +155,14 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -277,7 +277,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -390,7 +390,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNU } default : { - getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp similarity index 93% rename from src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp index bf4bd9621..9b126393e 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -26,7 +26,7 @@ namespace comm // //const size_t threads_per_team = 256; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace comm Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = 
getRunReps(); - MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -100,7 +100,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE buffer += len; } } - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -157,7 +157,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE buffer += len; } } - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -180,7 +180,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -287,7 +287,7 @@ void MPI_HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPE stopTimer(); } else { - getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALOEXCHANGE_FUSED-Seq.cpp similarity index 95% rename from src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp rename to src/comm/HALOEXCHANGE_FUSED-Seq.cpp index 01beb7b00..ca7798674 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALOEXCHANGE_FUSED-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -20,17 +20,17 @@ namespace comm { -void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOEXCHANGE_FUSED_DATA_SETUP; + HALOEXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -115,7 +115,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -123,7 +123,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -204,7 +204,7 @@ void 
MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED } stopTimer(); - MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -318,7 +318,7 @@ void MPI_HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED #endif // RUN_RAJA_SEQ default : { - getCout() << "\n MPI_HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp b/src/comm/HALOEXCHANGE_FUSED.cpp similarity index 91% rename from src/comm/MPI_HALOEXCHANGE_FUSED.cpp rename to src/comm/HALOEXCHANGE_FUSED.cpp index 196ea9cb2..87869ebb2 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALOEXCHANGE_FUSED.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOEXCHANGE_FUSED.hpp" +#include "HALOEXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) - : HALO_base(rajaperf::Comm_MPI_HALOEXCHANGE_FUSED, params) +HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -62,11 +62,11 @@ MPI_HALOEXCHANGE_FUSED::MPI_HALOEXCHANGE_FUSED(const RunParams& params) } } -MPI_HALOEXCHANGE_FUSED::~MPI_HALOEXCHANGE_FUSED() +HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() { } -void MPI_HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -111,14 +111,14 @@ void MPI_HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) } } -void MPI_HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void MPI_HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp b/src/comm/HALOEXCHANGE_FUSED.hpp similarity index 91% rename from src/comm/MPI_HALOEXCHANGE_FUSED.hpp rename to src/comm/HALOEXCHANGE_FUSED.hpp index 09a5e76dd..f4f4a72d9 100644 --- a/src/comm/MPI_HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALOEXCHANGE_FUSED.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// MPI_HALOEXCHANGE_FUSED kernel reference implementation: +/// HALOEXCHANGE_FUSED kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -59,10 +59,10 @@ /// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); /// -#ifndef RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Comm_MPI_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP +#define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP -#define MPI_HALOEXCHANGE_FUSED_DATA_SETUP \ +#define HALOEXCHANGE_FUSED_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -83,7 +83,7 @@ std::vector send_buffers = m_send_buffers; \ std::vector recv_buffers = 
m_recv_buffers; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -94,14 +94,14 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ HALO_PACK_BODY; \ @@ -121,7 +121,7 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define MPI_HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ @@ -139,13 +139,13 @@ namespace rajaperf namespace comm { -class MPI_HALOEXCHANGE_FUSED : public HALO_base +class HALOEXCHANGE_FUSED : public HALO_base { public: - MPI_HALOEXCHANGE_FUSED(const RunParams& params); + HALOEXCHANGE_FUSED(const RunParams& params); - ~MPI_HALOEXCHANGE_FUSED(); + ~HALOEXCHANGE_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/MPI_HALOSENDRECV-Cuda.cpp b/src/comm/HALOSENDRECV-Cuda.cpp similarity index 86% rename from src/comm/MPI_HALOSENDRECV-Cuda.cpp rename to src/comm/HALOSENDRECV-Cuda.cpp index 9893de266..075b1996d 100644 --- a/src/comm/MPI_HALOSENDRECV-Cuda.cpp +++ b/src/comm/HALOSENDRECV-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -22,11 +22,11 @@ namespace comm { -void MPI_HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOSENDRECV_DATA_SETUP; + HALOSENDRECV_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -53,7 +53,7 @@ void MPI_HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( stopTimer(); } else { - getCout() << "\n MPI_HALOSENDRECV : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALOSENDRECV : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOSENDRECV-Hip.cpp b/src/comm/HALOSENDRECV-Hip.cpp similarity index 86% rename from src/comm/MPI_HALOSENDRECV-Hip.cpp rename to src/comm/HALOSENDRECV-Hip.cpp index 28775c15e..31b830358 100644 --- a/src/comm/MPI_HALOSENDRECV-Hip.cpp +++ b/src/comm/HALOSENDRECV-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -22,11 +22,11 @@ namespace comm { -void MPI_HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOSENDRECV_DATA_SETUP; + HALOSENDRECV_DATA_SETUP; 
if ( vid == Base_HIP ) { @@ -53,7 +53,7 @@ void MPI_HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t stopTimer(); } else { - getCout() << "\n MPI_HALOSENDRECV : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALOSENDRECV : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOSENDRECV-OMP.cpp b/src/comm/HALOSENDRECV-OMP.cpp similarity index 87% rename from src/comm/MPI_HALOSENDRECV-OMP.cpp rename to src/comm/HALOSENDRECV-OMP.cpp index 7b5e3bed4..f8513f6d0 100644 --- a/src/comm/MPI_HALOSENDRECV-OMP.cpp +++ b/src/comm/HALOSENDRECV-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -20,13 +20,13 @@ namespace comm { -void MPI_HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - MPI_HALOSENDRECV_DATA_SETUP; + HALOSENDRECV_DATA_SETUP; switch ( vid ) { @@ -58,7 +58,7 @@ void MPI_HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } default : { - getCout() << "\n MPI_HALOSENDRECV : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOSENDRECV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOSENDRECV-OMPTarget.cpp b/src/comm/HALOSENDRECV-OMPTarget.cpp similarity index 86% rename from src/comm/MPI_HALOSENDRECV-OMPTarget.cpp rename to src/comm/HALOSENDRECV-OMPTarget.cpp index 87a1d69d7..f16b3b31d 100644 --- a/src/comm/MPI_HALOSENDRECV-OMPTarget.cpp +++ b/src/comm/HALOSENDRECV-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void MPI_HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - MPI_HALOSENDRECV_DATA_SETUP; + HALOSENDRECV_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -58,7 +58,7 @@ void MPI_HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNU stopTimer(); } else { - getCout() << "\n MPI_HALOSENDRECV : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALOSENDRECV : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOSENDRECV-Seq.cpp b/src/comm/HALOSENDRECV-Seq.cpp similarity index 86% rename from src/comm/MPI_HALOSENDRECV-Seq.cpp rename to src/comm/HALOSENDRECV-Seq.cpp index 28c4938b6..9ec1583f5 100644 --- a/src/comm/MPI_HALOSENDRECV-Seq.cpp +++ b/src/comm/HALOSENDRECV-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -20,11 +20,11 @@ namespace comm { -void MPI_HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = 
getRunReps(); - MPI_HALOSENDRECV_DATA_SETUP; + HALOSENDRECV_DATA_SETUP; switch ( vid ) { @@ -56,7 +56,7 @@ void MPI_HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } default : { - getCout() << "\n MPI_HALOSENDRECV : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALOSENDRECV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/MPI_HALOSENDRECV.cpp b/src/comm/HALOSENDRECV.cpp similarity index 89% rename from src/comm/MPI_HALOSENDRECV.cpp rename to src/comm/HALOSENDRECV.cpp index 1631ea107..36c54af8d 100644 --- a/src/comm/MPI_HALOSENDRECV.cpp +++ b/src/comm/HALOSENDRECV.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MPI_HALOSENDRECV.hpp" +#include "HALOSENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -MPI_HALOSENDRECV::MPI_HALOSENDRECV(const RunParams& params) - : HALO_base(rajaperf::Comm_MPI_HALOSENDRECV, params) +HALOSENDRECV::HALOSENDRECV(const RunParams& params) + : HALO_base(rajaperf::Comm_HALOSENDRECV, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -51,11 +51,11 @@ MPI_HALOSENDRECV::MPI_HALOSENDRECV(const RunParams& params) } } -MPI_HALOSENDRECV::~MPI_HALOSENDRECV() +HALOSENDRECV::~HALOSENDRECV() { } -void MPI_HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) +void HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -82,7 +82,7 @@ void MPI_HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) } } -void MPI_HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) +void HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); @@ -96,7 +96,7 @@ void MPI_HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) } } -void MPI_HALOSENDRECV::tearDown(VariantID vid, size_t tune_idx) +void HALOSENDRECV::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/MPI_HALOSENDRECV.hpp b/src/comm/HALOSENDRECV.hpp similarity index 92% rename from src/comm/MPI_HALOSENDRECV.hpp rename to src/comm/HALOSENDRECV.hpp index db1a1e00e..75f817dcb 100644 --- a/src/comm/MPI_HALOSENDRECV.hpp +++ b/src/comm/HALOSENDRECV.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// MPI_HALOSENDRECV kernel reference implementation: +/// HALOSENDRECV kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -56,10 +56,10 @@ /// -#ifndef RAJAPerf_Comm_MPI_HALOSENDRECV_HPP -#define RAJAPerf_Comm_MPI_HALOSENDRECV_HPP +#ifndef RAJAPerf_Comm_HALOSENDRECV_HPP +#define RAJAPerf_Comm_HALOSENDRECV_HPP -#define MPI_HALOSENDRECV_DATA_SETUP \ +#define HALOSENDRECV_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -87,13 +87,13 @@ namespace rajaperf namespace comm { -class MPI_HALOSENDRECV : public HALO_base +class HALOSENDRECV : public HALO_base { public: - MPI_HALOSENDRECV(const RunParams& params); + HALOSENDRECV(const RunParams& params); - ~MPI_HALOSENDRECV(); + ~HALOSENDRECV(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a629548cb..8bb996d17 100644 --- a/src/common/Executor.cpp +++ 
b/src/common/Executor.cpp @@ -26,7 +26,7 @@ #include "algorithm/SORT.hpp" #include "comm/HALOPACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include "comm/MPI_HALOEXCHANGE_FUSED.hpp" +#include "comm/HALOEXCHANGE_FUSED.hpp" #endif #include @@ -710,7 +710,7 @@ void Executor::runWarmupKernels() #ifdef RAJA_PERFSUITE_ENABLE_MPI case MPI: - kernel_ids.insert(Comm_MPI_HALOEXCHANGE_FUSED); break; + kernel_ids.insert(Comm_HALOEXCHANGE_FUSED); break; #endif default: diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index c2fb6bae1..debeb21ca 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -111,9 +111,9 @@ #include "comm/HALOPACKING.hpp" #include "comm/HALOPACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include "comm/MPI_HALOSENDRECV.hpp" -#include "comm/MPI_HALOEXCHANGE.hpp" -#include "comm/MPI_HALOEXCHANGE_FUSED.hpp" +#include "comm/HALOSENDRECV.hpp" +#include "comm/HALOEXCHANGE.hpp" +#include "comm/HALOEXCHANGE_FUSED.hpp" #endif @@ -261,9 +261,9 @@ static const std::string KernelNames [] = std::string("Comm_HALOPACKING"), std::string("Comm_HALOPACKING_FUSED"), #if defined(RAJA_PERFSUITE_ENABLE_MPI) - std::string("Comm_MPI_HALOSENDRECV"), - std::string("Comm_MPI_HALOEXCHANGE"), - std::string("Comm_MPI_HALOEXCHANGE_FUSED"), + std::string("Comm_HALOSENDRECV"), + std::string("Comm_HALOEXCHANGE"), + std::string("Comm_HALOEXCHANGE_FUSED"), #endif std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -999,16 +999,16 @@ KernelBase* getKernelObject(KernelID kid, break; } #if defined(RAJA_PERFSUITE_ENABLE_MPI) - case Comm_MPI_HALOSENDRECV : { - kernel = new comm::MPI_HALOSENDRECV(run_params); + case Comm_HALOSENDRECV : { + kernel = new comm::HALOSENDRECV(run_params); break; } - case Comm_MPI_HALOEXCHANGE : { - kernel = new comm::MPI_HALOEXCHANGE(run_params); + case Comm_HALOEXCHANGE : { + kernel = new comm::HALOEXCHANGE(run_params); break; } - case Comm_MPI_HALOEXCHANGE_FUSED : { - kernel = new comm::MPI_HALOEXCHANGE_FUSED(run_params); + case Comm_HALOEXCHANGE_FUSED : { + kernel = new comm::HALOEXCHANGE_FUSED(run_params); break; } #endif diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index b5b95f9f2..7db7e8868 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -171,9 +171,9 @@ enum KernelID { Comm_HALOPACKING, Comm_HALOPACKING_FUSED, #if defined(RAJA_PERFSUITE_ENABLE_MPI) - Comm_MPI_HALOSENDRECV, - Comm_MPI_HALOEXCHANGE, - Comm_MPI_HALOEXCHANGE_FUSED, + Comm_HALOSENDRECV, + Comm_HALOEXCHANGE, + Comm_HALOEXCHANGE_FUSED, #endif NumKernels // Keep this one last and NEVER comment out (!!) 
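For reference while reviewing the renames in this patch and the next: the Comm_HALO* kernels all time variations of the classic MPI halo exchange sketched in the headers' reference-implementation comments (post recvs, pack and send, unpack as recvs complete, wait on sends). A minimal, self-contained sketch of that pattern follows; the function and buffer names, the MPI_DOUBLE payload type, and the tag choice are illustrative assumptions, not the suite's exact code:

// Hedged sketch of the halo-exchange pattern these kernels measure,
// assembled from the reference-implementation comments in the headers.
#include <mpi.h>
#include <vector>

using Real_type  = double;
using Index_type = int;

void halo_exchange(std::vector<Real_type*>&  vars,
                   std::vector<Real_type*>&  pack_buffers,
                   std::vector<Real_type*>&  unpack_buffers,
                   std::vector<Index_type*>& pack_index_lists,
                   std::vector<Index_type>&  pack_index_list_lengths,
                   std::vector<Index_type*>& unpack_index_lists,
                   std::vector<Index_type>&  unpack_index_list_lengths,
                   std::vector<int>&         mpi_ranks,
                   int num_neighbors, int num_vars)
{
  std::vector<MPI_Request> recv_reqs(num_neighbors);
  std::vector<MPI_Request> send_reqs(num_neighbors);

  // post a recv for each neighbor
  for (int l = 0; l < num_neighbors; ++l) {
    int len = num_vars * unpack_index_list_lengths[l];
    MPI_Irecv(unpack_buffers[l], len, MPI_DOUBLE,
              mpi_ranks[l], 0, MPI_COMM_WORLD, &recv_reqs[l]);
  }

  // pack a buffer for each neighbor, then send it
  for (int l = 0; l < num_neighbors; ++l) {
    Real_type*  buffer = pack_buffers[l];
    Index_type* list   = pack_index_lists[l];
    Index_type  len    = pack_index_list_lengths[l];
    for (int v = 0; v < num_vars; ++v) {
      Real_type* var = vars[v];
      for (Index_type i = 0; i < len; ++i) {
        buffer[i] = var[list[i]];   // gather: HALO_PACK_BODY
      }
      buffer += len;
    }
    MPI_Isend(pack_buffers[l], num_vars * len, MPI_DOUBLE,
              mpi_ranks[l], 0, MPI_COMM_WORLD, &send_reqs[l]);
  }

  // unpack each buffer as its recv completes
  for (int ll = 0; ll < num_neighbors; ++ll) {
    int l;
    MPI_Waitany(num_neighbors, recv_reqs.data(), &l, MPI_STATUS_IGNORE);
    Real_type*  buffer = unpack_buffers[l];
    Index_type* list   = unpack_index_lists[l];
    Index_type  len    = unpack_index_list_lengths[l];
    for (int v = 0; v < num_vars; ++v) {
      Real_type* var = vars[v];
      for (Index_type i = 0; i < len; ++i) {
        var[list[i]] = buffer[i];   // scatter: HALO_UNPACK_BODY
      }
      buffer += len;
    }
  }

  // wait for all sends to complete
  MPI_Waitall(num_neighbors, send_reqs.data(), MPI_STATUSES_IGNORE);
}

The suite's Base_* and RAJA_* variants replace the inner pack/unpack loops above with per-backend kernel or forall launches over the same index lists (as in the CUDA/HIP/OpenMP files touched here); the surrounding MPI call structure is unchanged.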
From 6d1b050272300d80c908c86bf617cb4386320852 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 27 Nov 2023 21:34:30 -0800 Subject: [PATCH 148/454] Renamed to add _ after HALO --- src/CMakeLists.txt | 30 +++++----- src/comm/CMakeLists.txt | 60 +++++++++---------- ...CHANGE-Cuda.cpp => HALO_EXCHANGE-Cuda.cpp} | 26 ++++---- ...EXCHANGE-Hip.cpp => HALO_EXCHANGE-Hip.cpp} | 26 ++++---- ...EXCHANGE-OMP.cpp => HALO_EXCHANGE-OMP.cpp} | 24 ++++---- ...Target.cpp => HALO_EXCHANGE-OMPTarget.cpp} | 16 ++--- ...EXCHANGE-Seq.cpp => HALO_EXCHANGE-Seq.cpp} | 24 ++++---- .../{HALOEXCHANGE.cpp => HALO_EXCHANGE.cpp} | 14 ++--- .../{HALOEXCHANGE.hpp => HALO_EXCHANGE.hpp} | 14 ++--- ...-Cuda.cpp => HALO_EXCHANGE_FUSED-Cuda.cpp} | 34 +++++------ ...ED-Hip.cpp => HALO_EXCHANGE_FUSED-Hip.cpp} | 34 +++++------ ...ED-OMP.cpp => HALO_EXCHANGE_FUSED-OMP.cpp} | 24 ++++---- ....cpp => HALO_EXCHANGE_FUSED-OMPTarget.cpp} | 32 +++++----- ...ED-Seq.cpp => HALO_EXCHANGE_FUSED-Seq.cpp} | 24 ++++---- ...ANGE_FUSED.cpp => HALO_EXCHANGE_FUSED.cpp} | 14 ++--- ...ANGE_FUSED.hpp => HALO_EXCHANGE_FUSED.hpp} | 22 +++---- ...PACKING-Cuda.cpp => HALO_PACKING-Cuda.cpp} | 26 ++++---- ...LOPACKING-Hip.cpp => HALO_PACKING-Hip.cpp} | 26 ++++---- ...LOPACKING-OMP.cpp => HALO_PACKING-OMP.cpp} | 24 ++++---- ...PTarget.cpp => HALO_PACKING-OMPTarget.cpp} | 16 ++--- ...LOPACKING-Seq.cpp => HALO_PACKING-Seq.cpp} | 24 ++++---- .../{HALOPACKING.cpp => HALO_PACKING.cpp} | 14 ++--- .../{HALOPACKING.hpp => HALO_PACKING.hpp} | 14 ++--- ...D-Cuda.cpp => HALO_PACKING_FUSED-Cuda.cpp} | 34 +++++------ ...SED-Hip.cpp => HALO_PACKING_FUSED-Hip.cpp} | 34 +++++------ ...SED-OMP.cpp => HALO_PACKING_FUSED-OMP.cpp} | 24 ++++---- ...t.cpp => HALO_PACKING_FUSED-OMPTarget.cpp} | 32 +++++----- ...SED-Seq.cpp => HALO_PACKING_FUSED-Seq.cpp} | 24 ++++---- ...CKING_FUSED.cpp => HALO_PACKING_FUSED.cpp} | 14 ++--- ...CKING_FUSED.hpp => HALO_PACKING_FUSED.hpp} | 22 +++---- ...NDRECV-Cuda.cpp => HALO_SENDRECV-Cuda.cpp} | 8 +-- ...SENDRECV-Hip.cpp => HALO_SENDRECV-Hip.cpp} | 8 +-- ...SENDRECV-OMP.cpp => HALO_SENDRECV-OMP.cpp} | 8 +-- ...Target.cpp => HALO_SENDRECV-OMPTarget.cpp} | 8 +-- ...SENDRECV-Seq.cpp => HALO_SENDRECV-Seq.cpp} | 8 +-- .../{HALOSENDRECV.cpp => HALO_SENDRECV.cpp} | 14 ++--- .../{HALOSENDRECV.hpp => HALO_SENDRECV.hpp} | 14 ++--- src/comm/HALO_base.hpp | 34 +---------- src/common/Executor.cpp | 8 +-- src/common/RAJAPerfSuite.cpp | 40 ++++++------- src/common/RAJAPerfSuite.hpp | 10 ++-- test/test-raja-perf-suite.cpp | 2 +- 42 files changed, 438 insertions(+), 470 deletions(-) rename src/comm/{HALOEXCHANGE-Cuda.cpp => HALO_EXCHANGE-Cuda.cpp} (85%) rename src/comm/{HALOEXCHANGE-Hip.cpp => HALO_EXCHANGE-Hip.cpp} (86%) rename src/comm/{HALOEXCHANGE-OMP.cpp => HALO_EXCHANGE-OMP.cpp} (91%) rename src/comm/{HALOEXCHANGE-OMPTarget.cpp => HALO_EXCHANGE-OMPTarget.cpp} (91%) rename src/comm/{HALOEXCHANGE-Seq.cpp => HALO_EXCHANGE-Seq.cpp} (91%) rename src/comm/{HALOEXCHANGE.cpp => HALO_EXCHANGE.cpp} (92%) rename src/comm/{HALOEXCHANGE.hpp => HALO_EXCHANGE.hpp} (93%) rename src/comm/{HALOEXCHANGE_FUSED-Cuda.cpp => HALO_EXCHANGE_FUSED-Cuda.cpp} (89%) rename src/comm/{HALOEXCHANGE_FUSED-Hip.cpp => HALO_EXCHANGE_FUSED-Hip.cpp} (89%) rename src/comm/{HALOEXCHANGE_FUSED-OMP.cpp => HALO_EXCHANGE_FUSED-OMP.cpp} (94%) rename src/comm/{HALOEXCHANGE_FUSED-OMPTarget.cpp => HALO_EXCHANGE_FUSED-OMPTarget.cpp} (91%) rename src/comm/{HALOEXCHANGE_FUSED-Seq.cpp => HALO_EXCHANGE_FUSED-Seq.cpp} (93%) rename src/comm/{HALOEXCHANGE_FUSED.cpp => HALO_EXCHANGE_FUSED.cpp} (91%) rename 
src/comm/{HALOEXCHANGE_FUSED.hpp => HALO_EXCHANGE_FUSED.hpp} (91%) rename src/comm/{HALOPACKING-Cuda.cpp => HALO_PACKING-Cuda.cpp} (82%) rename src/comm/{HALOPACKING-Hip.cpp => HALO_PACKING-Hip.cpp} (83%) rename src/comm/{HALOPACKING-OMP.cpp => HALO_PACKING-OMP.cpp} (88%) rename src/comm/{HALOPACKING-OMPTarget.cpp => HALO_PACKING-OMPTarget.cpp} (89%) rename src/comm/{HALOPACKING-Seq.cpp => HALO_PACKING-Seq.cpp} (88%) rename src/comm/{HALOPACKING.cpp => HALO_PACKING.cpp} (93%) rename src/comm/{HALOPACKING.hpp => HALO_PACKING.hpp} (92%) rename src/comm/{HALOPACKING_FUSED-Cuda.cpp => HALO_PACKING_FUSED-Cuda.cpp} (88%) rename src/comm/{HALOPACKING_FUSED-Hip.cpp => HALO_PACKING_FUSED-Hip.cpp} (88%) rename src/comm/{HALOPACKING_FUSED-OMP.cpp => HALO_PACKING_FUSED-OMP.cpp} (93%) rename src/comm/{HALOPACKING_FUSED-OMPTarget.cpp => HALO_PACKING_FUSED-OMPTarget.cpp} (90%) rename src/comm/{HALOPACKING_FUSED-Seq.cpp => HALO_PACKING_FUSED-Seq.cpp} (92%) rename src/comm/{HALOPACKING_FUSED.cpp => HALO_PACKING_FUSED.cpp} (92%) rename src/comm/{HALOPACKING_FUSED.hpp => HALO_PACKING_FUSED.hpp} (90%) rename src/comm/{HALOSENDRECV-Cuda.cpp => HALO_SENDRECV-Cuda.cpp} (86%) rename src/comm/{HALOSENDRECV-Hip.cpp => HALO_SENDRECV-Hip.cpp} (86%) rename src/comm/{HALOSENDRECV-OMP.cpp => HALO_SENDRECV-OMP.cpp} (87%) rename src/comm/{HALOSENDRECV-OMPTarget.cpp => HALO_SENDRECV-OMPTarget.cpp} (87%) rename src/comm/{HALOSENDRECV-Seq.cpp => HALO_SENDRECV-Seq.cpp} (87%) rename src/comm/{HALOSENDRECV.cpp => HALO_SENDRECV.cpp} (90%) rename src/comm/{HALOSENDRECV.hpp => HALO_SENDRECV.hpp} (92%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cec242d23..c74ef07e3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -245,21 +245,21 @@ blt_add_executable( algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp comm/HALO_base.cpp - comm/HALOPACKING.cpp - comm/HALOPACKING-Seq.cpp - comm/HALOPACKING-OMPTarget.cpp - comm/HALOPACKING_FUSED.cpp - comm/HALOPACKING_FUSED-Seq.cpp - comm/HALOPACKING_FUSED-OMPTarget.cpp - comm/HALOSENDRECV.cpp - comm/HALOSENDRECV-Seq.cpp - comm/HALOSENDRECV-OMPTarget.cpp - comm/HALOEXCHANGE.cpp - comm/HALOEXCHANGE-Seq.cpp - comm/HALOEXCHANGE-OMPTarget.cpp - comm/HALOEXCHANGE_FUSED.cpp - comm/HALOEXCHANGE_FUSED-Seq.cpp - comm/HALOEXCHANGE_FUSED-OMPTarget.cpp + comm/HALO_PACKING.cpp + comm/HALO_PACKING-Seq.cpp + comm/HALO_PACKING-OMPTarget.cpp + comm/HALO_PACKING_FUSED.cpp + comm/HALO_PACKING_FUSED-Seq.cpp + comm/HALO_PACKING_FUSED-OMPTarget.cpp + comm/HALO_SENDRECV.cpp + comm/HALO_SENDRECV-Seq.cpp + comm/HALO_SENDRECV-OMPTarget.cpp + comm/HALO_EXCHANGE.cpp + comm/HALO_EXCHANGE-Seq.cpp + comm/HALO_EXCHANGE-OMPTarget.cpp + comm/HALO_EXCHANGE_FUSED.cpp + comm/HALO_EXCHANGE_FUSED-Seq.cpp + comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) install( TARGETS raja-perf-omptarget.exe diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index c431d9221..467436594 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -9,35 +9,35 @@ blt_add_library( NAME comm SOURCES HALO_base.cpp - HALOPACKING.cpp - HALOPACKING-Seq.cpp - HALOPACKING-Hip.cpp - HALOPACKING-Cuda.cpp - HALOPACKING-OMP.cpp - HALOPACKING-OMPTarget.cpp - HALOPACKING_FUSED.cpp - HALOPACKING_FUSED-Seq.cpp - HALOPACKING_FUSED-Hip.cpp - HALOPACKING_FUSED-Cuda.cpp - HALOPACKING_FUSED-OMP.cpp - HALOPACKING_FUSED-OMPTarget.cpp - HALOSENDRECV.cpp - HALOSENDRECV-Seq.cpp - HALOSENDRECV-Hip.cpp - HALOSENDRECV-Cuda.cpp - HALOSENDRECV-OMP.cpp - HALOSENDRECV-OMPTarget.cpp - 
HALOEXCHANGE.cpp - HALOEXCHANGE-Seq.cpp - HALOEXCHANGE-Hip.cpp - HALOEXCHANGE-Cuda.cpp - HALOEXCHANGE-OMP.cpp - HALOEXCHANGE-OMPTarget.cpp - HALOEXCHANGE_FUSED.cpp - HALOEXCHANGE_FUSED-Seq.cpp - HALOEXCHANGE_FUSED-Hip.cpp - HALOEXCHANGE_FUSED-Cuda.cpp - HALOEXCHANGE_FUSED-OMP.cpp - HALOEXCHANGE_FUSED-OMPTarget.cpp + HALO_PACKING.cpp + HALO_PACKING-Seq.cpp + HALO_PACKING-Hip.cpp + HALO_PACKING-Cuda.cpp + HALO_PACKING-OMP.cpp + HALO_PACKING-OMPTarget.cpp + HALO_PACKING_FUSED.cpp + HALO_PACKING_FUSED-Seq.cpp + HALO_PACKING_FUSED-Hip.cpp + HALO_PACKING_FUSED-Cuda.cpp + HALO_PACKING_FUSED-OMP.cpp + HALO_PACKING_FUSED-OMPTarget.cpp + HALO_SENDRECV.cpp + HALO_SENDRECV-Seq.cpp + HALO_SENDRECV-Hip.cpp + HALO_SENDRECV-Cuda.cpp + HALO_SENDRECV-OMP.cpp + HALO_SENDRECV-OMPTarget.cpp + HALO_EXCHANGE.cpp + HALO_EXCHANGE-Seq.cpp + HALO_EXCHANGE-Hip.cpp + HALO_EXCHANGE-Cuda.cpp + HALO_EXCHANGE-OMP.cpp + HALO_EXCHANGE-OMPTarget.cpp + HALO_EXCHANGE_FUSED.cpp + HALO_EXCHANGE_FUSED-Seq.cpp + HALO_EXCHANGE_FUSED-Hip.cpp + HALO_EXCHANGE_FUSED-Cuda.cpp + HALO_EXCHANGE_FUSED-OMP.cpp + HALO_EXCHANGE_FUSED-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/comm/HALOEXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp similarity index 85% rename from src/comm/HALOEXCHANGE-Cuda.cpp rename to src/comm/HALO_EXCHANGE-Cuda.cpp index 437e64572..0a2078f46 100644 --- a/src/comm/HALOEXCHANGE-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) +void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_EXCHANGE_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -75,7 +75,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_pack<<>>(buffer, list, var, len); + HALO_exchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -109,7 +109,7 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_unpack<<>>(buffer, list, var, len); + HALO_exchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -140,12 +140,12 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) Index_type 
len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -175,12 +175,12 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -192,11 +192,11 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp similarity index 86% rename from src/comm/HALOEXCHANGE-Hip.cpp rename to src/comm/HALO_EXCHANGE-Hip.cpp index 56841c3e6..3a78dc389 100644 --- a/src/comm/HALOEXCHANGE-Hip.cpp +++ b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void HALOEXCHANGE::runHipVariantImpl(VariantID vid) +void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_DATA_SETUP; + HALO_EXCHANGE_DATA_SETUP; if ( vid == Base_HIP ) { @@ -75,7 +75,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -110,7 +110,7 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 
0; - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -142,12 +142,12 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -177,12 +177,12 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -194,11 +194,11 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp similarity index 91% rename from src/comm/HALOEXCHANGE-OMP.cpp rename to src/comm/HALO_EXCHANGE-OMP.cpp index ccad6969f..44a55db92 100644 --- a/src/comm/HALOEXCHANGE-OMP.cpp +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -20,13 +20,13 @@ namespace comm { -void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_EXCHANGE_DATA_SETUP; switch ( vid ) { @@ -112,12 +112,12 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + HALO_exchange_pack_base_lam(i); } buffer += len; } @@ -147,12 +147,12 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + HALO_exchange_unpack_base_lam(i); } buffer += len; } @@ -185,12 +185,12 @@ void 
HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -219,12 +219,12 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -238,7 +238,7 @@ void HALOEXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp similarity index 91% rename from src/comm/HALOEXCHANGE-OMPTarget.cpp rename to src/comm/HALO_EXCHANGE-OMPTarget.cpp index c1e989a7d..0a0340810 100644 --- a/src/comm/HALOEXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_EXCHANGE_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -116,12 +116,12 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -150,12 +150,12 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -166,7 +166,7 @@ void HALOEXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ stopTimer(); } else { - getCout() << "\n HALOEXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp similarity index 91% rename from src/comm/HALOEXCHANGE-Seq.cpp rename to src/comm/HALO_EXCHANGE-Seq.cpp index b94d9c459..62337fa1a 100644 
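For readers skimming the renamed variants: every file in this patch specializes the same gather/scatter pattern, where HALO_PACK_BODY and HALO_UNPACK_BODY are macros defined alongside HALO_base that copy between a contiguous message buffer and a variable array through an index list. A minimal standalone sketch of that baseline (plain C++; the toy types and data stand in for the suite's Real_ptr/Int_ptr typedefs and are assumptions, not the suite's driver code):

  #include <cstdio>

  using Real_type  = double;   // stand-in for the suite's Real_type
  using Index_type = int;      // stand-in for the suite's Index_type

  // Pack: gather var values at the indices in `list` into a contiguous buffer,
  // mirroring what HALO_PACK_BODY does per element.
  void pack(Real_type* buffer, const Index_type* list, const Real_type* var,
            Index_type len) {
    for (Index_type i = 0; i < len; ++i) {
      buffer[i] = var[list[i]];
    }
  }

  // Unpack: scatter buffer values back into var, mirroring HALO_UNPACK_BODY.
  void unpack(const Real_type* buffer, const Index_type* list, Real_type* var,
              Index_type len) {
    for (Index_type i = 0; i < len; ++i) {
      var[list[i]] = buffer[i];
    }
  }

  int main() {
    Real_type var[6] = {10, 11, 12, 13, 14, 15};
    Index_type list[4] = {1, 2, 4, 5};   // halo cells scattered through var
    Real_type buffer[4];
    pack(buffer, list, var, 4);          // buffer = {11, 12, 14, 15}
    unpack(buffer, list, var, 4);        // round-trips the same values
    std::printf("%g %g\n", buffer[0], buffer[3]);
    return 0;
  }

Packing through an index list keeps each message contiguous even though the halo cells themselves are scattered through the variable array; every Seq/OMP/CUDA/HIP variant in this patch is some parallelization of these two loops.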
--- a/src/comm/HALOEXCHANGE-Seq.cpp +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -20,11 +20,11 @@ namespace comm { -void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_DATA_SETUP; + HALO_EXCHANGE_DATA_SETUP; switch ( vid ) { @@ -109,11 +109,11 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + HALO_exchange_pack_base_lam(i); } buffer += len; } @@ -143,11 +143,11 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + HALO_exchange_unpack_base_lam(i); } buffer += len; } @@ -180,12 +180,12 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -214,12 +214,12 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -234,7 +234,7 @@ void HALOEXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp similarity index 92% rename from src/comm/HALOEXCHANGE.cpp rename to src/comm/HALO_EXCHANGE.cpp index 929f77dcc..87cab85ab 100644 --- a/src/comm/HALOEXCHANGE.cpp +++ b/src/comm/HALO_EXCHANGE.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE.hpp" +#include "HALO_EXCHANGE.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOEXCHANGE, params) +HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank 
= params.getMPIRank(); @@ -62,11 +62,11 @@ HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) } } -HALOEXCHANGE::~HALOEXCHANGE() +HALO_EXCHANGE::~HALO_EXCHANGE() { } -void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -111,14 +111,14 @@ void HALOEXCHANGE::setUp(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void HALOEXCHANGE::tearDown(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/HALOEXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp similarity index 93% rename from src/comm/HALOEXCHANGE.hpp rename to src/comm/HALO_EXCHANGE.hpp index 8bfc80ed6..5474a6837 100644 --- a/src/comm/HALOEXCHANGE.hpp +++ b/src/comm/HALO_EXCHANGE.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE kernel reference implementation: +/// HALO_EXCHANGE kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -56,10 +56,10 @@ /// -#ifndef RAJAPerf_Comm_HALOEXCHANGE_HPP -#define RAJAPerf_Comm_HALOEXCHANGE_HPP +#ifndef RAJAPerf_Comm_HALO_EXCHANGE_HPP +#define RAJAPerf_Comm_HALO_EXCHANGE_HPP -#define HALOEXCHANGE_DATA_SETUP \ +#define HALO_EXCHANGE_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -95,13 +95,13 @@ namespace rajaperf namespace comm { -class HALOEXCHANGE : public HALO_base +class HALO_EXCHANGE : public HALO_base { public: - HALOEXCHANGE(const RunParams& params); + HALO_EXCHANGE(const RunParams& params); - ~HALOEXCHANGE(); + ~HALO_EXCHANGE(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp similarity index 89% rename from src/comm/HALOEXCHANGE_FUSED-Cuda.cpp rename to src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index 6a59be4fd..bd7e41578 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* 
pack_list_ptrs, +__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) +void HALO_EXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_EXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -133,7 +133,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( + HALO_exchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { @@ -180,7 +180,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( + HALO_exchange_fused_unpack<<>>( unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -190,7 +190,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; } else if ( vid == RAJA_CUDA ) { @@ -239,12 +239,12 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -279,12 +279,12 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -298,11 +298,11 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda 
variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE_FUSED, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp similarity index 89% rename from src/comm/HALOEXCHANGE_FUSED-Hip.cpp rename to src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index fb196e63d..f11f0488c 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, +__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) +void HALO_EXCHANGE_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_EXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_HIP ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -133,7 +133,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, 
pack_len_ptrs); hipErrchk( hipGetLastError() ); if (separate_buffers) { @@ -180,7 +180,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -190,7 +190,7 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; } else if ( vid == RAJA_HIP ) { @@ -243,12 +243,12 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -283,12 +283,12 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -302,11 +302,11 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_EXCHANGE_FUSED, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOEXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp similarity index 94% rename from src/comm/HALOEXCHANGE_FUSED-OMP.cpp rename to src/comm/HALO_EXCHANGE_FUSED-OMP.cpp index 95b03ccec..9711ee29a 100644 --- a/src/comm/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -20,19 +20,19 @@ namespace comm { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_EXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { @@ -155,14 +155,14 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -277,7 +277,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -330,12 +330,12 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -369,12 +369,12 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -390,7 +390,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ } default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp similarity index 91% rename from src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp rename to src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp index 9b126393e..2afbf0bf8 100644 --- a/src/comm/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -26,7 +26,7 @@ namespace comm // //const size_t threads_per_team = 256; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace comm Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * 
num_vars); -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_EXCHANGE_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -100,7 +100,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -157,7 +157,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U buffer += len; } } - HALOEXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -180,7 +180,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -230,12 +230,12 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -269,12 +269,12 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -287,7 +287,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U stopTimer(); } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp similarity index 93% rename from src/comm/HALOEXCHANGE_FUSED-Seq.cpp rename to 
src/comm/HALO_EXCHANGE_FUSED-Seq.cpp index ca7798674..7e0f44dfd 100644 --- a/src/comm/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -20,17 +20,17 @@ namespace comm { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_EXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOEXCHANGE_FUSED_DATA_SETUP; + HALO_EXCHANGE_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -115,7 +115,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -123,7 +123,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -204,7 +204,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } stopTimer(); - HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -257,12 +257,12 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -296,12 +296,12 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -318,7 +318,7 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_EXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOEXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp similarity index 91% rename from src/comm/HALOEXCHANGE_FUSED.cpp rename to src/comm/HALO_EXCHANGE_FUSED.cpp index 87869ebb2..855868ba8 100644 --- a/src/comm/HALOEXCHANGE_FUSED.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOEXCHANGE_FUSED.hpp" +#include "HALO_EXCHANGE_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { 
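To summarize the fused GPU kernels renamed above (HALO_exchange_fused_pack/unpack in the Cuda and Hip files): the manual fuser flattens the (neighbor, variable) loop nest into a single launch, using blockIdx.y to select a segment and a grid-stride loop in x whose grid is sized from the average segment length. A condensed sketch of that mapping (CUDA C++; the function name and the double/int types are placeholders, not the suite's typedefs):

  template < size_t block_size >
  __launch_bounds__(block_size)
  __global__ void fused_pack_sketch(double** buffers, int** lists,
                                    double** vars, int* lens)
  {
    int j = blockIdx.y;            // one (neighbor, variable) segment per y row
    double* buffer = buffers[j];
    int*    list   = lists[j];
    double* var    = vars[j];
    int     len    = lens[j];
    // Grid-stride in x: the x-grid is sized for the *average* segment length,
    // so longer-than-average segments take extra trips around the loop and
    // shorter ones simply exit early.
    for (int i = threadIdx.x + blockIdx.x * block_size;
         i < len;
         i += block_size * gridDim.x) {
      buffer[i] = var[list[i]];
    }
  }

The pinned DataSpace allocations in the setup macros (CudaPinned, HipPinnedCoarse) fit this design: the host can refill the per-segment pointer and length arrays each repetition and the single kernel reads them directly, without a separate device copy per segment.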
-HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOEXCHANGE_FUSED, params) +HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_EXCHANGE_FUSED, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -62,11 +62,11 @@ HALOEXCHANGE_FUSED::HALOEXCHANGE_FUSED(const RunParams& params) } } -HALOEXCHANGE_FUSED::~HALOEXCHANGE_FUSED() +HALO_EXCHANGE_FUSED::~HALO_EXCHANGE_FUSED() { } -void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -111,14 +111,14 @@ void HALOEXCHANGE_FUSED::setUp(VariantID vid, size_t tune_idx) } } -void HALOEXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); } } -void HALOEXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) +void HALO_EXCHANGE_FUSED::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/HALOEXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp similarity index 91% rename from src/comm/HALOEXCHANGE_FUSED.hpp rename to src/comm/HALO_EXCHANGE_FUSED.hpp index f4f4a72d9..3d9d02a31 100644 --- a/src/comm/HALOEXCHANGE_FUSED.hpp +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOEXCHANGE_FUSED kernel reference implementation: +/// HALO_EXCHANGE_FUSED kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -59,10 +59,10 @@ /// MPI_Waitall(num_neighbors, pack_mpi_requests.data(), MPI_STATUSES_IGNORE); /// -#ifndef RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP -#define RAJAPerf_Comm_HALOEXCHANGE_FUSED_HPP +#ifndef RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP +#define RAJAPerf_Comm_HALO_EXCHANGE_FUSED_HPP -#define HALOEXCHANGE_FUSED_DATA_SETUP \ +#define HALO_EXCHANGE_FUSED_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -83,7 +83,7 @@ std::vector send_buffers = m_send_buffers; \ std::vector recv_buffers = m_recv_buffers; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_SETUP \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -94,14 +94,14 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ HALO_PACK_BODY; \ @@ -121,7 +121,7 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOEXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALO_EXCHANGE_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ @@ -139,13 +139,13 @@ namespace rajaperf namespace comm { 
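One subtlety in the MANUAL_LAMBDA_FUSER macros renamed above: pack_lambdas is raw malloc storage reinterpret_cast to the closure type, because closures have no default constructor, so each slot has to be populated with placement new before use. A self-contained illustration of that idiom (plain C++; the toy data and loop bounds are invented for the example):

  #include <cstdlib>
  #include <new>

  int main() {
    auto make_pack_lambda = [](double* buffer, int* list, double* var) {
      return [=](int i) { buffer[i] = var[list[i]]; };
    };
    using pack_lambda_type =
        decltype(make_pack_lambda(nullptr, nullptr, nullptr));

    const int n = 8;
    // Raw storage, as in the macro: sized and typed for the closure but
    // never default-constructed.
    pack_lambda_type* pack_lambdas = reinterpret_cast<pack_lambda_type*>(
        std::malloc(sizeof(pack_lambda_type) * n));

    double buf[4];
    double var[4]  = {1, 2, 3, 4};
    int    list[4] = {3, 2, 1, 0};
    for (int j = 0; j < n; ++j) {
      // Placement-new each closure into its preallocated slot.
      new (&pack_lambdas[j]) pack_lambda_type(make_pack_lambda(buf, list, var));
    }
    for (int i = 0; i < 4; ++i) { pack_lambdas[0](i); }  // run one of them

    std::free(pack_lambdas);
    return 0;
  }

Because these closures capture only raw pointers they are trivially destructible, which is why the teardown macro can free the storage without running destructors.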
-class HALOEXCHANGE_FUSED : public HALO_base +class HALO_EXCHANGE_FUSED : public HALO_base { public: - HALOEXCHANGE_FUSED(const RunParams& params); + HALO_EXCHANGE_FUSED(const RunParams& params); - ~HALOEXCHANGE_FUSED(); + ~HALO_EXCHANGE_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALOPACKING-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp similarity index 82% rename from src/comm/HALOPACKING-Cuda.cpp rename to src/comm/HALO_PACKING-Cuda.cpp index 89f04ef9a..393a8cb95 100644 --- a/src/comm/HALOPACKING-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void HALOPACKING::runCudaVariantImpl(VariantID vid) +void HALO_PACKING::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOPACKING_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -69,7 +69,7 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_pack<<>>(buffer, list, var, len); + HALO_exchange_pack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -98,7 +98,7 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - haloexchange_unpack<<>>(buffer, list, var, len); + HALO_exchange_unpack<<>>(buffer, list, var, len); cudaErrchk( cudaGetLastError() ); buffer += len; } @@ -121,12 +121,12 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -151,12 +151,12 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; 
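For context on the RAJA_CUDA blocks above and below: RAJA::forall takes the execution policy as a template parameter and accepts a resource, a typed index range, and the loop body as a device lambda, which is how one body text serves both the Cuda and Hip files. A condensed sketch of the call shape (assumes a RAJA build with CUDA enabled; pack_with_raja and its arguments are illustrative, not the suite's code):

  #include "RAJA/RAJA.hpp"

  template < size_t block_size >
  void pack_with_raja(RAJA::resources::Cuda res,
                      double* buffer, int* list, double* var, int len)
  {
    auto pack_lam = [=] __device__ (int i) {
      buffer[i] = var[list[i]];        // same gather as the base kernels
    };
    // cuda_exec_async enqueues the kernel on res's stream and returns
    // without synchronizing; the caller decides when to wait.
    RAJA::forall< RAJA::cuda_exec_async<block_size> >(
        res, RAJA::TypedRangeSegment<int>(0, len), pack_lam );
  }

The async policy is what makes the per-segment loops in these variants cheap to issue: one small launch per (neighbor, variable) segment, back to back, with no stall between them.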
} } @@ -166,11 +166,11 @@ void HALOPACKING::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOPACKING : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOPACKING-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp similarity index 83% rename from src/comm/HALOPACKING-Hip.cpp rename to src/comm/HALO_PACKING-Hip.cpp index 9579a5133..0be88599e 100644 --- a/src/comm/HALOPACKING-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void haloexchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -47,13 +47,13 @@ __global__ void haloexchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > -void HALOPACKING::runHipVariantImpl(VariantID vid) +void HALO_PACKING::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOPACKING_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_HIP ) { @@ -69,7 +69,7 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -99,7 +99,7 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); hipErrchk( hipGetLastError() ); buffer += len; @@ -123,12 +123,12 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -153,12 +153,12 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < 
num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -168,11 +168,11 @@ void HALOPACKING::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOPACKING : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOPACKING-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp similarity index 88% rename from src/comm/HALOPACKING-OMP.cpp rename to src/comm/HALO_PACKING-OMP.cpp index 5fbb44133..165688a3e 100644 --- a/src/comm/HALOPACKING-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,13 +18,13 @@ namespace comm { -void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOPACKING_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -90,12 +90,12 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + HALO_exchange_pack_base_lam(i); } buffer += len; } @@ -119,12 +119,12 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + HALO_exchange_unpack_base_lam(i); } buffer += len; } @@ -149,12 +149,12 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -177,12 +177,12 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += 
len; } } @@ -194,7 +194,7 @@ void HALOPACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } default : { - getCout() << "\n HALOPACKING : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp similarity index 89% rename from src/comm/HALOPACKING-OMPTarget.cpp rename to src/comm/HALO_PACKING-OMPTarget.cpp index 752f22755..cb0c818aa 100644 --- a/src/comm/HALOPACKING-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOPACKING_DATA_SETUP; + HALO_PACKING_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -96,12 +96,12 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -124,12 +124,12 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -138,7 +138,7 @@ void HALOPACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A stopTimer(); } else { - getCout() << "\n HALOPACKING : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp similarity index 88% rename from src/comm/HALOPACKING-Seq.cpp rename to src/comm/HALO_PACKING-Seq.cpp index 5db4c37e2..1b8194390 100644 --- a/src/comm/HALOPACKING-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -18,11 +18,11 @@ namespace comm { -void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOPACKING_DATA_SETUP; + HALO_PACKING_DATA_SETUP; switch ( vid ) { @@ -88,11 +88,11 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto 
HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); + HALO_exchange_pack_base_lam(i); } buffer += len; } @@ -116,11 +116,11 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); + HALO_exchange_unpack_base_lam(i); } buffer += len; } @@ -145,12 +145,12 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + HALO_exchange_pack_base_lam ); buffer += len; } @@ -173,12 +173,12 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + HALO_exchange_unpack_base_lam ); buffer += len; } } @@ -191,7 +191,7 @@ void HALOPACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOPACKING : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING.cpp b/src/comm/HALO_PACKING.cpp similarity index 93% rename from src/comm/HALOPACKING.cpp rename to src/comm/HALO_PACKING.cpp index fc335d1d9..7575bae43 100644 --- a/src/comm/HALOPACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING.hpp" +#include "HALO_PACKING.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +15,8 @@ namespace rajaperf namespace comm { -HALOPACKING::HALOPACKING(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOPACKING, params) +HALO_PACKING::HALO_PACKING(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING, params) { setDefaultReps(200); @@ -51,11 +51,11 @@ HALOPACKING::HALOPACKING(const RunParams& params) setVariantDefined( RAJA_HIP ); } -HALOPACKING::~HALOPACKING() +HALO_PACKING::~HALO_PACKING() { } -void HALOPACKING::setUp(VariantID vid, size_t tune_idx) +void HALO_PACKING::setUp(VariantID vid, size_t tune_idx) { int my_mpi_rank = 0; const int mpi_dims[3] = {1,1,1}; @@ -102,7 +102,7 @@ void HALOPACKING::setUp(VariantID vid, size_t tune_idx) } } -void HALOPACKING::updateChecksum(VariantID vid, size_t tune_idx) +void HALO_PACKING::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, m_var_size, vid); @@ -120,7 +120,7 @@ void HALOPACKING::updateChecksum(VariantID vid, size_t tune_idx) } } -void HALOPACKING::tearDown(VariantID vid, size_t tune_idx) +void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git 
a/src/comm/HALOPACKING.hpp b/src/comm/HALO_PACKING.hpp similarity index 92% rename from src/comm/HALOPACKING.hpp rename to src/comm/HALO_PACKING.hpp index 507aaf622..e14199a4d 100644 --- a/src/comm/HALOPACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOPACKING kernel reference implementation: +/// HALO_PACKING kernel reference implementation: /// /// // pack a buffer for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -40,10 +40,10 @@ /// } /// -#ifndef RAJAPerf_Comm_HALOPACKING_HPP -#define RAJAPerf_Comm_HALOPACKING_HPP +#ifndef RAJAPerf_Comm_HALO_PACKING_HPP +#define RAJAPerf_Comm_HALO_PACKING_HPP -#define HALOPACKING_DATA_SETUP \ +#define HALO_PACKING_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -71,13 +71,13 @@ namespace rajaperf namespace comm { -class HALOPACKING : public HALO_base +class HALO_PACKING : public HALO_base { public: - HALOPACKING(const RunParams& params); + HALO_PACKING(const RunParams& params); - ~HALOPACKING(); + ~HALO_PACKING(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALOPACKING_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp similarity index 88% rename from src/comm/HALOPACKING_FUSED-Cuda.cpp rename to src/comm/HALO_PACKING_FUSED-Cuda.cpp index 4c76b331a..e4e596e7a 100644 --- a/src/comm/HALOPACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace rajaperf namespace comm { -#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \ deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \ deallocData(DataSpace::CudaPinned, pack_list_ptrs); \ deallocData(DataSpace::CudaPinned, pack_var_ptrs); \ @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, +__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) +void HALO_PACKING_FUSED::runCudaVariantImpl(VariantID vid) { const 
Index_type run_reps = getRunReps(); auto res{getCudaResource()}; - HALOPACKING_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_CUDA ) { - HALOPACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -127,7 +127,7 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( + HALO_exchange_fused_pack<<>>( pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { @@ -167,7 +167,7 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( + HALO_exchange_fused_unpack<<>>( unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -175,7 +175,7 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) } stopTimer(); - HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA; } else if ( vid == RAJA_CUDA ) { @@ -218,12 +218,12 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -251,12 +251,12 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -268,11 +268,11 @@ void HALOPACKING_FUSED::runCudaVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOPACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING_FUSED, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING_FUSED, Cuda) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOPACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp similarity index 88% rename from src/comm/HALOPACKING_FUSED-Hip.cpp rename to src/comm/HALO_PACKING_FUSED-Hip.cpp index 0928fb6d7..076e3d4e4 100644 --- a/src/comm/HALOPACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -21,7 +21,7 @@ namespace 
rajaperf namespace comm { -#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP \ Real_ptr* pack_buffer_ptrs; \ Int_ptr* pack_list_ptrs; \ Real_ptr* pack_var_ptrs; \ @@ -39,7 +39,7 @@ namespace comm allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \ allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars); -#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP \ deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \ deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \ @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, +__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -89,17 +89,17 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) +void HALO_PACKING_FUSED::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); auto res{getHipResource()}; - HALOPACKING_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_HIP ) { - HALOPACKING_FUSED_MANUAL_FUSER_SETUP_HIP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -127,7 +127,7 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); hipErrchk( hipGetLastError() ); if (separate_buffers) { @@ -167,7 +167,7 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), + hipLaunchKernelGGL((HALO_exchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -175,7 +175,7 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) } stopTimer(); - HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; + 
HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_HIP; } else if ( vid == RAJA_HIP ) { @@ -222,12 +222,12 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -255,12 +255,12 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -272,11 +272,11 @@ void HALOPACKING_FUSED::runHipVariantImpl(VariantID vid) stopTimer(); } else { - getCout() << "\n HALOPACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOPACKING_FUSED, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALO_PACKING_FUSED, Hip) } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALOPACKING_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp similarity index 93% rename from src/comm/HALOPACKING_FUSED-OMP.cpp rename to src/comm/HALO_PACKING_FUSED-OMP.cpp index d9f224063..9db7e2b59 100644 --- a/src/comm/HALOPACKING_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,19 +18,19 @@ namespace comm { -void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOPACKING_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_OpenMP : { - HALOPACKING_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -138,14 +138,14 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } case Lambda_OpenMP : { - HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -245,7 +245,7 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } stopTimer(); - HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -292,12 +292,12 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto 
haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -324,12 +324,12 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -343,7 +343,7 @@ void HALOPACKING_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } default : { - getCout() << "\n HALOPACKING_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp similarity index 90% rename from src/comm/HALOPACKING_FUSED-OMPTarget.cpp rename to src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index 7499563a2..ca92fe897 100644 --- a/src/comm/HALOPACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -26,7 +26,7 @@ namespace comm // //const size_t threads_per_team = 256; -#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET \ void** pack_ptrs; \ allocData(DataSpace::OmpTarget, pack_ptrs, 4 * num_neighbors * num_vars); \ Real_ptr* pack_buffer_ptrs = reinterpret_cast(pack_ptrs) + 0 * num_neighbors * num_vars; \ @@ -50,28 +50,28 @@ namespace comm Real_ptr* h_unpack_var_ptrs = reinterpret_cast(h_unpack_ptrs) + 2 * num_neighbors * num_vars; \ Index_type* h_unpack_len_ptrs = reinterpret_cast(h_unpack_ptrs) + 3 * num_neighbors * num_vars; -#define HALOPACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET \ initOpenMPDeviceData(pack_ptrs, h_pack_ptrs, 4 * num_neighbors * num_vars); -#define HALOPACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET \ initOpenMPDeviceData(unpack_ptrs, h_unpack_ptrs, 4 * num_neighbors * num_vars); -#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET \ deallocData(DataSpace::OmpTarget, pack_ptrs); \ delete[] h_pack_ptrs; \ deallocData(DataSpace::OmpTarget, unpack_ptrs); \ delete[] h_unpack_ptrs; -void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOPACKING_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { - HALOPACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_OMP_TARGET; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -94,7 +94,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN buffer += 
len; } } - HALOPACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_PACK_OMP_TARGET; Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; #pragma omp target is_device_ptr(pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -144,7 +144,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN buffer += len; } } - HALOPACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_COPY_UNPACK_OMP_TARGET; Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; #pragma omp target is_device_ptr(unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs) device( did ) #pragma omp teams distribute parallel for collapse(2) schedule(static, 1) @@ -165,7 +165,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN } stopTimer(); - HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; } else if ( vid == RAJA_OpenMPTarget ) { @@ -209,12 +209,12 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -241,12 +241,12 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -257,7 +257,7 @@ void HALOPACKING_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN stopTimer(); } else { - getCout() << "\n HALOPACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp similarity index 92% rename from src/comm/HALOPACKING_FUSED-Seq.cpp rename to src/comm/HALO_PACKING_FUSED-Seq.cpp index 6331bdc6d..2a4de8b8a 100644 --- a/src/comm/HALOPACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -18,17 +18,17 @@ namespace comm { -void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_PACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOPACKING_FUSED_DATA_SETUP; + HALO_PACKING_FUSED_DATA_SETUP; switch ( vid ) { case Base_Seq : { - HALOPACKING_FUSED_MANUAL_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -98,7 +98,7 @@ void 
HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } stopTimer(); - HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; break; } @@ -106,7 +106,7 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { - HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -172,7 +172,7 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } stopTimer(); - HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; + HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN; break; } @@ -219,12 +219,12 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; pool_pack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + HALO_exchange_fused_pack_base_lam ); buffer += len; } } @@ -251,12 +251,12 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + auto HALO_exchange_fused_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; pool_unpack.enqueue( RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + HALO_exchange_fused_unpack_base_lam ); buffer += len; } } @@ -271,7 +271,7 @@ void HALOPACKING_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( #endif // RUN_RAJA_SEQ default : { - getCout() << "\n HALOPACKING_FUSED : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOPACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp similarity index 92% rename from src/comm/HALOPACKING_FUSED.cpp rename to src/comm/HALO_PACKING_FUSED.cpp index 257b5cc06..54ed08ec3 100644 --- a/src/comm/HALOPACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOPACKING_FUSED.hpp" +#include "HALO_PACKING_FUSED.hpp" #include "RAJA/RAJA.hpp" @@ -15,8 +15,8 @@ namespace rajaperf namespace comm { -HALOPACKING_FUSED::HALOPACKING_FUSED(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOPACKING_FUSED, params) +HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_PACKING_FUSED, params) { setDefaultReps(200); @@ -51,11 +51,11 @@ HALOPACKING_FUSED::HALOPACKING_FUSED(const RunParams& params) setVariantDefined( RAJA_HIP ); } -HALOPACKING_FUSED::~HALOPACKING_FUSED() +HALO_PACKING_FUSED::~HALO_PACKING_FUSED() { } -void HALOPACKING_FUSED::setUp(VariantID vid, size_t tune_idx) +void HALO_PACKING_FUSED::setUp(VariantID vid, size_t tune_idx) { int my_mpi_rank = 0; const int mpi_dims[3] = {1,1,1}; @@ -102,7 +102,7 @@ void HALOPACKING_FUSED::setUp(VariantID vid, size_t tune_idx) } } -void HALOPACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) +void HALO_PACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) { for (Real_ptr var : m_vars) { checksum[vid][tune_idx] += calcChecksum(var, 
m_var_size, vid); @@ -120,7 +120,7 @@ void HALOPACKING_FUSED::updateChecksum(VariantID vid, size_t tune_idx) } } -void HALOPACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) +void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/HALOPACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp similarity index 90% rename from src/comm/HALOPACKING_FUSED.hpp rename to src/comm/HALO_PACKING_FUSED.hpp index 804c4720e..b881731a8 100644 --- a/src/comm/HALOPACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOPACKING_FUSED kernel reference implementation: +/// HALO_PACKING_FUSED kernel reference implementation: /// /// // pack buffers for neighbors /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -40,10 +40,10 @@ /// } /// -#ifndef RAJAPerf_Comm_HALOPACKING_FUSED_HPP -#define RAJAPerf_Comm_HALOPACKING_FUSED_HPP +#ifndef RAJAPerf_Comm_HALO_PACKING_FUSED_HPP +#define RAJAPerf_Comm_HALO_PACKING_FUSED_HPP -#define HALOPACKING_FUSED_DATA_SETUP \ +#define HALO_PACKING_FUSED_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -59,7 +59,7 @@ std::vector send_buffers = m_send_buffers; \ std::vector recv_buffers = m_recv_buffers; -#define HALOPACKING_FUSED_MANUAL_FUSER_SETUP \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \ struct ptr_holder { \ Real_ptr buffer; \ Int_ptr list; \ @@ -70,14 +70,14 @@ ptr_holder* unpack_ptr_holders = new ptr_holder[num_neighbors * num_vars]; \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOPACKING_FUSED_MANUAL_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN \ delete[] pack_ptr_holders; \ delete[] pack_lens; \ delete[] unpack_ptr_holders; \ delete[] unpack_lens; -#define HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_SETUP \ auto make_pack_lambda = [](Real_ptr buffer, Int_ptr list, Real_ptr var) { \ return [=](Index_type i) { \ HALO_PACK_BODY; \ @@ -97,7 +97,7 @@ malloc(sizeof(unpack_lambda_type) * (num_neighbors * num_vars))); \ Index_type* unpack_lens = new Index_type[num_neighbors * num_vars]; -#define HALOPACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ +#define HALO_PACKING_FUSED_MANUAL_LAMBDA_FUSER_TEARDOWN \ free(pack_lambdas); \ delete[] pack_lens; \ free(unpack_lambdas); \ @@ -115,13 +115,13 @@ namespace rajaperf namespace comm { -class HALOPACKING_FUSED : public HALO_base +class HALO_PACKING_FUSED : public HALO_base { public: - HALOPACKING_FUSED(const RunParams& params); + HALO_PACKING_FUSED(const RunParams& params); - ~HALOPACKING_FUSED(); + ~HALO_PACKING_FUSED(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALOSENDRECV-Cuda.cpp b/src/comm/HALO_SENDRECV-Cuda.cpp similarity index 86% rename from src/comm/HALOSENDRECV-Cuda.cpp rename to src/comm/HALO_SENDRECV-Cuda.cpp index 075b1996d..f6f182e60 100644 --- a/src/comm/HALOSENDRECV-Cuda.cpp +++ b/src/comm/HALO_SENDRECV-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -22,11 +22,11 @@ namespace comm { -void HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void 
HALO_SENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOSENDRECV_DATA_SETUP; + HALO_SENDRECV_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -53,7 +53,7 @@ void HALOSENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune stopTimer(); } else { - getCout() << "\n HALOSENDRECV : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n HALO_SENDRECV : Unknown Cuda variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOSENDRECV-Hip.cpp b/src/comm/HALO_SENDRECV-Hip.cpp similarity index 86% rename from src/comm/HALOSENDRECV-Hip.cpp rename to src/comm/HALO_SENDRECV-Hip.cpp index 31b830358..fc03ab487 100644 --- a/src/comm/HALOSENDRECV-Hip.cpp +++ b/src/comm/HALO_SENDRECV-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -22,11 +22,11 @@ namespace comm { -void HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_SENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOSENDRECV_DATA_SETUP; + HALO_SENDRECV_DATA_SETUP; if ( vid == Base_HIP ) { @@ -53,7 +53,7 @@ void HALOSENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ stopTimer(); } else { - getCout() << "\n HALOSENDRECV : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n HALO_SENDRECV : Unknown Hip variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOSENDRECV-OMP.cpp b/src/comm/HALO_SENDRECV-OMP.cpp similarity index 87% rename from src/comm/HALOSENDRECV-OMP.cpp rename to src/comm/HALO_SENDRECV-OMP.cpp index f8513f6d0..23efb5143 100644 --- a/src/comm/HALOSENDRECV-OMP.cpp +++ b/src/comm/HALO_SENDRECV-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -20,13 +20,13 @@ namespace comm { -void HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_SENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); - HALOSENDRECV_DATA_SETUP; + HALO_SENDRECV_DATA_SETUP; switch ( vid ) { @@ -58,7 +58,7 @@ void HALOSENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } default : { - getCout() << "\n HALOSENDRECV : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOSENDRECV-OMPTarget.cpp b/src/comm/HALO_SENDRECV-OMPTarget.cpp similarity index 87% rename from src/comm/HALOSENDRECV-OMPTarget.cpp rename to src/comm/HALO_SENDRECV-OMPTarget.cpp index f16b3b31d..e5a3f7723 100644 --- a/src/comm/HALOSENDRECV-OMPTarget.cpp +++ b/src/comm/HALO_SENDRECV-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -27,11 +27,11 @@ namespace comm const size_t threads_per_team = 256; -void HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void 
HALO_SENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOSENDRECV_DATA_SETUP; + HALO_SENDRECV_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -58,7 +58,7 @@ void HALOSENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ stopTimer(); } else { - getCout() << "\n HALOSENDRECV : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n HALO_SENDRECV : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOSENDRECV-Seq.cpp b/src/comm/HALO_SENDRECV-Seq.cpp similarity index 87% rename from src/comm/HALOSENDRECV-Seq.cpp rename to src/comm/HALO_SENDRECV-Seq.cpp index 9ec1583f5..050b6da70 100644 --- a/src/comm/HALOSENDRECV-Seq.cpp +++ b/src/comm/HALO_SENDRECV-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -20,11 +20,11 @@ namespace comm { -void HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALO_SENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); - HALOSENDRECV_DATA_SETUP; + HALO_SENDRECV_DATA_SETUP; switch ( vid ) { @@ -56,7 +56,7 @@ void HALOSENDRECV::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } default : { - getCout() << "\n HALOSENDRECV : Unknown variant id = " << vid << std::endl; + getCout() << "\n HALO_SENDRECV : Unknown variant id = " << vid << std::endl; } } diff --git a/src/comm/HALOSENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp similarity index 90% rename from src/comm/HALOSENDRECV.cpp rename to src/comm/HALO_SENDRECV.cpp index 36c54af8d..1aaddb2af 100644 --- a/src/comm/HALOSENDRECV.cpp +++ b/src/comm/HALO_SENDRECV.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "HALOSENDRECV.hpp" +#include "HALO_SENDRECV.hpp" #include "RAJA/RAJA.hpp" @@ -17,8 +17,8 @@ namespace rajaperf namespace comm { -HALOSENDRECV::HALOSENDRECV(const RunParams& params) - : HALO_base(rajaperf::Comm_HALOSENDRECV, params) +HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) + : HALO_base(rajaperf::Comm_HALO_SENDRECV, params) { m_mpi_size = params.getMPISize(); m_my_mpi_rank = params.getMPIRank(); @@ -51,11 +51,11 @@ HALOSENDRECV::HALOSENDRECV(const RunParams& params) } } -HALOSENDRECV::~HALOSENDRECV() +HALO_SENDRECV::~HALO_SENDRECV() { } -void HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) +void HALO_SENDRECV::setUp(VariantID vid, size_t tune_idx) { setUp_base(m_my_mpi_rank, m_mpi_dims.data(), vid, tune_idx); @@ -82,7 +82,7 @@ void HALOSENDRECV::setUp(VariantID vid, size_t tune_idx) } } -void HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) +void HALO_SENDRECV::updateChecksum(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); @@ -96,7 +96,7 @@ void HALOSENDRECV::updateChecksum(VariantID vid, size_t tune_idx) } } -void HALOSENDRECV::tearDown(VariantID vid, size_t tune_idx) +void HALO_SENDRECV::tearDown(VariantID vid, size_t tune_idx) { const bool separate_buffers = (getMPIDataSpace(vid) == DataSpace::Copy); diff --git a/src/comm/HALOSENDRECV.hpp b/src/comm/HALO_SENDRECV.hpp similarity index 92% rename from src/comm/HALOSENDRECV.hpp rename to src/comm/HALO_SENDRECV.hpp index 75f817dcb..a64ca4d12 
100644 --- a/src/comm/HALOSENDRECV.hpp +++ b/src/comm/HALO_SENDRECV.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOSENDRECV kernel reference implementation: +/// HALO_SENDRECV kernel reference implementation: /// /// // post a recv for each neighbor /// for (Index_type l = 0; l < num_neighbors; ++l) { @@ -56,10 +56,10 @@ /// -#ifndef RAJAPerf_Comm_HALOSENDRECV_HPP -#define RAJAPerf_Comm_HALOSENDRECV_HPP +#ifndef RAJAPerf_Comm_HALO_SENDRECV_HPP +#define RAJAPerf_Comm_HALO_SENDRECV_HPP -#define HALOSENDRECV_DATA_SETUP \ +#define HALO_SENDRECV_DATA_SETUP \ HALO_BASE_DATA_SETUP \ \ Index_type num_vars = m_num_vars; \ @@ -87,13 +87,13 @@ namespace rajaperf namespace comm { -class HALOSENDRECV : public HALO_base +class HALO_SENDRECV : public HALO_base { public: - HALOSENDRECV(const RunParams& params); + HALO_SENDRECV(const RunParams& params); - ~HALOSENDRECV(); + ~HALO_SENDRECV(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index f7cd2cd26..9ab07193a 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -7,39 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// HALOPACKING kernel reference implementation: -/// -/// // pack a buffer for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; -/// // pack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// buffer[i] = var[list[i]]; -/// } -/// buffer += len; -/// } -/// // send buffer to neighbor -/// } -/// -/// // unpack a buffer for each neighbor -/// for (Index_type l = 0; l < num_neighbors; ++l) { -/// // receive buffer from neighbor -/// Real_ptr buffer = buffers[l]; -/// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; -/// // unpack part of each variable -/// for (Index_type v = 0; v < num_vars; ++v) { -/// Real_ptr var = vars[v]; -/// for (Index_type i = 0; i < len; i++) { -/// var[list[i]] = buffer[i]; -/// } -/// buffer += len; -/// } -/// } +/// HALO_base provides a common starting point for the other HALO_ classes. 
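// The pack/unpack pattern that HALO_base's subclasses time reduces to a
// gather/scatter through per-neighbor index lists, as the reference
// implementation comments earlier in this patch show. A minimal standalone
// sketch of that pattern follows; the type names are illustrative stand-ins
// for the suite's Real_ptr/Int_ptr/Index_type, not the actual kernel code:

using Index_type = long;
using Real_type  = double;

// Gather the halo entries of one variable into a contiguous message buffer.
inline void pack_halo(Real_type* buffer, const Index_type* list,
                      const Real_type* var, Index_type len)
{
  for (Index_type i = 0; i < len; ++i) {
    buffer[i] = var[list[i]];   // same operation as HALO_PACK_BODY
  }
}

// Scatter a received message buffer back into the variable's halo entries.
inline void unpack_halo(Real_type* var, const Index_type* list,
                        const Real_type* buffer, Index_type len)
{
  for (Index_type i = 0; i < len; ++i) {
    var[list[i]] = buffer[i];   // same operation as HALO_UNPACK_BODY
  }
}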
/// #ifndef RAJAPerf_Comm_HALO_BASE_HPP diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 8bb996d17..3b02271b8 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -24,9 +24,9 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/INDEXLIST_3LOOP.hpp" #include "algorithm/SORT.hpp" -#include "comm/HALOPACKING_FUSED.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include "comm/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALO_EXCHANGE_FUSED.hpp" #endif #include @@ -697,7 +697,7 @@ void Executor::runWarmupKernels() kernel_ids.insert(Basic_INDEXLIST_3LOOP); break; case Workgroup: - kernel_ids.insert(Comm_HALOPACKING_FUSED); break; + kernel_ids.insert(Comm_HALO_PACKING_FUSED); break; case Reduction: kernel_ids.insert(Basic_REDUCE3_INT); break; @@ -710,7 +710,7 @@ void Executor::runWarmupKernels() #ifdef RAJA_PERFSUITE_ENABLE_MPI case MPI: - kernel_ids.insert(Comm_HALOEXCHANGE_FUSED); break; + kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break; #endif default: diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index debeb21ca..71f577dd9 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -108,12 +108,12 @@ // // Comm kernels... // -#include "comm/HALOPACKING.hpp" -#include "comm/HALOPACKING_FUSED.hpp" +#include "comm/HALO_PACKING.hpp" +#include "comm/HALO_PACKING_FUSED.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) -#include "comm/HALOSENDRECV.hpp" -#include "comm/HALOEXCHANGE.hpp" -#include "comm/HALOEXCHANGE_FUSED.hpp" +#include "comm/HALO_SENDRECV.hpp" +#include "comm/HALO_EXCHANGE.hpp" +#include "comm/HALO_EXCHANGE_FUSED.hpp" #endif @@ -258,12 +258,12 @@ static const std::string KernelNames [] = // // Comm kernels... // - std::string("Comm_HALOPACKING"), - std::string("Comm_HALOPACKING_FUSED"), + std::string("Comm_HALO_PACKING"), + std::string("Comm_HALO_PACKING_FUSED"), #if defined(RAJA_PERFSUITE_ENABLE_MPI) - std::string("Comm_HALOSENDRECV"), - std::string("Comm_HALOEXCHANGE"), - std::string("Comm_HALOEXCHANGE_FUSED"), + std::string("Comm_HALO_SENDRECV"), + std::string("Comm_HALO_EXCHANGE"), + std::string("Comm_HALO_EXCHANGE_FUSED"), #endif std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -990,25 +990,25 @@ KernelBase* getKernelObject(KernelID kid, // // Comm kernels... // - case Comm_HALOPACKING : { - kernel = new comm::HALOPACKING(run_params); + case Comm_HALO_PACKING : { + kernel = new comm::HALO_PACKING(run_params); break; } - case Comm_HALOPACKING_FUSED : { - kernel = new comm::HALOPACKING_FUSED(run_params); + case Comm_HALO_PACKING_FUSED : { + kernel = new comm::HALO_PACKING_FUSED(run_params); break; } #if defined(RAJA_PERFSUITE_ENABLE_MPI) - case Comm_HALOSENDRECV : { - kernel = new comm::HALOSENDRECV(run_params); + case Comm_HALO_SENDRECV : { + kernel = new comm::HALO_SENDRECV(run_params); break; } - case Comm_HALOEXCHANGE : { - kernel = new comm::HALOEXCHANGE(run_params); + case Comm_HALO_EXCHANGE : { + kernel = new comm::HALO_EXCHANGE(run_params); break; } - case Comm_HALOEXCHANGE_FUSED : { - kernel = new comm::HALOEXCHANGE_FUSED(run_params); + case Comm_HALO_EXCHANGE_FUSED : { + kernel = new comm::HALO_EXCHANGE_FUSED(run_params); break; } #endif diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 7db7e8868..bf667fa66 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -168,12 +168,12 @@ enum KernelID { // // Comm kernels... 
 //
-  Comm_HALOPACKING,
-  Comm_HALOPACKING_FUSED,
+  Comm_HALO_PACKING,
+  Comm_HALO_PACKING_FUSED,
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
-  Comm_HALOSENDRECV,
-  Comm_HALOEXCHANGE,
-  Comm_HALOEXCHANGE_FUSED,
+  Comm_HALO_SENDRECV,
+  Comm_HALO_EXCHANGE,
+  Comm_HALO_EXCHANGE_FUSED,
 #endif

   NumKernels // Keep this one last and NEVER comment out (!!)

 };

diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp
index 3b11e0326..f5992f3a9 100644
--- a/test/test-raja-perf-suite.cpp
+++ b/test/test-raja-perf-suite.cpp
@@ -63,7 +63,7 @@ TEST(ShortSuiteTest, Basic)
       (HIP_VERSION_MAJOR < 5 || \
       (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 1))
   sargv.emplace_back(std::string("--exclude-kernels"));
-  sargv.emplace_back(std::string("HALOPACKING_FUSED"));
+  sargv.emplace_back(std::string("HALO_PACKING_FUSED"));
 #endif

 #if (defined(RAJA_COMPILER_CLANG) && __clang_major__ == 11)

From 1ea22f6ac28ffba807143c033059d2773c73e112 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Tue, 28 Nov 2023 08:39:35 -0800
Subject: [PATCH 149/454] Update src/algorithm/REDUCE_SUM-Cuda.cpp

Co-authored-by: Rich Hornung
---
 src/algorithm/REDUCE_SUM-Cuda.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp
index fbd3c41d5..71097e72b 100644
--- a/src/algorithm/REDUCE_SUM-Cuda.cpp
+++ b/src/algorithm/REDUCE_SUM-Cuda.cpp
@@ -72,7 +72,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)
   DataSpace rds = getReductionDataSpace(vid);
   DataSpace hrds = hostAccessibleDataSpace(rds);
-  const bool separate_buffers = hrds != rds;
+  const bool separate_buffers = (hrds != rds);

   Real_ptr dsum;
   allocData(rds, dsum, 1);

From d6ccc6162e739b8488098bc0e541ada5d7d10d91 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 28 Nov 2023 12:18:28 -0800
Subject: [PATCH 150/454] Updated build scripts to be consistent with RAJA.
 Also, updated RAJA TPL so host-configs match.
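For reference, the refreshed lc-builds scripts all follow the same
configure-then-build flow. A hypothetical session with the new ASan script,
using the argument values and module/run commands taken from the script's own
usage and closing messages (the executable path at the end is an assumption,
not something this patch pins down):

    # Configure; this creates build_lc_toss4-amdclang-5.7.0-gfx90a-asan
    ./scripts/lc-builds/toss4_amdclang_asan.sh 5.7.0 gfx90a
    cd build_lc_toss4-amdclang-5.7.0-gfx90a-asan

    # Keep the build environment consistent with the configure step
    module load rocm/5.7.0 rocmcc/5.7.0
    srun -n1 make

    # Run with the ASan settings the script recommends
    ASAN_OPTIONS=print_suppressions=0:detect_leaks=0 HSA_XNACK=1 ./bin/raja-perf.exe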
---
 scripts/lc-builds/blueos_clang_omptarget.sh   |   2 +-
 scripts/lc-builds/corona_sycl.sh              |  67 ++++++++++
 scripts/lc-builds/toss3_pgi.sh                |  48 --------
 scripts/lc-builds/toss4_amdclang.sh           |  37 +++---
 scripts/lc-builds/toss4_amdclang_asan.sh      | 107 ++++++++++++++++++
 scripts/lc-builds/toss4_cce_hip.sh            |  77 +++++++++++++
 .../{toss3_clang.sh => toss4_clang.sh}        |  15 +--
 scripts/lc-builds/toss4_clang_caliper.sh      |   2 +-
 .../lc-builds/toss4_cray-mpich_amdclang.sh    |   2 +-
 .../lc-builds/{toss3_gcc.sh => toss4_gcc.sh}  |  15 +--
 scripts/lc-builds/toss4_gcc_caliper.sh        |   2 +-
 .../{toss3_hipcc.sh => toss4_hipcc.sh}        |  50 +++++---
 ..._mvapich2_gcc.sh => toss4_icpc-classic.sh} |  34 +++---
 .../{toss3_icpc.sh => toss4_icpc.sh}          |  30 ++---
 scripts/lc-builds/toss4_icpx.sh               |  51 +++++++++
 tpl/RAJA                                      |   2 +-
 16 files changed, 408 insertions(+), 133 deletions(-)
 create mode 100755 scripts/lc-builds/corona_sycl.sh
 delete mode 100755 scripts/lc-builds/toss3_pgi.sh
 create mode 100755 scripts/lc-builds/toss4_amdclang_asan.sh
 create mode 100755 scripts/lc-builds/toss4_cce_hip.sh
 rename scripts/lc-builds/{toss3_clang.sh => toss4_clang.sh} (69%)
 rename scripts/lc-builds/{toss3_gcc.sh => toss4_gcc.sh} (69%)
 rename scripts/lc-builds/{toss3_hipcc.sh => toss4_hipcc.sh} (52%)
 rename scripts/lc-builds/{toss3_mvapich2_gcc.sh => toss4_icpc-classic.sh} (57%)
 rename scripts/lc-builds/{toss3_icpc.sh => toss4_icpc.sh} (69%)
 create mode 100755 scripts/lc-builds/toss4_icpx.sh

diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh
index 76c08af4c..3a7d1f429 100755
--- a/scripts/lc-builds/blueos_clang_omptarget.sh
+++ b/scripts/lc-builds/blueos_clang_omptarget.sh
@@ -11,7 +11,7 @@ if [[ $# -lt 1 ]]; then
   echo
   echo "You must pass a compiler version number to script. For example,"
   echo "    blueos_clang_omptarget.sh 10.0.1-gcc-8.3.1"
-  echo " - or - "
+  echo " - or -"
   echo "    blueos_clang_omptarget.sh ibm-10.0.1-gcc-8.3.1"
   exit
 fi

diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh
new file mode 100755
index 000000000..1fe68341c
--- /dev/null
+++ b/scripts/lc-builds/corona_sycl.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 1 ]]; then
+  echo
+  echo "You must pass 1 argument to the script (in this order): "
+  echo "   1) SYCL compiler installation path"
+  echo
+  echo "For example: "
+  echo "    corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_hip_gcc10.2.1_rocm5.1.0/install"
+  exit
+fi
+
+SYCL_PATH=$1
+shift 1
+
+BUILD_SUFFIX=corona-sycl
+: ${BUILD_TYPE:=RelWithDebInfo}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/corona_sycl.cmake
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX}_${USER} >/dev/null
+mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER}
+
+DATE=$(printf '%(%Y-%m-%d)T\n' -1)
+
+export PATH=${SYCL_PATH}/bin:$PATH
+
+## NOTE: RAJA tests are turned off due to compilation issues.
+ +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DSYCL_LIB_PATH:STRING="${SYCL_PATH}/lib" \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_OPENMP=Off \ + -DENABLE_CUDA=Off \ + -DRAJA_ENABLE_TARGET_OPENMP=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DRAJA_ENABLE_SYCL=On \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_LINKER=clang++ \ + -DCMAKE_CXX_STANDARD=17 \ + -DENABLE_TESTS=Off \ + -DENABLE_EXAMPLES=On \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "Remember to export PATH=${SYCL_PATH}/bin:\$PATH to obtain the correct compiler paths." +echo +echo "cd into directory build_${BUILD_SUFFIX}_${USER} and run make to build RAJA" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_pgi.sh b/scripts/lc-builds/toss3_pgi.sh deleted file mode 100755 index 5207ae816..000000000 --- a/scripts/lc-builds/toss3_pgi.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -############################################################################### -# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPERF/COPYRIGHT file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if [[ $# -lt 1 ]]; then - echo - echo "You must pass a compiler version number to script. For example," - echo " toss3_pgi.sh 20.1" - exit -fi - -COMP_VER=$1 -shift 1 - -BUILD_SUFFIX=lc_toss3-pgi-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/pgi_X.cmake - -echo -echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" -echo "Configuration extra arguments:" -echo " $@" -echo - -rm -rf build_${BUILD_SUFFIX} 2>/dev/null -mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} - -module load cmake/3.20.2 - -cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgc++ \ - -DCMAKE_C_COMPILER=/usr/tce/packages/pgi/pgi-${COMP_VER}/bin/pgcc \ - -DBLT_CXX_STD=c++14 \ - -C ${RAJA_HOSTCONFIG} \ - -DENABLE_OPENMP=On \ - -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ - "$@" \ - .. - -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 7d2de5397..1d827e415 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) @@ -15,7 +15,7 @@ if [[ $# -lt 2 ]]; then echo " 3...) 
optional arguments to cmake" echo echo "For example: " - echo " toss4_amdclang.sh 5.1.0 gfx906" + echo " toss4_amdclang.sh 5.7.0 gfx906" exit fi @@ -44,6 +44,12 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -53,23 +59,28 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +# module unload rocm +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ - -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ - -DGPU_TARGETS="${COMP_ARCH}" \ - -DAMDGPU_TARGETS="${COMP_ARCH}" \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}:xnack+" \ + -DGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DAMDGPU_TARGETS="${COMP_ARCH}:xnack+" \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ + -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -78,7 +89,7 @@ cmake \ echo echo "***********************************************************************" echo -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" echo echo " Please note that you have to have a consistent build environment" echo " when you make RAJA as cmake may reconfigure; unload the rocm module" diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh new file mode 100755 index 000000000..28efdbff0 --- /dev/null +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 2 ]]; then + echo + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP compute architecture" + echo " 3...) 
optional arguments to cmake" + echo + echo "For example: " + echo " toss4_amdclang_asan.sh 5.7.0 gfx90a" + exit +fi + +COMP_VER=$1 +COMP_ARCH=$2 +shift 2 + +HOSTCONFIG="hip_3_X" + +if [[ ${COMP_VER} == 4.* ]] +then +##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1" + HOSTCONFIG="hip_4_link_X" +elif [[ ${COMP_VER} == 3.* ]] +then + HOSTCONFIG="hip_3_X" +else + echo "Unknown hip version, using ${HOSTCONFIG} host-config" +fi + +BUILD_SUFFIX=lc_toss4-amdclang-${COMP_VER}-${COMP_ARCH}-asan +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.23.1 + +# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER +# are inconsistent causing the rocprim from the module to be used unexpectedly +# module unload rocm + +if [[ ${COMP_VER} =~ .*magic.* ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" +else + ROCM_PATH="/usr/tce/packages/rocmcc-tce/rocmcc-${COMP_VER}" +fi + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH=${ROCM_PATH}/llvm/bin \ + -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ + -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}:xnack+" \ + -DGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DAMDGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DCMAKE_C_FLAGS="-fsanitize=address -shared-libsan" \ + -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJAPerf" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " rocm and rocmcc modules (${COMP_VER}) when building." +echo +echo " module load rocm/COMP_VER rocmcc/COMP_VER" +echo " srun -n1 make" +echo +echo " Run with these environment options when using asan" +echo " ASAN_OPTIONS=print_suppressions=0:detect_leaks=0" +echo " HSA_XNACK=1" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh new file mode 100755 index 000000000..030d65163 --- /dev/null +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 3 ]]; then + echo + echo "You must pass 3 or more arguments to the script (in this order): " + echo " 1) compiler version number" + echo " 2) HIP version" + echo " 3) HIP compute architecture" + echo " 4...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_cce_hip.sh 14.0.3 5.2.3 gfx90a" + exit +fi + +COMP_VER=$1 +HIP_VER=$2 +HIP_ARCH=$3 +shift 3 + +HOSTCONFIG="hip_3_X" + +BUILD_SUFFIX=lc_toss4-cce-${COMP_VER}-hip-${HIP_VER}-${HIP_ARCH} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + + +module load cmake/3.24.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/craycc" \ + -DCMAKE_CXX_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/crayCC" \ + -DHIP_PATH=/opt/rocm-${HIP_VER}/hip \ + -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ + -DGPU_TARGETS=${HIP_ARCH} \ + -DAMDGPU_TARGETS=${HIP_ARCH} \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_HIP=ON \ + -DENABLE_OPENMP=ON \ + -DENABLE_CUDA=OFF \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; load the appropriate" +echo " cce module (${COMP_VER}) when building." +echo +echo " module load cce-tce/${COMP_VER}" +echo " srun -n1 make" +echo +echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_clang.sh b/scripts/lc-builds/toss4_clang.sh similarity index 69% rename from scripts/lc-builds/toss3_clang.sh rename to scripts/lc-builds/toss4_clang.sh index 75fc28c67..185795ce8 100755 --- a/scripts/lc-builds/toss3_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -7,18 +7,18 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -lt 1 ]]; then +if [ "$1" == "" ]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_clang.sh 10.0.1" + echo " toss4_clang.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-clang-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake +BUILD_SUFFIX=lc_toss4-clang-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
- -echo -echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" -echo "***********************************************************************" diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index dcfcdb101..6adf38f54 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -36,7 +36,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index afd60389f..4e83671f0 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss3_gcc.sh b/scripts/lc-builds/toss4_gcc.sh similarity index 69% rename from scripts/lc-builds/toss3_gcc.sh rename to scripts/lc-builds/toss4_gcc.sh index cbc127945..a7e5d66c1 100755 --- a/scripts/lc-builds/toss3_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -7,18 +7,18 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -lt 1 ]]; then +if [ "$1" == "" ]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_gcc.sh 8.3.1" + echo " toss4_gcc.sh 10.3.1" exit fi COMP_VER=$1 shift 1 -BUILD_SUFFIX=lc_toss3-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-gcc-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -29,7 +29,7 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ @@ -40,8 +40,3 @@ cmake \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ .. 
-
-echo
-echo "***********************************************************************"
-echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
-echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh
index 3499d6bfa..65d680fee 100755
--- a/scripts/lc-builds/toss4_gcc_caliper.sh
+++ b/scripts/lc-builds/toss4_gcc_caliper.sh
@@ -36,7 +36,7 @@ echo
 rm -rf build_${BUILD_SUFFIX} 2>/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}

-module load cmake/3.21.1
+module load cmake/3.23.1

 cmake \
   -DCMAKE_BUILD_TYPE=Release \
diff --git a/scripts/lc-builds/toss3_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh
similarity index 52%
rename from scripts/lc-builds/toss3_hipcc.sh
rename to scripts/lc-builds/toss4_hipcc.sh
index 9877ee99a..e95f7cf9e 100755
--- a/scripts/lc-builds/toss3_hipcc.sh
+++ b/scripts/lc-builds/toss4_hipcc.sh
@@ -9,12 +9,13 @@

 if [[ $# -lt 2 ]]; then
   echo
-  echo "You must pass 2 arguments to the script (in this order): "
+  echo "You must pass 2 or more arguments to the script (in this order): "
   echo "   1) compiler version number"
   echo "   2) HIP compute architecture"
+  echo "   3...) optional arguments to cmake"
   echo
   echo "For example: "
-  echo "    toss3_hipcc.sh 5.1.0 gfx906"
+  echo "    toss4_hipcc.sh 4.1.0 gfx906"
   exit
 fi

@@ -22,15 +23,11 @@ COMP_VER=$1
 COMP_ARCH=$2
 shift 2

-HIP_CLANG_FLAGS="--offload-arch=${COMP_ARCH}"
 HOSTCONFIG="hip_3_X"

-if [[ ${COMP_VER} == 4.5.* ]]
-then
-  HIP_CLANG_FLAGS="${HIP_CLANG_FLAGS} -mllvm -amdgpu-fixed-function-abi=1"
-  HOSTCONFIG="hip_4_5_link_X"
-elif [[ ${COMP_VER} == 4.* ]]
+if [[ ${COMP_VER} == 4.* ]]
 then
+##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1"
   HOSTCONFIG="hip_4_link_X"
 elif [[ ${COMP_VER} == 3.* ]]
 then
@@ -39,14 +36,17 @@ else
   echo "Unknown hip version, using ${HOSTCONFIG} host-config"
 fi

-BUILD_SUFFIX=lc_toss3-hipcc-${COMP_VER}-${COMP_ARCH}
-RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/hip_link_X.cmake
+BUILD_SUFFIX=lc_toss4-hipcc-${COMP_VER}-${COMP_ARCH}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake

 echo
 echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
 echo "Configuration extra arguments:"
 echo "   $@"
 echo
+echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2"
+echo "   -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\""
+echo

 rm -rf build_${BUILD_SUFFIX} >/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}

@@ -54,18 +54,25 @@ mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}

 module load cmake/3.23.1

+# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER
+# are inconsistent causing the rocprim from the module to be used unexpectedly
+module unload rocm
+
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \
   -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \
-  -DHIP_CLANG_PATH=/opt/rocm-${COMP_VER}/llvm/bin \
-  -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang \
-  -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/clang++ \
-  -DHIP_CLANG_FLAGS="${HIP_CLANG_FLAGS}" \
-  -DBLT_CXX_STD=c++14 \
+  -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \
+  -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \
+  -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \
+  -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \
+  -DGPU_TARGETS="${COMP_ARCH}" \
+  -DAMDGPU_TARGETS="${COMP_ARCH}" \
+  -DBLT_CXX_STD=c++14 \
 -C
${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ - -DENABLE_OPENMP=OFF \ + -DENABLE_OPENMP=ON \ -DENABLE_CUDA=OFF \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -73,5 +80,14 @@ cmake \ echo echo "***********************************************************************" -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" +echo +echo " Please note that you have to have a consistent build environment" +echo " when you make RAJA as cmake may reconfigure; unload the rocm module" +echo " or load the appropriate rocm module (${COMP_VER}) when building." +echo +echo " module unload rocm" +echo " srun -n1 make" +echo echo "***********************************************************************" diff --git a/scripts/lc-builds/toss3_mvapich2_gcc.sh b/scripts/lc-builds/toss4_icpc-classic.sh similarity index 57% rename from scripts/lc-builds/toss3_mvapich2_gcc.sh rename to scripts/lc-builds/toss4_icpc-classic.sh index a66a216ca..842b7733e 100755 --- a/scripts/lc-builds/toss3_mvapich2_gcc.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -7,19 +7,18 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [[ $# -lt 2 ]]; then +if [ "$1" == "" ]; then echo echo "You must pass a compiler version number to script. For example," - echo " toss3_mvapich2_gcc.sh 2.3 10.2.1" + echo " toss4_icpc-classic.sh 19.1.2" exit fi -MPI_VER=$1 -COMP_VER=$2 -shift 2 +COMP_VER=$1 +shift 1 -BUILD_SUFFIX=lc_toss3-mvapich2-${MPI_VER}-gcc-${COMP_VER} -RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake +BUILD_SUFFIX=lc_toss4-icpc-classic-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc-classic_X.cmake echo echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" @@ -30,15 +29,20 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.20.2 +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-gcc-${COMP_VER}/bin/mpic++ \ - -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ - -DENABLE_MPI=On \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ @@ -47,11 +51,11 @@ cmake \ echo echo "***********************************************************************" echo -echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA" echo -echo " Please note that you have to run with mpi when you run" -echo " the RAJA Perf Suite; for example," +echo " Please note that you may need to add some intel openmp libraries to your" +echo " LD_LIBRARY_PATH to run with openmp." 
echo
-echo "    srun -n2 ./bin/raja-perf.exe"
+echo "    LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin"
 echo
 echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss4_icpc.sh
similarity index 69%
rename from scripts/lc-builds/toss3_icpc.sh
rename to scripts/lc-builds/toss4_icpc.sh
index f5a10cfda..510a49bdb 100755
--- a/scripts/lc-builds/toss3_icpc.sh
+++ b/scripts/lc-builds/toss4_icpc.sh
@@ -7,26 +7,18 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################

-if [[ $# -lt 1 ]]; then
+if [ "$1" == "" ]; then
   echo
   echo "You must pass a compiler version number to script. For example,"
-  echo "    toss3_icpc.sh 19.1.0"
+  echo "    toss4_icpc.sh 2022.3"
   exit
 fi

 COMP_VER=$1
 shift 1

-COMP_MAJOR_VER=${COMP_VER:0:2}
-GCC_HEADER_VER=7
-
-if [ ${COMP_MAJOR_VER} -gt 18 ]
-then
-  GCC_HEADER_VER=8
-fi
-
-BUILD_SUFFIX=lc_toss3-icpc-${COMP_VER}
-RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/icpc_X_gcc${GCC_HEADER_VER}headers.cmake
+BUILD_SUFFIX=lc_toss4-icpc-${COMP_VER}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpc_X.cmake

 echo
 echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
@@ -37,10 +29,10 @@ echo
 rm -rf build_${BUILD_SUFFIX} 2>/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}

-module load cmake/3.20.2
+module load cmake/3.23.1

 ##
-# CMake option -DENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile
+# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile
 # times at a potential cost of slower 'forall' execution.
 ##

@@ -50,6 +42,7 @@ cmake \
   -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \
   -DBLT_CXX_STD=c++14 \
   -C ${RAJA_HOSTCONFIG} \
+  -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \
   -DENABLE_OPENMP=On \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
@@ -57,5 +50,12 @@ cmake \

 echo
 echo "***********************************************************************"
-echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  Please note that you may need to add some intel openmp libraries to your"
+echo "  LD_LIBRARY_PATH to run with openmp."
+echo
+echo "  LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin"
+echo
 echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh
new file mode 100755
index 000000000..f7e20aa1d
--- /dev/null
+++ b/scripts/lc-builds/toss4_icpx.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script.
For example," + echo " toss4_icpx.sh 2022.1.0" + exit +fi + +COMP_VER=$1 +shift 1 + +BUILD_SUFFIX=lc_toss4-icpx-${COMP_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/icpx_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ + -DENABLE_OPENMP=On \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. diff --git a/tpl/RAJA b/tpl/RAJA index ac4d5e5cd..e00f05675 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit ac4d5e5cd00b18cd2b827055b25a904532ba25c0 +Subproject commit e00f05675b7e633c8bfdde583e25efd3a50bf267 From 494699574fc5ac83bea449e3e0aae690c1bb83c1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Nov 2023 12:45:14 -0800 Subject: [PATCH 151/454] Use newer intel compiler --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9d7f6b197..4665aaff8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -111,7 +111,7 @@ RUN . /opt/spack/share/spack/setup-env.sh && \ ## make -j 6 && \ ## cd .. && rm -rf build -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-23.04:intel-2023.2.1 AS sycl ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build From f3d4bfd0c330afaaeee6d9d1e836c3f73fbac028 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Nov 2023 12:57:22 -0800 Subject: [PATCH 152/454] Match recent changes on sycl pr branch --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4665aaff8..0da428cc7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,7 +116,7 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ ./bin/raja-perf.exe --checkrun 5 -sp" && \ cd .. 
&& rm -rf build From 96c72a26edd73c4e10fb6820c1d9c4c12fe21ac8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Nov 2023 14:09:40 -0800 Subject: [PATCH 153/454] Fix cmake option --- scripts/lc-builds/corona_sycl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 1fe68341c..6d34ae5b6 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -47,7 +47,7 @@ cmake \ -DENABLE_CUDA=Off \ -DRAJA_ENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DRAJA_ENABLE_SYCL=On \ + -DENABLE_SYCL=On \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_LINKER=clang++ \ From 8a93f918b634628d8c75d52f0bab7f380a39d322 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Nov 2023 14:12:16 -0800 Subject: [PATCH 154/454] Remove xnack stuff. Not needed here. --- scripts/lc-builds/toss4_amdclang.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 1d827e415..c8ac1dbe5 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -74,9 +74,9 @@ cmake \ -DHIP_PATH=${ROCM_PATH}/llvm/bin \ -DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/amdclang \ -DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/amdclang++ \ - -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}:xnack+" \ - -DGPU_TARGETS="${COMP_ARCH}:xnack+" \ - -DAMDGPU_TARGETS="${COMP_ARCH}:xnack+" \ + -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ + -DGPU_TARGETS="${COMP_ARCH}" \ + -DAMDGPU_TARGETS="${COMP_ARCH}" \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ From 7beeb7e072d836b337d16f980a3019414aa7b469 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 28 Nov 2023 14:51:06 -0800 Subject: [PATCH 155/454] Make script argument conditional consistent --- scripts/lc-builds/toss4_clang.sh | 2 +- scripts/lc-builds/toss4_gcc.sh | 2 +- scripts/lc-builds/toss4_icpc-classic.sh | 2 +- scripts/lc-builds/toss4_icpc.sh | 2 +- scripts/lc-builds/toss4_icpx.sh | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index 185795ce8..fd951b04a 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss4_clang.sh 10.3.1" diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index a7e5d66c1..eac77b71e 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. 
For example," echo " toss4_gcc.sh 10.3.1" diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index 842b7733e..3cc8b8ce1 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss4_icpc-classic.sh 19.1.2" diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 510a49bdb..a70b02015 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss4_icpc.sh 2022.3" diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index f7e20aa1d..ff13bde99 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -7,7 +7,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 1 ]]; then echo echo "You must pass a compiler version number to script. For example," echo " toss4_icpx.sh 2022.1.0" From 89940562b27ec70c81a63095dd1ec0083b0274e5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 29 Nov 2023 10:36:21 -0800 Subject: [PATCH 156/454] Improve readability from Rich's suggestion --- src/algorithm/REDUCE_SUM-Cuda.cpp | 4 ++-- src/algorithm/REDUCE_SUM-Hip.cpp | 6 +++--- src/basic/PI_ATOMIC-Cuda.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/basic/PI_REDUCE-Cuda.cpp | 4 ++-- src/basic/PI_REDUCE-Hip.cpp | 4 ++-- src/basic/REDUCE3_INT-Cuda.cpp | 4 ++-- src/basic/REDUCE3_INT-Hip.cpp | 4 ++-- src/basic/REDUCE_STRUCT-Cuda.cpp | 4 ++-- src/basic/REDUCE_STRUCT-Hip.cpp | 4 ++-- src/basic/TRAP_INT-Cuda.cpp | 4 ++-- src/basic/TRAP_INT-Hip.cpp | 4 ++-- src/stream/DOT-Cuda.cpp | 4 ++-- src/stream/DOT-Hip.cpp | 4 ++-- 14 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 71097e72b..82722646d 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -153,7 +153,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dsum; allocData(rds, dsum, 1); @@ -236,7 +236,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dsum; allocData(rds, dsum, 1); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 521cb79fa..3d28e0787 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -77,7 +77,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); 
Real_ptr dsum; allocData(rds, dsum, 1); @@ -180,7 +180,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dsum; allocData(rds, dsum, 1); @@ -262,7 +262,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dsum; allocData(rds, dsum, 1); diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 1ecdae2cd..6bd2bfc7d 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -49,7 +49,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr hpi = pi; if (separate_buffers) { diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index fa3d705b7..32911ee82 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -49,7 +49,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr hpi = pi; if (separate_buffers) { diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index fdda43dec..4cbc5649c 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -69,7 +69,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dpi; allocData(rds, dpi, 1); @@ -149,7 +149,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dpi; allocData(rds, dpi, 1); diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 5c2078df3..828a83791 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -69,7 +69,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dpi; allocData(rds, dpi, 1); @@ -148,7 +148,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dpi; allocData(rds, dpi, 1); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 44accd32d..97b02550a 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -82,7 +82,7 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Int_ptr vmem; allocData(rds, vmem, 3); 
@@ -174,7 +174,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Int_ptr vmem; allocData(rds, vmem, 3); diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 361f947a4..03f0f4696 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -82,7 +82,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Int_ptr vmem; allocData(rds, vmem, 3); @@ -174,7 +174,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Int_ptr vmem; allocData(rds, vmem, 3); diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 1959104c2..55aa2f8f5 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -111,7 +111,7 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax allocData(rds, mem, 6); @@ -221,7 +221,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax allocData(rds, mem, 6); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index f29dcada9..0a7c36add 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -112,7 +112,7 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax allocData(rds, mem, 6); @@ -223,7 +223,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr mem; //xcenter,xmin,xmax,ycenter,ymin,ymax allocData(rds, mem, 6); diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index b03a5955f..39a0cc254 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -88,7 +88,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr sumx; allocData(rds, sumx, 1); @@ -170,7 +170,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr sumx; allocData(rds, sumx, 1); diff --git 
a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index a04374a41..6cf272671 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -88,7 +88,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr sumx; allocData(rds, sumx, 1); @@ -169,7 +169,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr sumx; allocData(rds, sumx, 1); diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 805ddba90..cba0df85f 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -67,7 +67,7 @@ void DOT::runCudaVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dprod; allocData(rds, dprod, 1); @@ -145,7 +145,7 @@ void DOT::runCudaVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dprod; allocData(rds, dprod, 1); diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index f5fa41083..1d1517fbb 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -67,7 +67,7 @@ void DOT::runHipVariantBlock(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dprod; allocData(rds, dprod, 1); @@ -146,7 +146,7 @@ void DOT::runHipVariantOccGS(VariantID vid) DataSpace rds = getReductionDataSpace(vid); DataSpace hrds = hostAccessibleDataSpace(rds); - const bool separate_buffers = hrds != rds; + const bool separate_buffers = (hrds != rds); Real_ptr dprod; allocData(rds, dprod, 1); From 380c825ba39be3832843645db51bf83909de4649 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 29 Nov 2023 12:37:52 -0800 Subject: [PATCH 157/454] Fix bug in info request command line options --- src/common/RunParams.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b988cbded..d49ee70e9 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -763,9 +763,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) processTuningInput(); - if ( input_state != BadInput && + if ( input_state != InfoRequest && + input_state != BadInput && input_state != DryRun && - input_state != CheckRun ) { + input_state != CheckRun) { input_state = PerfRun; } @@ -1045,15 +1046,15 @@ void RunParams::printVariantNames(std::ostream& str) const void RunParams::printDataSpaceNames(std::ostream& str) const { str << "\nAvailable data spaces:"; - str << "\n-------------------\n"; + str << "\n----------------------\n"; for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast(ids); if (isDataSpaceAvailable(ds)) { str << getDataSpaceName(ds) << std::endl; } } - str << "\nUnavailable data spaces:"; - str << "\n-------------------\n"; + str << "\nUnavailable data spaces in current build configuration:"; + 
str << "\n-------------------------------------------------------\n"; for (int ids = 0; ids < static_cast(DataSpace::NumSpaces); ++ids) { DataSpace ds = static_cast(ids); if (!isDataSpaceAvailable(ds)) { From 3b6f08c6dda6183bee7f22d5557d206b4d4bef46 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 29 Nov 2023 12:43:47 -0800 Subject: [PATCH 158/454] Do not run if any info request is a command line option --- src/common/RunParams.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index d49ee70e9..47c520e7f 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -747,6 +747,10 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) } + if (input_state == InfoRequest) { + break; + } + } // Default size and size_meaning if unset From e571a01cc734b1a703dc33f17fb30cc6b682e69b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 Nov 2023 09:45:14 -0800 Subject: [PATCH 159/454] Make reduction variable naming more consistent --- src/algorithm/REDUCE_SUM-Cuda.cpp | 50 ++++++++++++++-------------- src/algorithm/REDUCE_SUM-Hip.cpp | 54 +++++++++++++++---------------- src/basic/PI_REDUCE-Cuda.cpp | 36 ++++++++++----------- src/basic/PI_REDUCE-Hip.cpp | 36 ++++++++++----------- src/stream/DOT-Cuda.cpp | 28 ++++++++-------- src/stream/DOT-Hip.cpp | 28 ++++++++-------- 6 files changed, 116 insertions(+), 116 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 82722646d..52b465613 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -28,7 +28,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { extern __shared__ Real_type psum[ ]; @@ -49,7 +49,7 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, } if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); + RAJA::atomicAdd( sum, psum[ 0 ] ); } } @@ -74,9 +74,9 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -87,7 +87,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, len, ::cub::Sum(), m_sum_init, @@ -106,14 +106,14 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) cudaErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, len, ::cub::Sum(), m_sum_init, stream)); if (separate_buffers) { - cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type), cudaMemcpyDeviceToHost, stream ) ); } @@ -125,7 +125,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } @@ -155,9 +155,9 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + 
Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -167,22 +167,22 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) if (separate_buffers) { *hsum = m_sum_init; - cudaErrchk( cudaMemcpyAsync( dsum, hsum, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type), cudaMemcpyHostToDevice, res.get_stream() ) ); } else { - *dsum = m_sum_init; + *sum = m_sum_init; } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; reduce_sum<<>>( x, - dsum, m_sum_init, + sum, m_sum_init, iend ); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { - cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type), cudaMemcpyDeviceToHost, res.get_stream() ) ); } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -191,7 +191,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } stopTimer(); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } @@ -238,9 +238,9 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -254,22 +254,22 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) if (separate_buffers) { *hsum = m_sum_init; - cudaErrchk( cudaMemcpyAsync( dsum, hsum, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type), cudaMemcpyHostToDevice, res.get_stream() ) ); } else { - *dsum = m_sum_init; + *sum = m_sum_init; } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); reduce_sum<<>>( x, - dsum, m_sum_init, + sum, m_sum_init, iend ); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { - cudaErrchk( cudaMemcpyAsync( hsum, dsum, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type), cudaMemcpyDeviceToHost, res.get_stream() ) ); } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -278,7 +278,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid) } stopTimer(); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 3d28e0787..36db61c3b 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -33,7 +33,7 @@ namespace algorithm template < size_t block_size > __launch_bounds__(block_size) -__global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, +__global__ void reduce_sum(Real_ptr x, Real_ptr sum, Real_type sum_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, psum); @@ -54,7 +54,7 @@ __global__ void reduce_sum(Real_ptr x, Real_ptr dsum, Real_type sum_init, } if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dsum, psum[ 0 ] ); + RAJA::atomicAdd( sum, psum[ 0 ] ); } } @@ -79,9 +79,9 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -93,7 +93,7 
+93,7
@@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, m_sum_init, len, rocprim::plus(), @@ -102,7 +102,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, len, ::cub::Sum(), m_sum_init, @@ -123,7 +123,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::rocprim::reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, m_sum_init, len, rocprim::plus(), @@ -132,7 +132,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) hipErrchk(::cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, x+ibegin, - dsum, + sum, len, ::cub::Sum(), m_sum_init, @@ -140,7 +140,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) #endif if (separate_buffers) { - hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type), hipMemcpyDeviceToHost, stream ) ); } @@ -152,7 +152,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } @@ -182,9 +182,9 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -194,21 +194,21 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) if (separate_buffers) { *hsum = m_sum_init; - hipErrchk( hipMemcpyAsync( dsum, hsum, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type), hipMemcpyHostToDevice, res.get_stream() ) ); } else { - *dsum = m_sum_init; + *sum = m_sum_init; } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, dsum, m_sum_init, iend ); + x, sum, m_sum_init, iend ); hipErrchk( hipGetLastError() ); if (separate_buffers) { - hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type), hipMemcpyDeviceToHost, res.get_stream() ) ); } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -217,7 +217,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } stopTimer(); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } @@ -264,9 +264,9 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dsum; - allocData(rds, dsum, 1); - Real_ptr hsum = dsum; + Real_ptr sum; + allocData(rds, sum, 1); + Real_ptr hsum = sum; if (separate_buffers) { allocData(hrds, hsum, 1); } @@ -280,21 +280,21 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid) if (separate_buffers) { *hsum = m_sum_init; - hipErrchk( hipMemcpyAsync( dsum, hsum, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type), hipMemcpyHostToDevice, res.get_stream() ) ); } else { - *dsum = m_sum_init; + *sum = m_sum_init; } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); 
hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, dsum, m_sum_init, iend ); + x, sum, m_sum_init, iend ); hipErrchk( hipGetLastError() ); if (separate_buffers) { - hipErrchk( hipMemcpyAsync( hsum, dsum, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type), hipMemcpyDeviceToHost, res.get_stream() ) ); } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -303,7 +303,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid) } stopTimer(); - deallocData(rds, dsum); + deallocData(rds, sum); if (separate_buffers) { deallocData(hrds, hsum); } diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 4cbc5649c..6be24f908 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -26,7 +26,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { extern __shared__ Real_type ppi[ ]; @@ -48,7 +48,7 @@ __global__ void pi_reduce(Real_type dx, } if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dpi, ppi[ 0 ] ); + RAJA::atomicAdd( pi, ppi[ 0 ] ); } } @@ -71,9 +71,9 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dpi; - allocData(rds, dpi, 1); - Real_ptr hpi = dpi; + Real_ptr pi; + allocData(rds, pi, 1); + Real_ptr hpi = pi; if (separate_buffers) { allocData(hrds, hpi, 1); } @@ -83,22 +83,22 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) if (separate_buffers) { *hpi = m_pi_init; - cudaErrchk( cudaMemcpyAsync( dpi, hpi, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type), cudaMemcpyHostToDevice, res.get_stream() ) ); } else { - *dpi = m_pi_init; + *pi = m_pi_init; } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; pi_reduce<<>>( dx, - dpi, m_pi_init, + pi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { - cudaErrchk( cudaMemcpyAsync( hpi, dpi, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type), cudaMemcpyDeviceToHost, res.get_stream() ) ); } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -107,7 +107,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) } stopTimer(); - deallocData(rds, dpi); + deallocData(rds, pi); if (separate_buffers) { deallocData(hrds, hpi); } @@ -151,9 +151,9 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dpi; - allocData(rds, dpi, 1); - Real_ptr hpi = dpi; + Real_ptr pi; + allocData(rds, pi, 1); + Real_ptr hpi = pi; if (separate_buffers) { allocData(hrds, hpi, 1); } @@ -167,22 +167,22 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) if (separate_buffers) { *hpi = m_pi_init; - cudaErrchk( cudaMemcpyAsync( dpi, hpi, sizeof(Real_type), + cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type), cudaMemcpyHostToDevice, res.get_stream() ) ); } else { - *dpi = m_pi_init; + *pi = m_pi_init; } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); pi_reduce<<>>( dx, - dpi, m_pi_init, + pi, m_pi_init, iend ); cudaErrchk( cudaGetLastError() ); if (separate_buffers) { - cudaErrchk( cudaMemcpyAsync( hpi, dpi, sizeof(Real_type), + cudaErrchk( 
cudaMemcpyAsync( hpi, pi, sizeof(Real_type), cudaMemcpyDeviceToHost, res.get_stream() ) ); } cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -191,7 +191,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid) } stopTimer(); - deallocData(rds, dpi); + deallocData(rds, pi); if (separate_buffers) { deallocData(hrds, hpi); } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 828a83791..a3dd33af5 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -26,7 +26,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) __global__ void pi_reduce(Real_type dx, - Real_ptr dpi, Real_type pi_init, + Real_ptr pi, Real_type pi_init, Index_type iend) { HIP_DYNAMIC_SHARED(Real_type, ppi); @@ -48,7 +48,7 @@ __global__ void pi_reduce(Real_type dx, } if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( dpi, ppi[ 0 ] ); + RAJA::atomicAdd( pi, ppi[ 0 ] ); } } @@ -71,9 +71,9 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dpi; - allocData(rds, dpi, 1); - Real_ptr hpi = dpi; + Real_ptr pi; + allocData(rds, pi, 1); + Real_ptr hpi = pi; if (separate_buffers) { allocData(hrds, hpi, 1); } @@ -83,21 +83,21 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) if (separate_buffers) { *hpi = m_pi_init; - hipErrchk( hipMemcpyAsync( dpi, hpi, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type), hipMemcpyHostToDevice, res.get_stream() ) ); } else { - *dpi = m_pi_init; + *pi = m_pi_init; } const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - dx, dpi, m_pi_init, iend ); + dx, pi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); if (separate_buffers) { - hipErrchk( hipMemcpyAsync( hpi, dpi, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type), hipMemcpyDeviceToHost, res.get_stream() ) ); } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -106,7 +106,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) } stopTimer(); - deallocData(rds, dpi); + deallocData(rds, pi); if (separate_buffers) { deallocData(hrds, hpi); } @@ -150,9 +150,9 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) DataSpace hrds = hostAccessibleDataSpace(rds); const bool separate_buffers = (hrds != rds); - Real_ptr dpi; - allocData(rds, dpi, 1); - Real_ptr hpi = dpi; + Real_ptr pi; + allocData(rds, pi, 1); + Real_ptr hpi = pi; if (separate_buffers) { allocData(hrds, hpi, 1); } @@ -166,21 +166,21 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid) if (separate_buffers) { *hpi = m_pi_init; - hipErrchk( hipMemcpyAsync( dpi, hpi, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type), hipMemcpyHostToDevice, res.get_stream() ) ); } else { - *dpi = m_pi_init; + *pi = m_pi_init; } const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - dx, dpi, m_pi_init, iend ); + dx, pi, m_pi_init, iend ); hipErrchk( hipGetLastError() ); if (separate_buffers) { - hipErrchk( hipMemcpyAsync( hpi, dpi, sizeof(Real_type), + hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type), hipMemcpyDeviceToHost, res.get_stream() ) ); } hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ 
-189,7 +189,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, dpi);
+    deallocData(rds, pi);
     if (separate_buffers) {
       deallocData(hrds, hpi);
     }

diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp
index cba0df85f..351911b34 100644
--- a/src/stream/DOT-Cuda.cpp
+++ b/src/stream/DOT-Cuda.cpp
@@ -71,17 +71,17 @@ void DOT::runCudaVariantBlock(VariantID vid)

     Real_ptr dprod;
     allocData(rds, dprod, 1);
-    Real_ptr hprod = dprod;
+    Real_ptr hdprod = dprod;
     if (separate_buffers) {
-      allocData(hrds, hprod, 1);
+      allocData(hrds, hdprod, 1);
     }

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

       if (separate_buffers) {
-        *hprod = m_dot_init;
-        cudaErrchk( cudaMemcpyAsync( dprod, hprod, sizeof(Real_type),
+        *hdprod = m_dot_init;
+        cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
       } else {
         *dprod = m_dot_init;
@@ -94,18 +94,18 @@ void DOT::runCudaVariantBlock(VariantID vid)
       cudaErrchk( cudaGetLastError() );

       if (separate_buffers) {
-        cudaErrchk( cudaMemcpyAsync( hprod, dprod, sizeof(Real_type),
+        cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
       cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_dot += *hprod;
+      m_dot += *hdprod;

     }
     stopTimer();

     deallocData(rds, dprod);
     if (separate_buffers) {
-      deallocData(hrds, hprod);
+      deallocData(hrds, hdprod);
     }

   } else if ( vid == RAJA_CUDA ) {
@@ -149,9 +149,9 @@ void DOT::runCudaVariantOccGS(VariantID vid)

     Real_ptr dprod;
     allocData(rds, dprod, 1);
-    Real_ptr hprod = dprod;
+    Real_ptr hdprod = dprod;
     if (separate_buffers) {
-      allocData(hrds, hprod, 1);
+      allocData(hrds, hdprod, 1);
     }

     constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -162,8 +162,8 @@
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

       if (separate_buffers) {
-        *hprod = m_dot_init;
-        cudaErrchk( cudaMemcpyAsync( dprod, hprod, sizeof(Real_type),
+        *hdprod = m_dot_init;
+        cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
       } else {
         *dprod = m_dot_init;
@@ -176,18 +176,18 @@
       cudaErrchk( cudaGetLastError() );

       if (separate_buffers) {
-        cudaErrchk( cudaMemcpyAsync( hprod, dprod, sizeof(Real_type),
+        cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
       cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_dot += *hprod;
+      m_dot += *hdprod;

     }
     stopTimer();

     deallocData(rds, dprod);
     if (separate_buffers) {
-      deallocData(hrds, hprod);
+      deallocData(hrds, hdprod);
     }

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp
index 1d1517fbb..379872940 100644
--- a/src/stream/DOT-Hip.cpp
+++ b/src/stream/DOT-Hip.cpp
@@ -71,17 +71,17 @@ void DOT::runHipVariantBlock(VariantID vid)

     Real_ptr dprod;
     allocData(rds, dprod, 1);
-    Real_ptr hprod = dprod;
+    Real_ptr hdprod = dprod;
     if (separate_buffers) {
-      allocData(hrds, hprod, 1);
+      allocData(hrds, hdprod, 1);
     }

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

       if (separate_buffers) {
-        *hprod = m_dot_init;
-        hipErrchk( hipMemcpyAsync( dprod, hprod, sizeof(Real_type),
+        *hdprod = m_dot_init;
+        hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
       } else {
         *dprod = m_dot_init;
@@ -95,18 +95,18 @@ void DOT::runHipVariantBlock(VariantID vid)
       hipErrchk( hipGetLastError() );

       if (separate_buffers) {
-        hipErrchk( hipMemcpyAsync( hprod, dprod, sizeof(Real_type),
+        hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
       hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_dot += *hprod;
+      m_dot += *hdprod;

     }
     stopTimer();

     deallocData(rds, dprod);
     if (separate_buffers) {
-      deallocData(hrds, hprod);
+      deallocData(hrds, hdprod);
     }

   } else if ( vid == RAJA_HIP ) {
@@ -150,9 +150,9 @@ void DOT::runHipVariantOccGS(VariantID vid)

     Real_ptr dprod;
     allocData(rds, dprod, 1);
-    Real_ptr hprod = dprod;
+    Real_ptr hdprod = dprod;
     if (separate_buffers) {
-      allocData(hrds, hprod, 1);
+      allocData(hrds, hdprod, 1);
     }

     constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -163,8 +163,8 @@
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

       if (separate_buffers) {
-        *hprod = m_dot_init;
-        hipErrchk( hipMemcpyAsync( dprod, hprod, sizeof(Real_type),
+        *hdprod = m_dot_init;
+        hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
       } else {
         *dprod = m_dot_init;
@@ -178,18 +178,18 @@
       hipErrchk( hipGetLastError() );

       if (separate_buffers) {
-        hipErrchk( hipMemcpyAsync( hprod, dprod, sizeof(Real_type),
+        hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
       hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_dot += *hprod;
+      m_dot += *hdprod;

     }
     stopTimer();

     deallocData(rds, dprod);
     if (separate_buffers) {
-      deallocData(hrds, hprod);
+      deallocData(hrds, hdprod);
     }

   } else if ( vid == RAJA_HIP ) {

From f6e2898b4ccdea6f793a088b5e6c6973ecb8ae69 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 30 Nov 2023 10:21:09 -0800
Subject: [PATCH 160/454] Reduce reducer code duplication

---
 src/algorithm/REDUCE_SUM-Cuda.cpp | 58 ++++++-------------------
 src/algorithm/REDUCE_SUM-Hip.cpp  | 58 ++++++-------------------
 src/basic/PI_REDUCE-Cuda.cpp      | 40 +++++----------------
 src/basic/PI_REDUCE-Hip.cpp       | 40 +++++----------------
 src/basic/REDUCE3_INT-Cuda.cpp    | 40 +++++----------------
 src/basic/REDUCE3_INT-Hip.cpp     | 40 +++++----------------
 src/basic/REDUCE_STRUCT-Cuda.cpp  | 40 +++++----------------
 src/basic/REDUCE_STRUCT-Hip.cpp   | 40 +++++----------------
 src/basic/TRAP_INT-Cuda.cpp       | 40 +++++----------------
 src/basic/TRAP_INT-Hip.cpp        | 40 +++++----------------
 src/common/GPUUtils.hpp           | 19 ++++++++++
 src/stream/DOT-Cuda.cpp           | 40 +++++----------------
 src/stream/DOT-Hip.cpp            | 40 +++++----------------
 13 files changed, 121 insertions(+), 414 deletions(-)

diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp
index 52b465613..94481333e 100644
--- a/src/algorithm/REDUCE_SUM-Cuda.cpp
+++ b/src/algorithm/REDUCE_SUM-Cuda.cpp
@@ -70,16 +70,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)

     int len = iend - ibegin;

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     // Determine temporary device storage requirements
     void* d_temp_storage = nullptr;
@@ -112,7 +103,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)
                                         m_sum_init,
                                         stream));

-      if (separate_buffers) {
+      if (sum != hsum) {
         cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, stream ) );
       }
@@ -125,10 +116,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)
     // Free temporary storage
     deallocData(DataSpace::CudaDevice, temp_storage);

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else {
@@ -151,21 +139,12 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sum != hsum) {
         *hsum = m_sum_init;
         cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -181,7 +160,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid)
                           iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (sum != hsum) {
         cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -191,10 +170,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_CUDA ) {
@@ -234,16 +210,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -252,7 +219,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sum != hsum) {
         *hsum = m_sum_init;
         cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -268,7 +235,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)
                           iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (sum != hsum) {
         cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -278,10 +245,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp
index 36db61c3b..16edb0c8c 100644
--- a/src/algorithm/REDUCE_SUM-Hip.cpp
+++ b/src/algorithm/REDUCE_SUM-Hip.cpp
@@ -75,16 +75,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid)

     int len = iend - ibegin;

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     // Determine temporary device storage requirements
     void* d_temp_storage = nullptr;
@@ -139,7 +130,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid)
                                     stream));
#endif

-      if (separate_buffers) {
+      if (sum != hsum) {
         hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, stream ) );
       }
@@ -152,10 +143,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid)
     // Free temporary storage
     deallocData(DataSpace::HipDevice, temp_storage);

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else {
@@ -178,21 +166,12 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sum != hsum) {
         *hsum = m_sum_init;
         hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -207,7 +186,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)
                          x, sum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (sum != hsum) {
         hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -217,10 +196,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_HIP ) {
@@ -260,16 +236,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sum;
-    allocData(rds, sum, 1);
-    Real_ptr hsum = sum;
-    if (separate_buffers) {
-      allocData(hrds, hsum, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -278,7 +245,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sum != hsum) {
         *hsum = m_sum_init;
         hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -293,7 +260,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
                          x, sum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (sum != hsum) {
         hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -303,10 +270,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sum);
-    if (separate_buffers) {
-      deallocData(hrds, hsum);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index 6be24f908..e1d046160 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -67,21 +67,12 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr pi;
-    allocData(rds, pi, 1);
-    Real_ptr hpi = pi;
-    if (separate_buffers) {
-      allocData(hrds, hpi, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -97,7 +88,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)
                         iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -107,10 +98,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, pi);
-    if (separate_buffers) {
-      deallocData(hrds, hpi);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_CUDA ) {
@@ -147,16 +135,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr pi;
-    allocData(rds, pi, 1);
-    Real_ptr hpi = pi;
-    if (separate_buffers) {
-      allocData(hrds, hpi, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -165,7 +144,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -181,7 +160,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
                         iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -191,10 +170,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, pi);
-    if (separate_buffers) {
-      deallocData(hrds, hpi);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp
index a3dd33af5..4049287fc 100644
--- a/src/basic/PI_REDUCE-Hip.cpp
+++ b/src/basic/PI_REDUCE-Hip.cpp
@@ -67,21 +67,12 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr pi;
-    allocData(rds, pi, 1);
-    Real_ptr hpi = pi;
-    if (separate_buffers) {
-      allocData(hrds, hpi, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -96,7 +87,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)
                        dx, pi, m_pi_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -106,10 +97,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, pi);
-    if (separate_buffers) {
-      deallocData(hrds, hpi);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_HIP ) {
@@ -146,16 +134,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr pi;
-    allocData(rds, pi, 1);
-    Real_ptr hpi = pi;
-    if (separate_buffers) {
-      allocData(hrds, hpi, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -164,7 +143,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -179,7 +158,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
                        dx, pi, m_pi_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -189,10 +168,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, pi);
-    if (separate_buffers) {
-      deallocData(hrds, hpi);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp
index 97b02550a..a568c1c3c 100644
--- a/src/basic/REDUCE3_INT-Cuda.cpp
+++ b/src/basic/REDUCE3_INT-Cuda.cpp
@@ -80,21 +80,12 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Int_ptr vmem;
-    allocData(rds, vmem, 3);
-    Int_ptr hvmem = vmem;
-    if (separate_buffers) {
-      allocData(hrds, hvmem, 3);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hvmem[0] = m_vsum_init;
         hvmem[1] = m_vmin_init;
         hvmem[2] = m_vmax_init;
@@ -116,7 +107,7 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid)
                          iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         cudaErrchk( cudaMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -128,10 +119,7 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, vmem);
-    if (separate_buffers) {
-      deallocData(hrds, hvmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem);

   } else if ( vid == RAJA_CUDA ) {
@@ -172,16 +160,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Int_ptr vmem;
-    allocData(rds, vmem, 3);
-    Int_ptr hvmem = vmem;
-    if (separate_buffers) {
-      allocData(hrds, hvmem, 3);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);

     constexpr size_t shmem = 3*sizeof(Int_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -190,7 +169,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hvmem[0] = m_vsum_init;
         hvmem[1] = m_vmin_init;
         hvmem[2] = m_vmax_init;
@@ -212,7 +191,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid)
                          iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         cudaErrchk( cudaMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -224,10 +203,7 @@ void REDUCE3_INT::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, vmem);
-    if (separate_buffers) {
-      deallocData(hrds, hvmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp
index 03f0f4696..e0359d4e1 100644
--- a/src/basic/REDUCE3_INT-Hip.cpp
+++ b/src/basic/REDUCE3_INT-Hip.cpp
@@ -80,21 +80,12 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Int_ptr vmem;
-    allocData(rds, vmem, 3);
-    Int_ptr hvmem = vmem;
-    if (separate_buffers) {
-      allocData(hrds, hvmem, 3);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hvmem[0] = m_vsum_init;
         hvmem[1] = m_vmin_init;
         hvmem[2] = m_vmax_init;
@@ -116,7 +107,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid)
                         iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -128,10 +119,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, vmem);
-    if (separate_buffers) {
-      deallocData(hrds, hvmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem);

   } else if ( vid == RAJA_HIP ) {
@@ -172,16 +160,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Int_ptr vmem;
-    allocData(rds, vmem, 3);
-    Int_ptr hvmem = vmem;
-    if (separate_buffers) {
-      allocData(hrds, hvmem, 3);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);

     constexpr size_t shmem = 3*sizeof(Int_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -190,7 +169,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hvmem[0] = m_vsum_init;
         hvmem[1] = m_vmin_init;
         hvmem[2] = m_vmax_init;
@@ -213,7 +192,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid)
                         iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (vmem != hvmem) {
         hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -225,10 +204,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, vmem);
-    if (separate_buffers) {
-      deallocData(hrds, hvmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp
index 55aa2f8f5..ed6e3d8af 100644
--- a/src/basic/REDUCE_STRUCT-Cuda.cpp
+++ b/src/basic/REDUCE_STRUCT-Cuda.cpp
@@ -109,21 +109,12 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr mem;  //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(rds, mem, 6);
-    Real_ptr hmem = mem;
-    if (separate_buffers) {
-      allocData(hrds, hmem, 6);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (mem != hmem) {
         hmem[0] = m_init_sum; // xcenter
         hmem[1] = m_init_min; // xmin
         hmem[2] = m_init_max; // xmax
@@ -152,7 +143,7 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid)
                           points.N);
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (mem != hmem) {
         cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -167,10 +158,7 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, mem);
-    if (separate_buffers) {
-      deallocData(hrds, hmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem);

   } else if ( vid == RAJA_CUDA ) {
@@ -219,16 +207,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr mem;  //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(rds, mem, 6);
-    Real_ptr hmem = mem;
-    if (separate_buffers) {
-      allocData(hrds, hmem, 6);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6);

     constexpr size_t shmem = 6*sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -237,7 +216,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (mem != hmem) {
         hmem[0] = m_init_sum; // xcenter
         hmem[1] = m_init_min; // xmin
         hmem[2] = m_init_max; // xmax
@@ -266,7 +245,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid)
                           points.N);
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (mem != hmem) {
         cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -281,10 +260,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, mem);
-    if (separate_buffers) {
-      deallocData(hrds, hmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp
index 0a7c36add..e6017d394 100644
--- a/src/basic/REDUCE_STRUCT-Hip.cpp
+++ b/src/basic/REDUCE_STRUCT-Hip.cpp
@@ -110,21 +110,12 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr mem;  //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(rds, mem, 6);
-    Real_ptr hmem = mem;
-    if (separate_buffers) {
-      allocData(hrds, hmem, 6);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (mem != hmem) {
         hmem[0] = m_init_sum; // xcenter
         hmem[1] = m_init_min; // xmin
         hmem[2] = m_init_max; // xmax
@@ -155,7 +146,7 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid)
                          points.N);
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (mem != hmem) {
         hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -170,10 +161,7 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, mem);
-    if (separate_buffers) {
-      deallocData(hrds, hmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem);

   } else if ( vid == RAJA_HIP ) {
@@ -221,16 +209,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr mem;  //xcenter,xmin,xmax,ycenter,ymin,ymax
-    allocData(rds, mem, 6);
-    Real_ptr hmem = mem;
-    if (separate_buffers) {
-      allocData(hrds, hmem, 6);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6);

     constexpr size_t shmem = 6*sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -239,7 +218,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (mem != hmem) {
         hmem[0] = m_init_sum; // xcenter
         hmem[1] = m_init_min; // xmin
         hmem[2] = m_init_max; // xmax
@@ -269,7 +248,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid)
                          points.N);
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (mem != hmem) {
         hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -284,10 +263,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, mem);
-    if (separate_buffers) {
-      deallocData(hrds, hmem);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp
index 39a0cc254..0c2a63cc7 100644
--- a/src/basic/TRAP_INT-Cuda.cpp
+++ b/src/basic/TRAP_INT-Cuda.cpp
@@ -86,21 +86,12 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sumx;
-    allocData(rds, sumx, 1);
-    Real_ptr hsumx = sumx;
-    if (separate_buffers) {
-      allocData(hrds, hsumx, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         *hsumx = m_sumx_init;
         cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -118,7 +109,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid)
                          iend);
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -128,10 +119,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sumx);
-    if (separate_buffers) {
-      deallocData(hrds, hsumx);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx);

   } else if ( vid == RAJA_CUDA ) {
@@ -168,16 +156,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sumx;
-    allocData(rds, sumx, 1);
-    Real_ptr hsumx = sumx;
-    if (separate_buffers) {
-      allocData(hrds, hsumx, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -186,7 +165,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         *hsumx = m_sumx_init;
         cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -204,7 +183,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid)
                          iend);
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -214,10 +193,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sumx);
-    if (separate_buffers) {
-      deallocData(hrds, hsumx);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp
index 6cf272671..5a8cb3d0e 100644
--- a/src/basic/TRAP_INT-Hip.cpp
+++ b/src/basic/TRAP_INT-Hip.cpp
@@ -86,21 +86,12 @@ void TRAP_INT::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sumx;
-    allocData(rds, sumx, 1);
-    Real_ptr hsumx = sumx;
-    if (separate_buffers) {
-      allocData(hrds, hsumx, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         *hsumx = m_sumx_init;
         hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -117,7 +108,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid)
                         iend);
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -127,10 +118,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sumx);
-    if (separate_buffers) {
-      deallocData(hrds, hsumx);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx);

   } else if ( vid == RAJA_HIP ) {
@@ -167,16 +155,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr sumx;
-    allocData(rds, sumx, 1);
-    Real_ptr hsumx = sumx;
-    if (separate_buffers) {
-      allocData(hrds, hsumx, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -185,7 +164,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         *hsumx = m_sumx_init;
         hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -203,7 +182,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid)
                         iend);
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (sumx != hsumx) {
         hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -213,10 +192,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, sumx);
-    if (separate_buffers) {
-      deallocData(hrds, hsumx);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 8d6012a6d..7bc737829 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -168,6 +168,25 @@ inline void seq_for(camp::int_seq const&, Func&& func)

 } // closing brace for rajaperf namespace

+//
+#define RAJAPERF_GPU_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \
+  DataSpace reduction_data_space = getReductionDataSpace(vid); \
+  DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \
+  \
+  pointer_type device_ptr_name; \
+  allocData(reduction_data_space, device_ptr_name, (length)); \
+  pointer_type host_ptr_name = device_ptr_name; \
+  if (reduction_data_space != host_data_space) { \
+    allocData(host_data_space, host_ptr_name, (length)); \
+  }
+
+//
+#define RAJAPERF_GPU_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \
+  deallocData(reduction_data_space, device_ptr_name); \
+  if (reduction_data_space != host_data_space) { \
+    deallocData(host_data_space, host_ptr_name); \
+  }
+
 //
 #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \
 void kernel::run##variant##Variant(VariantID vid, size_t tune_idx) \

diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp
index 351911b34..2acec1b2a 100644
--- a/src/stream/DOT-Cuda.cpp
+++ b/src/stream/DOT-Cuda.cpp
@@ -65,21 +65,12 @@ void DOT::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr dprod;
-    allocData(rds, dprod, 1);
-    Real_ptr hdprod = dprod;
-    if (separate_buffers) {
-      allocData(hrds, hdprod, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         *hdprod = m_dot_init;
         cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -93,7 +84,7 @@ void DOT::runCudaVariantBlock(VariantID vid)
                        a, b, dprod, m_dot_init, iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -103,10 +94,7 @@ void DOT::runCudaVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, dprod);
-    if (separate_buffers) {
-      deallocData(hrds, hdprod);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod);

   } else if ( vid == RAJA_CUDA ) {
@@ -143,16 +131,7 @@ void DOT::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr dprod;
-    allocData(rds, dprod, 1);
-    Real_ptr hdprod = dprod;
-    if (separate_buffers) {
-      allocData(hrds, hdprod, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -161,7 +140,7 @@ void DOT::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         *hdprod = m_dot_init;
         cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -175,7 +154,7 @@ void DOT::runCudaVariantOccGS(VariantID vid)
                        a, b, dprod, m_dot_init, iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -185,10 +164,7 @@ void DOT::runCudaVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, dprod);
-    if (separate_buffers) {
-      deallocData(hrds, hdprod);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp
index 379872940..91344415c 100644
--- a/src/stream/DOT-Hip.cpp
+++ b/src/stream/DOT-Hip.cpp
@@ -65,21 +65,12 @@ void DOT::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr dprod;
-    allocData(rds, dprod, 1);
-    Real_ptr hdprod = dprod;
-    if (separate_buffers) {
-      allocData(hrds, hdprod, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         *hdprod = m_dot_init;
         hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -94,7 +85,7 @@ void DOT::runHipVariantBlock(VariantID vid)
                       a, b, dprod, m_dot_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -104,10 +95,7 @@ void DOT::runHipVariantBlock(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, dprod);
-    if (separate_buffers) {
-      deallocData(hrds, hdprod);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod);

   } else if ( vid == RAJA_HIP ) {
@@ -144,16 +132,7 @@ void DOT::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    DataSpace rds = getReductionDataSpace(vid);
-    DataSpace hrds = hostAccessibleDataSpace(rds);
-    const bool separate_buffers = (hrds != rds);
-
-    Real_ptr dprod;
-    allocData(rds, dprod, 1);
-    Real_ptr hdprod = dprod;
-    if (separate_buffers) {
-      allocData(hrds, hdprod, 1);
-    }
+    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -162,7 +141,7 @@ void DOT::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         *hdprod = m_dot_init;
         hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -177,7 +156,7 @@ void DOT::runHipVariantOccGS(VariantID vid)
                       a, b, dprod, m_dot_init, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (dprod != hdprod) {
         hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -187,10 +166,7 @@ void DOT::runHipVariantOccGS(VariantID vid)
     }
     stopTimer();

-    deallocData(rds, dprod);
-    if (separate_buffers) {
-      deallocData(hrds, hdprod);
-    }
+    RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod);

   } else if ( vid == RAJA_HIP ) {

From 86a2c3cf376ee7c08dbe5ad9bf859438c5d964ba Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 30 Nov 2023 10:34:07 -0800
Subject: [PATCH 161/454] Bring PI_ATOMIC in line with reduction changes

---
 src/basic/PI_ATOMIC-Cuda.cpp | 25 ++++++++++++-------------
 src/basic/PI_ATOMIC-Hip.cpp  | 25 ++++++++++++-------------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp
index 6bd2bfc7d..d65189b1f 100644
--- a/src/basic/PI_ATOMIC-Cuda.cpp
+++ b/src/basic/PI_ATOMIC-Cuda.cpp
@@ -47,13 +47,12 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)

   PI_ATOMIC_DATA_SETUP;

-  DataSpace rds = getReductionDataSpace(vid);
-  DataSpace hrds = hostAccessibleDataSpace(rds);
-  const bool separate_buffers = (hrds != rds);
+  DataSpace reduction_data_space = getReductionDataSpace(vid);
+  DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space);

   Real_ptr hpi = pi;
-  if (separate_buffers) {
-    allocData(hrds, hpi, 1);
+  if (reduction_data_space != host_data_space) {
+    allocData(host_data_space, hpi, 1);
   }

   if ( vid == Base_CUDA ) {
@@ -61,7 +60,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -74,7 +73,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
       pi_atomic<<<grid_size, block_size, shmem, res.get_stream()>>>( pi, dx, iend );
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -89,7 +88,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -106,7 +105,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
       });
       cudaErrchk( cudaGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -121,7 +120,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
                                      cudaMemcpyHostToDevice, res.get_stream() ) );
@@ -135,7 +134,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
         RAJA::atomicAdd(pi, dx / (1.0 + x * x));
       });

-      if (separate_buffers) {
+      if (pi != hpi) {
         cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
                                      cudaMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -149,8 +148,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     getCout() << "\n  PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl;
   }

-  if (separate_buffers) {
-    deallocData(hrds, hpi);
+  if (pi != hpi) {
+    deallocData(host_data_space, hpi);
   }
 }

diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp
index 32911ee82..36841a209 100644
--- a/src/basic/PI_ATOMIC-Hip.cpp
+++ b/src/basic/PI_ATOMIC-Hip.cpp
@@ -47,13 +47,12 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)

   PI_ATOMIC_DATA_SETUP;

-  DataSpace rds = getReductionDataSpace(vid);
-  DataSpace hrds = hostAccessibleDataSpace(rds);
-  const bool separate_buffers = (hrds != rds);
+  DataSpace reduction_data_space = getReductionDataSpace(vid);
+  DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space);

   Real_ptr hpi = pi;
-  if (separate_buffers) {
-    allocData(hrds, hpi, 1);
+  if (reduction_data_space != host_data_space) {
+    allocData(host_data_space, hpi, 1);
   }

   if ( vid == Base_HIP ) {
@@ -61,7 +60,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -74,7 +73,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
       hipLaunchKernelGGL((atomic_pi), grid_size, block_size, shmem, res.get_stream(), pi, dx, iend );
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -89,7 +88,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -108,7 +107,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
                          grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda);
       hipErrchk( hipGetLastError() );

-      if (separate_buffers) {
+      if (pi != hpi) {
         hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -123,7 +122,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (separate_buffers) {
+      if (pi != hpi) {
         *hpi = m_pi_init;
         hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
                                    hipMemcpyHostToDevice, res.get_stream() ) );
@@ -137,7 +136,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
         RAJA::atomicAdd(pi, dx / (1.0 + x * x));
       });

-      if (separate_buffers) {
+      if (pi != hpi) {
         hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
                                    hipMemcpyDeviceToHost, res.get_stream() ) );
       }
@@ -151,8 +150,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     getCout() << "\n  PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl;
   }

-  if (separate_buffers) {
-    deallocData(hrds, hpi);
+  if (pi != hpi) {
+    deallocData(host_data_space, hpi);
   }
 }

From 576f1cf7bb77981d9e148083fc6cee400bfff665 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 30 Nov 2023 12:24:30 -0800
Subject: [PATCH 162/454] Put GPU reduction initialization and copy back under
 a macro

---
 src/algorithm/REDUCE_SUM-Cuda.cpp | 58 ++++++------------------
 src/algorithm/REDUCE_SUM-Hip.cpp  | 58 ++++++------------------
 src/basic/PI_ATOMIC-Cuda.cpp      | 51 +++++----------------
 src/basic/PI_ATOMIC-Hip.cpp       | 51 +++++----------------
 src/basic/PI_REDUCE-Cuda.cpp      | 46 ++++++------------
 src/basic/PI_REDUCE-Hip.cpp       | 46 ++++++------------
 src/basic/REDUCE3_INT-Cuda.cpp    | 60 +++++++----------------
 src/basic/REDUCE3_INT-Hip.cpp     | 60 +++++++----------------
 src/basic/REDUCE_STRUCT-Cuda.cpp  | 80 +++++++++----------------------
 src/basic/REDUCE_STRUCT-Hip.cpp   | 80 +++++++++----------------------
 src/basic/TRAP_INT-Cuda.cpp       | 42 +++++-----------
 src/basic/TRAP_INT-Hip.cpp        | 42 +++++-----------
 src/common/GPUUtils.hpp           | 74 ++++++++++++++++++++++------
 src/stream/DOT-Cuda.cpp           | 42 +++++-----------
 src/stream/DOT-Hip.cpp            | 42 +++++-----------
 15 files changed, 260 insertions(+), 540 deletions(-)

diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp
index 94481333e..dea245e85 100644
--- a/src/algorithm/REDUCE_SUM-Cuda.cpp
+++ b/src/algorithm/REDUCE_SUM-Cuda.cpp
@@ -70,7 +70,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)

     int len = iend - ibegin;

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     // Determine temporary device storage requirements
     void* d_temp_storage = nullptr;
@@ -116,7 +116,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid)
     // Free temporary storage
     deallocData(DataSpace::CudaDevice, temp_storage);

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum);

   } else {
@@ -139,18 +139,12 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (sum != hsum) {
-        *hsum = m_sum_init;
-        cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *sum = m_sum_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -160,17 +154,12 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid)
                           iend );
       cudaErrchk( cudaGetLastError() );

-      if (sum != hsum) {
-        cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_sum = *hsum;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_CUDA ) {
@@ -210,7 +199,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -219,13 +208,7 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (sum != hsum) {
-        *hsum = m_sum_init;
-        cudaErrchk( cudaMemcpyAsync( sum, hsum, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *sum = m_sum_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1);

       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
@@ -235,17 +218,12 @@ void REDUCE_SUM::runCudaVariantOccGS(VariantID vid)
                           iend );
       cudaErrchk( cudaGetLastError() );

-      if (sum != hsum) {
-        cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_sum = *hsum;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_CUDA ) {

diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp
index 16edb0c8c..9613d1e63 100644
--- a/src/algorithm/REDUCE_SUM-Hip.cpp
+++ b/src/algorithm/REDUCE_SUM-Hip.cpp
@@ -75,7 +75,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid)

     int len = iend - ibegin;

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     // Determine temporary device storage requirements
     void* d_temp_storage = nullptr;
@@ -143,7 +143,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid)
     // Free temporary storage
     deallocData(DataSpace::HipDevice, temp_storage);

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum);

   } else {
@@ -166,18 +166,12 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (sum != hsum) {
-        *hsum = m_sum_init;
-        hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *sum = m_sum_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -186,17 +180,12 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid)
                          x, sum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );

-      if (sum != hsum) {
-        hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_sum = *hsum;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_HIP ) {
@@ -236,7 +225,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sum, hsum, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -245,13 +234,7 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (sum != hsum) {
-        *hsum = m_sum_init;
-        hipErrchk( hipMemcpyAsync( sum, hsum, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *sum = m_sum_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1);

       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
@@ -260,17 +243,12 @@ void REDUCE_SUM::runHipVariantOccGS(VariantID vid)
                          x, sum, m_sum_init, iend );
       hipErrchk( hipGetLastError() );

-      if (sum != hsum) {
-        hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_sum = *hsum;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(sum, hsum);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum);

   } else if ( vid == RAJA_HIP ) {

diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp
index d65189b1f..7a1897a2c 100644
--- a/src/basic/PI_ATOMIC-Cuda.cpp
+++ b/src/basic/PI_ATOMIC-Cuda.cpp
@@ -60,25 +60,16 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
       pi_atomic<<<grid_size, block_size, shmem, res.get_stream()>>>( pi, dx, iend );
       cudaErrchk( cudaGetLastError() );

-      if (pi != hpi) {
-        cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();
@@ -89,13 +79,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
@@ -105,12 +90,9 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
       });
       cudaErrchk( cudaGetLastError() );

-      if (pi != hpi) {
-        cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();
@@ -120,13 +102,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       RAJA::forall< RAJA::cuda_exec >( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
@@ -134,12 +110,9 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid)
           RAJA::atomicAdd(pi, dx / (1.0 + x * x));
       });

-      if (pi != hpi) {
-        cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp
index 36841a209..0544c5f2a 100644
--- a/src/basic/PI_ATOMIC-Hip.cpp
+++ b/src/basic/PI_ATOMIC-Hip.cpp
@@ -60,25 +60,16 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
       hipLaunchKernelGGL((atomic_pi), grid_size, block_size, shmem, res.get_stream(), pi, dx, iend );
       hipErrchk( hipGetLastError() );

-      if (pi != hpi) {
-        hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();
@@ -89,13 +79,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        hipErrchk(
 hipMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       auto atomic_pi_lambda = [=] __device__ (Index_type i) {
           double x = (double(i) + 0.5) * dx;
@@ -107,12 +92,9 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
                          grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda);
       hipErrchk( hipGetLastError() );

-      if (pi != hpi) {
-        hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();
@@ -122,13 +104,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       RAJA::forall< RAJA::hip_exec >( res,
         RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
@@ -136,12 +112,9 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid)
           RAJA::atomicAdd(pi, dx / (1.0 + x * x));
       });

-      if (pi != hpi) {
-        hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_pi_final = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi_final = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index e1d046160..d237b08cf 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -67,18 +67,12 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -88,17 +82,14 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)
                         iend );
       cudaErrchk( cudaGetLastError() );

-      if (pi != hpi) {
-        cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_pi = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_CUDA ) {
@@ -112,7 +103,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid)
          PI_REDUCE_BODY;
       });

-      m_pi = 4.0 * static_cast<Real_type>(pi.get());
+      m_pi = static_cast<Real_type>(4) * static_cast<Real_type>(pi.get());

     }
     stopTimer();
@@ -135,7 +126,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)

   if ( vid == Base_CUDA ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);
+    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks(
@@ -144,13 +135,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        cudaErrchk( cudaMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
@@ -160,17 +145,14 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
                         iend );
       cudaErrchk( cudaGetLastError() );

-      if (pi != hpi) {
-        cudaErrchk( cudaMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk( cudaStreamSynchronize( res.get_stream() ) );
-      m_pi = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);
+    RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_CUDA ) {
@@ -184,7 +166,7 @@ void PI_REDUCE::runCudaVariantOccGS(VariantID vid)
          PI_REDUCE_BODY;
       });

-      m_pi = 4.0 * static_cast<Real_type>(pi.get());
+      m_pi = static_cast<Real_type>(4) * static_cast<Real_type>(pi.get());

     }
     stopTimer();

diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp
index 4049287fc..1ae12a067 100644
--- a/src/basic/PI_REDUCE-Hip.cpp
+++ b/src/basic/PI_REDUCE-Hip.cpp
@@ -67,18 +67,12 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)

   if ( vid == Base_HIP ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
@@ -87,17 +81,14 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)
                        dx, pi, m_pi_init, iend );
       hipErrchk( hipGetLastError() );

-      if (pi != hpi) {
-        hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_pi = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_HIP ) {
@@ -111,7 +102,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid)
          PI_REDUCE_BODY;
       });

-      m_pi = 4.0 * static_cast<Real_type>(pi.get());
+      m_pi = static_cast<Real_type>(4) * static_cast<Real_type>(pi.get());

     }
     stopTimer();
@@ -134,7 +125,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)

   if ( vid == Base_HIP ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, pi, hpi, 1);
+    RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1);

     constexpr size_t shmem = sizeof(Real_type)*block_size;
     const size_t max_grid_size = detail::getHipOccupancyMaxBlocks(
@@ -143,13 +134,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (pi != hpi) {
-        *hpi = m_pi_init;
-        hipErrchk( hipMemcpyAsync( pi, hpi, sizeof(Real_type),
-                                   hipMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        *pi = m_pi_init;
-      }
+      RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);

       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
@@ -158,17 +143,14 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
                        dx, pi, m_pi_init, iend );
       hipErrchk( hipGetLastError() );

-      if (pi != hpi) {
-        hipErrchk( hipMemcpyAsync( hpi, pi, sizeof(Real_type),
-                                   hipMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      hipErrchk( hipStreamSynchronize( res.get_stream() ) );
-      m_pi = *hpi * 4.0;
+      Real_type rpi;
+      RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
+      m_pi = rpi * static_cast<Real_type>(4);

     }
     stopTimer();

-    RAJAPERF_GPU_REDUCER_TEARDOWN(pi, hpi);
+    RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi);

   } else if ( vid == RAJA_HIP ) {
@@ -182,7 +164,7 @@ void PI_REDUCE::runHipVariantOccGS(VariantID vid)
          PI_REDUCE_BODY;
       });

-      m_pi = 4.0 * static_cast<Real_type>(pi.get());
+      m_pi = static_cast<Real_type>(4) * static_cast<Real_type>(pi.get());

     }
     stopTimer();

diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp
index a568c1c3c..b0aae15fe 100644
--- a/src/basic/REDUCE3_INT-Cuda.cpp
+++ b/src/basic/REDUCE3_INT-Cuda.cpp
@@ -80,22 +80,13 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid)

   if ( vid == Base_CUDA ) {

-    RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);
+    RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3);

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      if (vmem != hvmem) {
-        hvmem[0] = m_vsum_init;
-        hvmem[1] = m_vmin_init;
-        hvmem[2] = m_vmax_init;
-        cudaErrchk( cudaMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type),
-                                     cudaMemcpyHostToDevice, res.get_stream() ) );
-      } else {
-        vmem[0] = m_vsum_init;
-        vmem[1] = m_vmin_init;
-        vmem[2] = m_vmax_init;
-      }
+      Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init};
+      RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3);

       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 3*sizeof(Int_type)*block_size;
@@ -107,19 +98,16 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid)
                          iend );
       cudaErrchk( cudaGetLastError() );

-      if (vmem != hvmem) {
-        cudaErrchk( cudaMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type),
-                                     cudaMemcpyDeviceToHost, res.get_stream() ) );
-      }
-      cudaErrchk(
cudaStreamSynchronize( res.get_stream() ) ); - m_vsum += hvmem[0]; - m_vmin = RAJA_MIN(m_vmin, hvmem[1]); - m_vmax = RAJA_MAX(m_vmax, hvmem[2]); + Int_type rvmem[3]; + RAJAPERF_CUDA_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); + m_vsum += rvmem[0]; + m_vmin = RAJA_MIN(m_vmin, rvmem[1]); + m_vmax = RAJA_MAX(m_vmax, rvmem[2]); } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem); + RAJAPERF_CUDA_REDUCER_TEARDOWN(vmem, hvmem); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index e0359d4e1..8ce3e9d2f 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -80,22 +80,13 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); + RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (vmem != hvmem) { - hvmem[0] = m_vsum_init; - hvmem[1] = m_vmin_init; - hvmem[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - vmem[0] = m_vsum_init; - vmem[1] = m_vmin_init; - vmem[2] = m_vmax_init; - } + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; @@ -107,19 +98,16 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - if (vmem != hvmem) { - hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += hvmem[0]; - m_vmin = RAJA_MIN(m_vmin, hvmem[1]); - m_vmax = RAJA_MAX(m_vmax, hvmem[2]); + Int_type rvmem[3]; + RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); + m_vsum += rvmem[0]; + m_vmin = RAJA_MIN(m_vmin, rvmem[1]); + m_vmax = RAJA_MAX(m_vmax, rvmem[2]); } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem); + RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); } else if ( vid == RAJA_HIP ) { @@ -160,7 +148,7 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); + RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -169,17 +157,8 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (vmem != hvmem) { - hvmem[0] = m_vsum_init; - hvmem[1] = m_vmin_init; - hvmem[2] = m_vmax_init; - hipErrchk( hipMemcpyAsync( vmem, hvmem, 3*sizeof(Int_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - vmem[0] = m_vsum_init; - vmem[1] = m_vmin_init; - vmem[2] = m_vmax_init; - } + Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; + RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -192,19 +171,16 @@ void REDUCE3_INT::runHipVariantOccGS(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - if (vmem != hvmem) { - hipErrchk( hipMemcpyAsync( hvmem, vmem, 3*sizeof(Int_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_vsum += hvmem[0]; - 
m_vmin = RAJA_MIN(m_vmin, hvmem[1]); - m_vmax = RAJA_MAX(m_vmax, hvmem[2]); + Int_type rvmem[3]; + RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); + m_vsum += rvmem[0]; + m_vmin = RAJA_MIN(m_vmin, rvmem[1]); + m_vmax = RAJA_MAX(m_vmax, rvmem[2]); } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(vmem, hvmem); + RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); } else if ( vid == RAJA_HIP ) { diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index ed6e3d8af..fab4363aa 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -109,28 +109,13 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (mem != hmem) { - hmem[0] = m_init_sum; // xcenter - hmem[1] = m_init_min; // xmin - hmem[2] = m_init_max; // xmax - hmem[3] = m_init_sum; // ycenter - hmem[4] = m_init_min; // ymin - hmem[5] = m_init_max; // ymax - cudaErrchk( cudaMemcpyAsync( mem, hmem, 6*sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - mem[0] = m_init_sum; // xcenter - mem[1] = m_init_min; // xmin - mem[2] = m_init_max; // xmax - mem[3] = m_init_sum; // ycenter - mem[4] = m_init_min; // ymin - mem[5] = m_init_max; // ymax - } + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; @@ -143,22 +128,19 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) points.N); cudaErrchk( cudaGetLastError() ); - if (mem != hmem) { - cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); - points.SetXMin(hmem[1]); - points.SetXMax(hmem[2]); - points.SetYMin(hmem[4]); - points.SetYMax(hmem[5]); + Real_type rmem[6]; + RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); + points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); + points.SetXMin(rmem[1]); + points.SetXMax(rmem[2]); + points.SetYMin(rmem[4]); + points.SetYMax(rmem[5]); m_points=points; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem); + RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); } else if ( vid == RAJA_CUDA ) { @@ -207,7 +189,7 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -216,23 +198,8 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (mem != hmem) { - hmem[0] = m_init_sum; // xcenter - hmem[1] = m_init_min; // xmin - hmem[2] = m_init_max; // xmax - hmem[3] = m_init_sum; // ycenter - hmem[4] = m_init_min; // ymin - hmem[5] = m_init_max; // ymax - cudaErrchk( cudaMemcpyAsync( mem, hmem, 6*sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - mem[0] = m_init_sum; // xcenter - mem[1] = m_init_min; // xmin - mem[2] = m_init_max; // xmax - mem[3] = m_init_sum; // ycenter - mem[4] = m_init_min; // ymin - mem[5] = 
m_init_max; // ymax - } + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -245,22 +212,19 @@ void REDUCE_STRUCT::runCudaVariantOccGS(VariantID vid) points.N); cudaErrchk( cudaGetLastError() ); - if (mem != hmem) { - cudaErrchk( cudaMemcpyAsync( hmem, mem, 6*sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); - points.SetXMin(hmem[1]); - points.SetXMax(hmem[2]); - points.SetYMin(hmem[4]); - points.SetYMax(hmem[5]); + Real_type rmem[6]; + RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); + points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); + points.SetXMin(rmem[1]); + points.SetXMax(rmem[2]); + points.SetYMin(rmem[4]); + points.SetYMax(rmem[5]); m_points=points; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem); + RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index e6017d394..eb64aa1ec 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -110,28 +110,13 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (mem != hmem) { - hmem[0] = m_init_sum; // xcenter - hmem[1] = m_init_min; // xmin - hmem[2] = m_init_max; // xmax - hmem[3] = m_init_sum; // ycenter - hmem[4] = m_init_min; // ymin - hmem[5] = m_init_max; // ymax - hipErrchk( hipMemcpyAsync( mem, hmem, 6*sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - mem[0] = m_init_sum; // xcenter - mem[1] = m_init_min; // xmin - mem[2] = m_init_max; // xmax - mem[3] = m_init_sum; // ycenter - mem[4] = m_init_min; // ymin - mem[5] = m_init_max; // ymax - } + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; @@ -146,22 +131,19 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) points.N); hipErrchk( hipGetLastError() ); - if (mem != hmem) { - hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); - points.SetXMin(hmem[1]); - points.SetXMax(hmem[2]); - points.SetYMin(hmem[4]); - points.SetYMax(hmem[5]); + Real_type rmem[6]; + RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); + points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); + points.SetXMin(rmem[1]); + points.SetXMax(rmem[2]); + points.SetYMin(rmem[4]); + points.SetYMax(rmem[5]); m_points=points; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem); + RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); } else if ( vid == RAJA_HIP ) { @@ -209,7 +191,7 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr 
size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -218,23 +200,8 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (mem != hmem) { - hmem[0] = m_init_sum; // xcenter - hmem[1] = m_init_min; // xmin - hmem[2] = m_init_max; // xmax - hmem[3] = m_init_sum; // ycenter - hmem[4] = m_init_min; // ymin - hmem[5] = m_init_max; // ymax - hipErrchk( hipMemcpyAsync( mem, hmem, 6*sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - mem[0] = m_init_sum; // xcenter - mem[1] = m_init_min; // xmin - mem[2] = m_init_max; // xmax - mem[3] = m_init_sum; // ycenter - mem[4] = m_init_min; // ymin - mem[5] = m_init_max; // ymax - } + Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; + RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -248,22 +215,19 @@ void REDUCE_STRUCT::runHipVariantOccGS(VariantID vid) points.N); hipErrchk( hipGetLastError() ); - if (mem != hmem) { - hipErrchk( hipMemcpyAsync( hmem, mem, 6*sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); - points.SetXMin(hmem[1]); - points.SetXMax(hmem[2]); - points.SetYMin(hmem[4]); - points.SetYMax(hmem[5]); + Real_type rmem[6]; + RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); + points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); + points.SetXMin(rmem[1]); + points.SetXMax(rmem[2]); + points.SetYMin(rmem[4]); + points.SetYMax(rmem[5]); m_points=points; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(mem, hmem); + RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); } else if ( vid == RAJA_HIP ) { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 0c2a63cc7..e8fef5b7d 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -86,18 +86,12 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (sumx != hsumx) { - *hsumx = m_sumx_init; - cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - *sumx = m_sumx_init; - } + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -109,17 +103,14 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) iend); cudaErrchk( cudaGetLastError() ); - if (sumx != hsumx) { - cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += *hsumx * h; + Real_type rsumx; + RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); + m_sumx += rsumx * h; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); } else if ( vid == RAJA_CUDA ) { @@ -156,7 +147,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, 
sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -165,13 +156,7 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (sumx != hsumx) { - *hsumx = m_sumx_init; - cudaErrchk( cudaMemcpyAsync( sumx, hsumx, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - *sumx = m_sumx_init; - } + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -183,17 +168,14 @@ void TRAP_INT::runCudaVariantOccGS(VariantID vid) iend); cudaErrchk( cudaGetLastError() ); - if (sumx != hsumx) { - cudaErrchk( cudaMemcpyAsync( hsumx, sumx, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_sumx += *hsumx * h; + Real_type rsumx; + RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); + m_sumx += rsumx * h; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx); + RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); } else if ( vid == RAJA_CUDA ) { diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 5a8cb3d0e..76d69b778 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -86,18 +86,12 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (sumx != hsumx) { - *hsumx = m_sumx_init; - hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - *sumx = m_sumx_init; - } + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -108,17 +102,14 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) iend); hipErrchk( hipGetLastError() ); - if (sumx != hsumx) { - hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_sumx += *hsumx * h; + Real_type rsumx; + RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); + m_sumx += rsumx * h; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx); + RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); } else if ( vid == RAJA_HIP ) { @@ -155,7 +146,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -164,13 +155,7 @@ void TRAP_INT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (sumx != hsumx) { - *hsumx = m_sumx_init; - hipErrchk( hipMemcpyAsync( sumx, hsumx, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - *sumx = m_sumx_init; - } + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -182,17 +167,14 @@ void 
TRAP_INT::runHipVariantOccGS(VariantID vid) iend); hipErrchk( hipGetLastError() ); - if (sumx != hsumx) { - hipErrchk( hipMemcpyAsync( hsumx, sumx, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_sumx += *hsumx * h; + Real_type rsumx; + RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); + m_sumx += rsumx * h; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(sumx, hsumx); + RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); } else if ( vid == RAJA_HIP ) { diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 7bc737829..003efd858 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -169,23 +169,69 @@ inline void seq_for(camp::int_seq const&, Func&& func) } // closing brace for rajaperf namespace // -#define RAJAPERF_GPU_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ - DataSpace reduction_data_space = getReductionDataSpace(vid); \ - DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \ +#define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) \ + DataSpace reduction_data_space = getReductionDataSpace(vid); \ + DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \ \ - pointer_type device_ptr_name; \ - allocData(reduction_data_space, device_ptr_name, (length)); \ - pointer_type host_ptr_name = device_ptr_name; \ - if (reduction_data_space != host_data_space) { \ - allocData(host_data_space, host_ptr_name, (length)); \ - } + pointer_type device_ptr_name; \ + allocData(reduction_data_space, device_ptr_name, (length)); \ + pointer_type host_ptr_name = device_ptr_name; \ + if (reduction_data_space != host_data_space) { \ + allocData(host_data_space, host_ptr_name, (length)); \ + } + +// +#define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \ + deallocData(reduction_data_space, device_ptr_name); \ + if (reduction_data_space != host_data_space) { \ + deallocData(host_data_space, host_ptr_name); \ + } + +// +#define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length) \ + if (device_ptr_name != host_ptr_name) { \ + for (int i = 0; i < (length); ++i) { \ + host_ptr_name[i] = (init_ptr)[i]; \ + } \ + gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \ + (length)*sizeof(device_ptr_name[0]), \ + gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \ + } else { \ + for (int i = 0; i < (length); ++i) { \ + device_ptr_name[i] = (init_ptr)[i]; \ + } \ + } // -#define RAJAPERF_GPU_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ - deallocData(reduction_data_space, device_ptr_name); \ - if (reduction_data_space != host_data_space) { \ - deallocData(host_data_space, host_ptr_name); \ - } +#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, final_ptr, device_ptr_name, host_ptr_name, length) \ + if (device_ptr_name != host_ptr_name) { \ + gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \ + (length)*sizeof(device_ptr_name[0]), \ + gpu_type##MemcpyDeviceToHost, res.get_stream() ) ); \ + } \ + gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); \ + for (int i = 0; i < (length); ++i) { \ + (final_ptr)[i] = host_ptr_name[i]; \ + } + + +#define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, 
host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_CUDA_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, final_ptr, device_ptr_name, host_ptr_name, length) + +#define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ + RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_HIP_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, final_ptr, device_ptr_name, host_ptr_name, length) // #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \ diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 2acec1b2a..b0f55aed2 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -65,18 +65,12 @@ void DOT::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (dprod != hdprod) { - *hdprod = m_dot_init; - cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - *dprod = m_dot_init; - } + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -84,17 +78,14 @@ void DOT::runCudaVariantBlock(VariantID vid) a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); - if (dprod != hdprod) { - cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_dot += *hdprod; + Real_type rdprod; + RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); + m_dot += rdprod; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod); + RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod); } else if ( vid == RAJA_CUDA ) { @@ -131,7 +122,7 @@ void DOT::runCudaVariantOccGS(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( @@ -140,13 +131,7 @@ void DOT::runCudaVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (dprod != hdprod) { - *hdprod = m_dot_init; - cudaErrchk( cudaMemcpyAsync( dprod, hdprod, sizeof(Real_type), - cudaMemcpyHostToDevice, res.get_stream() ) ); - } else { - *dprod = m_dot_init; - } + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -154,17 
+139,14 @@ void DOT::runCudaVariantOccGS(VariantID vid) a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); - if (dprod != hdprod) { - cudaErrchk( cudaMemcpyAsync( hdprod, dprod, sizeof(Real_type), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - } - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - m_dot += *hdprod; + Real_type rdprod; + RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); + m_dot += rdprod; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod); + RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod); } else if ( vid == RAJA_CUDA ) { diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 91344415c..69ca49d47 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -65,18 +65,12 @@ void DOT::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (dprod != hdprod) { - *hdprod = m_dot_init; - hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - *dprod = m_dot_init; - } + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; @@ -85,17 +79,14 @@ void DOT::runHipVariantBlock(VariantID vid) a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); - if (dprod != hdprod) { - hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_dot += *hdprod; + Real_type rdprod; + RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); + m_dot += rdprod; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod); + RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod); } else if ( vid == RAJA_HIP ) { @@ -132,7 +123,7 @@ void DOT::runHipVariantOccGS(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_GPU_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( @@ -141,13 +132,7 @@ void DOT::runHipVariantOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if (dprod != hdprod) { - *hdprod = m_dot_init; - hipErrchk( hipMemcpyAsync( dprod, hdprod, sizeof(Real_type), - hipMemcpyHostToDevice, res.get_stream() ) ); - } else { - *dprod = m_dot_init; - } + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -156,17 +141,14 @@ void DOT::runHipVariantOccGS(VariantID vid) a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); - if (dprod != hdprod) { - hipErrchk( hipMemcpyAsync( hdprod, dprod, sizeof(Real_type), - hipMemcpyDeviceToHost, res.get_stream() ) ); - } - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - m_dot += *hdprod; + Real_type rdprod; + RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); + m_dot += rdprod; } stopTimer(); - RAJAPERF_GPU_REDUCER_TEARDOWN(dprod, hdprod); + RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod); } else if ( vid == RAJA_HIP ) { From 9d04ab84705f945d024ebb3f3fc5a24c7494be1c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 Nov 
2023 12:30:33 -0800 Subject: [PATCH 163/454] Bring PI_ATOMIC into line with reductions Move allocation of pi into per variant setup. This allows us to use a simpler setup for GPUs and then use the reduction setup macro to allocate pi. --- src/basic/PI_ATOMIC-Cuda.cpp | 15 ++++----------- src/basic/PI_ATOMIC-Hip.cpp | 14 +++----------- src/basic/PI_ATOMIC-OMP.cpp | 2 ++ src/basic/PI_ATOMIC-OMPTarget.cpp | 3 +++ src/basic/PI_ATOMIC-Seq.cpp | 2 ++ src/basic/PI_ATOMIC.cpp | 2 -- src/basic/PI_ATOMIC.hpp | 10 ++++++++-- 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 7a1897a2c..7446618fa 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -45,15 +45,9 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) auto res{getCudaResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; - DataSpace reduction_data_space = getReductionDataSpace(vid); - DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); - - Real_ptr hpi = pi; - if (reduction_data_space != host_data_space) { - allocData(host_data_space, hpi, 1); - } + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); if ( vid == Base_CUDA ) { @@ -121,9 +115,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) getCout() << "\n PI_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; } - if (pi != hpi) { - deallocData(host_data_space, hpi); - } + RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); + } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Cuda) diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 0544c5f2a..1e6fd2e7a 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -45,15 +45,9 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) auto res{getHipResource()}; - PI_ATOMIC_DATA_SETUP; + PI_ATOMIC_GPU_DATA_SETUP; - DataSpace reduction_data_space = getReductionDataSpace(vid); - DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); - - Real_ptr hpi = pi; - if (reduction_data_space != host_data_space) { - allocData(host_data_space, hpi, 1); - } + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); if ( vid == Base_HIP ) { @@ -123,9 +117,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) getCout() << "\n PI_ATOMIC : Unknown Hip variant id = " << vid << std::endl; } - if (pi != hpi) { - deallocData(host_data_space, hpi); - } + RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); } RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_ATOMIC, Hip) diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp index c031dcf32..2c0228089 100644 --- a/src/basic/PI_ATOMIC-OMP.cpp +++ b/src/basic/PI_ATOMIC-OMP.cpp @@ -99,6 +99,8 @@ void PI_ATOMIC::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } + PI_ATOMIC_DATA_TEARDOWN; + #else RAJA_UNUSED_VAR(vid); #endif diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp index 9d4f2649f..c685e026d 100644 --- a/src/basic/PI_ATOMIC-OMPTarget.cpp +++ b/src/basic/PI_ATOMIC-OMPTarget.cpp @@ -78,6 +78,9 @@ void PI_ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else { getCout() << "\n PI_ATOMIC : Unknown OMP Target variant id = " << vid << std::endl; } + + PI_ATOMIC_DATA_TEARDOWN; + } } // end namespace basic diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp index 9d3864713..b3db76b21 100644 --- a/src/basic/PI_ATOMIC-Seq.cpp +++ b/src/basic/PI_ATOMIC-Seq.cpp @@ -95,6 +95,8 @@ void PI_ATOMIC::runSeqVariant(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.cpp index 6e08f5813..b1293b531 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -64,7 +64,6 @@ PI_ATOMIC::~PI_ATOMIC() void PI_ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { m_dx = 1.0 / double(getActualProblemSize()); - allocData(getReductionDataSpace(vid), m_pi, 1); m_pi_init = 0.0; m_pi_final = -static_cast(vid); } @@ -77,7 +76,6 @@ void PI_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; - deallocData(m_pi, vid); } } // end namespace basic diff --git a/src/basic/PI_ATOMIC.hpp index fe26d9beb..38a0b62a6 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -25,7 +25,14 @@ #define PI_ATOMIC_DATA_SETUP \ Real_type dx = m_dx; \ - Real_ptr pi = m_pi; + Real_ptr pi; \ + allocData(getReductionDataSpace(vid), pi, 1); + +#define PI_ATOMIC_DATA_TEARDOWN \ + deallocData(pi, vid); + +#define PI_ATOMIC_GPU_DATA_SETUP \ + Real_type dx = m_dx; #include "common/KernelBase.hpp" @@ -68,7 +75,6 @@ class PI_ATOMIC : public KernelBase using gpu_block_sizes_type = gpu_block_size::make_list_type; Real_type m_dx; - Real_ptr m_pi; Real_type m_pi_init; Real_type m_pi_final; }; From f161f5dc70bd0b3ca4fc489b5a1bdf55b551a855 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 Nov 2023 14:33:18 -0800 Subject: [PATCH 164/454] Rename to hostCopyDataSpace Hopefully this is clearer. When you want to make a host copy of some data, this tells you which data space to use based on the data space of your data. --- src/common/DataUtils.cpp | 31 +++++++++++++++++-------------- src/common/DataUtils.hpp | 35 +++++++++++++++++++---------------- src/common/KernelBase.hpp | 2 +- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/common/DataUtils.cpp index f374b358a..df911783a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -279,8 +279,8 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, DataSpace src_dataSpace, const void* src_ptr, Size_type nbytes) { - if (hostBasedDataSpace(dst_dataSpace) == dst_dataSpace && - hostBasedDataSpace(src_dataSpace) == src_dataSpace) { + if (hostCopyDataSpace(dst_dataSpace) == dst_dataSpace && + hostCopyDataSpace(src_dataSpace) == src_dataSpace) { detail::copyHostData(dst_ptr, src_ptr, nbytes); } @@ -598,15 +598,17 @@ long double calcChecksum(Complex_ptr ptr, Size_type len, /*! - * \brief Get an host based data space for the given dataSpace. + * \brief Get a host data space to use when making a host copy of data in the given + * dataSpace. * - * A host based data space is one that is always stored on the host. + * The returned host data space should reside in memory attached to the host. * - * The intention is to check if the performance (bandwidth) of the given data - * space is good on the host. If not then fall back on a space that performs - * well on the host and in explicit copy operations with the given space. + * The intention is to get a data space with high performance on the host. + * Return the given data space if it's already performant and fall back on a + * host data space that performs well in explicit copy operations with the + * given space.
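+ * (calcChecksum, for example, uses the space returned here to stage device
+ * data on the host before summing it.)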
*/ -DataSpace hostBasedDataSpace(DataSpace dataSpace) +DataSpace hostCopyDataSpace(DataSpace dataSpace) { switch (dataSpace) { case DataSpace::Host: @@ -639,19 +641,20 @@ default: { - throw std::invalid_argument("hostBasedDataSpace : Unknown data space"); + throw std::invalid_argument("hostCopyDataSpace : Unknown data space"); } break; } } /*! - * \brief Get an host accessible data space for the given dataSpace. + * \brief Get a data space accessible to the host for the given dataSpace. * - * A host accessible data space is one that can be accessed on the host. + * The returned data space may reside in memory attached to another device. * - * The intention is to check if the given memory space is accessible on the - * host. If not then fall back on a space that is host accessible and can be - * used with explicit copy operations with the given space. + * The intention is to get a data space accessible on the host even if it is not + * performant. Return the given data space if it's already accessible and fall + * back on a space that is host accessible and performs well in explicit copy + * operations with the given space. */ DataSpace hostAccessibleDataSpace(DataSpace dataSpace) { diff --git a/src/common/DataUtils.hpp index 710cddd3f..4d512bf6d 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -160,24 +160,27 @@ long double calcChecksum(Complex_ptr d, Size_type len, /*! - * \brief Get an host based data space for the given dataSpace. + * \brief Get a host data space to use when making a host copy of data in the given + * dataSpace. * - * A host based data space is one that is always stored on the host. + * The returned host data space should reside in memory attached to the host. * - * The intention is to check if the performance (bandwidth) of the given data - * space is good on the host. If not then fall back on a space that performs - * well on the host and in explicit copy operations with the given space. + * The intention is to get a data space with high performance on the host. + * Return the given data space if it's already performant and fall back on a + * host data space that performs well in explicit copy operations with the + * given space. */ -DataSpace hostBasedDataSpace(DataSpace dataSpace); +DataSpace hostCopyDataSpace(DataSpace dataSpace); /*! - * \brief Get an host accessible data space for the given dataSpace. + * \brief Get a data space accessible to the host for the given dataSpace. * - * A host accessible data space is one that can be accessed on the host. + * The returned data space may reside in memory attached to another device. * - * The intention is to check if the given memory space is accessible on the - * host. If not then fall back on a space that is host accessible and can be - * used with explicit copy operations with the given space. + * The intention is to get a data space accessible on the host even if it is not + * performant. Return the given data space if it's already accessible and fall + * back on a space that is host accessible and performs well in explicit copy + * operations with the given space.
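+ * (For instance, CUDA or HIP managed memory can be read directly from the
+ * host even when it currently lives on the device, whereas ordinary device
+ * memory cannot and must be copied through a space from hostCopyDataSpace.)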
*/ DataSpace hostAccessibleDataSpace(DataSpace dataSpace); @@ -308,7 +311,7 @@ struct AutoDataMover template inline void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); + DataSpace init_dataSpace = hostCopyDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -327,7 +330,7 @@ template inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align, T val) { - DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); + DataSpace init_dataSpace = hostCopyDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -344,7 +347,7 @@ inline void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, S template inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); + DataSpace init_dataSpace = hostCopyDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -362,7 +365,7 @@ inline void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len template inline void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len, Size_type align) { - DataSpace init_dataSpace = hostBasedDataSpace(dataSpace); + DataSpace init_dataSpace = hostCopyDataSpace(dataSpace); allocData(init_dataSpace, ptr, len, align); @@ -381,7 +384,7 @@ inline long double calcChecksum(DataSpace dataSpace, T* ptr, Size_type len, Size T* check_ptr = ptr; T* copied_ptr = nullptr; - DataSpace check_dataSpace = hostBasedDataSpace(dataSpace); + DataSpace check_dataSpace = hostCopyDataSpace(dataSpace); if (check_dataSpace != dataSpace) { allocData(check_dataSpace, copied_ptr, len, align); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 640710770..10ffb4459 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -323,7 +323,7 @@ class KernelBase rajaperf::AutoDataMover scopedMoveData(T*& ptr, Size_type len, VariantID vid) { DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostBasedDataSpace(ds); + DataSpace hds = rajaperf::hostCopyDataSpace(ds); rajaperf::moveData(hds, ds, ptr, len, getDataAlignment()); return {ds, hds, ptr, len, getDataAlignment()}; } From d691684021be48de835891b460ec83be1056c056 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 30 Nov 2023 14:53:33 -0800 Subject: [PATCH 165/454] methods and DAXPY conversion to support same launch process for CUDA variant --- src/basic/DAXPY-Cuda.cpp | 19 +++++++++++---- src/common/CudaDataUtils.hpp | 25 +++++++++++++++++++ src/common/GPUUtils.hpp | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 79f1fde4a..eb852bf49 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -52,8 +52,11 @@ void DAXPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy<<>>( y, x, a, - iend ); + + RPlaunchCudaKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); cudaErrchk( cudaGetLastError() ); } @@ -66,10 +69,16 @@ void DAXPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { + + auto daxpy_lambda = [=] __device__ (Index_type i) { DAXPY_BODY; - }); + }; + 
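// For comparison, a sketch of the triple-chevron launch that the call below
// replaces (the Lambda template parameter is deduced from daxpy_lambda):
//
//   lambda_cuda_forall<block_size>
//       <<<grid_size, block_size, shmem, res.get_stream()>>>(
//         ibegin, iend, daxpy_lambda );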
+ RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 305937844..54b75a1ef 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -42,6 +42,31 @@ __device__ __forceinline__ unsigned long long device_timer() return global_timer; } +/*! + * \brief Method for launching a CUDA kernel with given configuration. + * + * Note: method includes a call to check whether number of args + * in kernel signature matches number of args passed to this + * method. + */ +template +void RPlaunchCudaKernel(F kernel, + const size_t& numBlocks, const size_t& dimBlocks, + std::uint32_t sharedMemBytes, cudaStream_t stream, + Args... args) +{ + constexpr size_t count = sizeof...(Args); + auto tup = std::tuple{args...}; + auto chk_tup = checkArgsCount(kernel, tup); + void* arg_arr[count]; + packArgs<0>(chk_tup, arg_arr); + + auto k = reinterpret_cast(kernel); + cudaLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream); +} + /*! * \brief Simple forall cuda kernel that runs a lambda. */ diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 8d6012a6d..763c64bc2 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -19,6 +19,53 @@ namespace rajaperf { +/*! + * \brief Routine to check whether number of args in signature of the + * given kernel matches the number of items in the given tuple. + */ +template +std::tuple checkArgsCount(void (*kernel)(Formals...), + std::tuple(actuals)) +{ + (void) kernel; // to prevent compiler warning + static_assert(sizeof...(Formals) == sizeof...(Actuals), + "Argument Count Mismatch"); + std::tuple to_formals{std::move(actuals)}; + return to_formals; +} + +/*! + * \brief Stopping case for recursive method below. + */ +template ::type* = nullptr> +void packArgs(const std::tuple&, void*) {} + +/*! + * \brief Recursive method to copy items from given tuple to an array + * of void* pointers, which is what CUDA and HIP kernel launch + * methods want for kernel arguments. + * + * Note: method contains a static assert check for whether any + * item in the given tuple is a reference type, which + * doesn't work for passing args to a GPU kernel. 
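+ * (The underlying launch APIs receive kernel arguments as an array of
+ * void* and copy each argument's bytes to the device, so a reference
+ * would smuggle a host address into device code.)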
+ */ +template ::type* = nullptr> +void packArgs(const std::tuple& formals, void** vargs) +{ + using T = typename std::tuple_element >::type; + + static_assert(!std::is_reference{}, + "A __global__ function cannot have a reference as one of its " + "arguments."); + + vargs[n] = const_cast( + reinterpret_cast(&std::get(formals)) ); + + return packArgs(formals, vargs); +} + namespace gpu_block_size { From 17e119fd81831f1c322a97206703435a83674dd6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 30 Nov 2023 14:54:30 -0800 Subject: [PATCH 166/454] Unify launch process for DAXPY HIP variants --- src/basic/DAXPY-Hip.cpp | 15 +++++++++++++++ src/common/HipDataUtils.hpp | 25 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 22f86b4d7..8782a3a07 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -53,8 +53,15 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; +#if 0 hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, iend ); +#else + RPlaunchHipKernel( (daxpy), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); +#endif hipErrchk( hipGetLastError() ); } @@ -71,8 +78,16 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; +#if 0 hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda); +#else + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_lambda ); +#endif hipErrchk( hipGetLastError() ); } diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index ad23ae5d5..0685eff91 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -29,6 +29,31 @@ namespace rajaperf { +/*! + * \brief Method for launching a HIP kernel with given configuration. + * + * Note: method includes a call to check whether number of args + * in kernel signature matches number of args passed to this + * method. + */ +template +void RPlaunchHipKernel(F kernel, + const size_t& numBlocks, const size_t& dimBlocks, + std::uint32_t sharedMemBytes, hipStream_t stream, + Args... args) +{ + constexpr size_t count = sizeof...(Args); + auto tup = std::tuple{args...}; + auto chk_tup = checkArgsCount(kernel, tup); + void* arg_arr[count]; + packArgs<0>(chk_tup, arg_arr); + + auto k = reinterpret_cast(kernel); + hipLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream); +} + /*! * \brief Simple forall hip kernel that runs a lambda. 
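 * (Typically each thread computes i = blockIdx.x * blockDim.x + threadIdx.x
 * + ibegin and invokes the lambda when i < iend.)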
*/ From 4e6dcafdd1d2b6ab6ff7231f8df4ac7d52895f48 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 30 Nov 2023 14:57:07 -0800 Subject: [PATCH 167/454] Code cleanup --- src/basic/DAXPY-Hip.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 8782a3a07..5e717d72d 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -53,15 +53,11 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -#if 0 - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); -#else + RPlaunchHipKernel( (daxpy), grid_size, block_size, shmem, res.get_stream(), y, x, a, iend ); -#endif hipErrchk( hipGetLastError() ); } @@ -78,16 +74,12 @@ void DAXPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; -#if 0 - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda); -#else + RPlaunchHipKernel( (lambda_hip_forall), grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda ); -#endif hipErrchk( hipGetLastError() ); } From af7e40c2b4ca41b487acad6394e788245cb13d7e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 30 Nov 2023 16:02:50 -0800 Subject: [PATCH 168/454] Modifications based on Jason's comments. --- src/common/CudaDataUtils.hpp | 12 +++++------- src/common/GPUUtils.hpp | 38 ++---------------------------------- src/common/HipDataUtils.hpp | 10 ++++------ 3 files changed, 11 insertions(+), 49 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 54b75a1ef..aded17cfb 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -51,17 +51,15 @@ __device__ __forceinline__ unsigned long long device_timer() */ template void RPlaunchCudaKernel(F kernel, - const size_t& numBlocks, const size_t& dimBlocks, + const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t sharedMemBytes, cudaStream_t stream, - Args... args) + Args const&... args) { constexpr size_t count = sizeof...(Args); - auto tup = std::tuple{args...}; - auto chk_tup = checkArgsCount(kernel, tup); - void* arg_arr[count]; - packArgs<0>(chk_tup, arg_arr); + checkArgsCount(kernel, args...); + void* arg_arr[count]{(void*)&args...}; - auto k = reinterpret_cast(kernel); + auto k = reinterpret_cast(kernel); cudaLaunchKernel(k, numBlocks, dimBlocks, arg_arr, sharedMemBytes, stream); diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 763c64bc2..fc0920b06 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -24,46 +24,12 @@ namespace rajaperf * given kernel matches the number of items in the given tuple. */ template -std::tuple checkArgsCount(void (*kernel)(Formals...), - std::tuple(actuals)) +void checkArgsCount(void (*kernel)(Formals...), + Actuals...) { (void) kernel; // to prevent compiler warning static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch"); - std::tuple to_formals{std::move(actuals)}; - return to_formals; -} - -/*! - * \brief Stopping case for recursive method below. - */ -template ::type* = nullptr> -void packArgs(const std::tuple&, void*) {} - -/*! - * \brief Recursive method to copy items from given tuple to an array - * of void* pointers, which is what CUDA and HIP kernel launch - * methods want for kernel arguments. 
- * - * Note: method contains a static assert check for whether any - * item in the given tuple is a reference type, which - * doesn't work for passing args to a GPU kernel. - */ -template ::type* = nullptr> -void packArgs(const std::tuple& formals, void** vargs) -{ - using T = typename std::tuple_element >::type; - - static_assert(!std::is_reference{}, - "A __global__ function cannot have a reference as one of its " - "arguments."); - - vargs[n] = const_cast( - reinterpret_cast(&std::get(formals)) ); - - return packArgs(formals, vargs); } namespace gpu_block_size diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 0685eff91..f17b016bb 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -40,15 +40,13 @@ template void RPlaunchHipKernel(F kernel, const size_t& numBlocks, const size_t& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream, - Args... args) + Args const&... args) { constexpr size_t count = sizeof...(Args); - auto tup = std::tuple{args...}; - auto chk_tup = checkArgsCount(kernel, tup); - void* arg_arr[count]; - packArgs<0>(chk_tup, arg_arr); + checkArgsCount(kernel, args...); + void* arg_arr[count]{(void*)&args...}; - auto k = reinterpret_cast(kernel); + auto k = reinterpret_cast(kernel); hipLaunchKernel(k, numBlocks, dimBlocks, arg_arr, sharedMemBytes, stream); From 2f53bdf6fc183e61562f56acdf0436d42ae8f3c7 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 30 Nov 2023 16:03:56 -0800 Subject: [PATCH 169/454] Mathc CUDA impl --- src/common/HipDataUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index f17b016bb..f14a7aae2 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -38,7 +38,7 @@ namespace rajaperf */ template void RPlaunchHipKernel(F kernel, - const size_t& numBlocks, const size_t& dimBlocks, + const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream, Args const&... args) { From 44688b84f67fbc38e6fec2d5d7f89c44c576bf8e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 1 Dec 2023 10:53:40 -0800 Subject: [PATCH 170/454] Tightening up code based on Jason's comments --- src/common/CudaDataUtils.hpp | 10 ++++++---- src/common/GPUUtils.hpp | 13 ------------- src/common/HipDataUtils.hpp | 8 +++++--- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index aded17cfb..083068b0c 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -49,14 +49,16 @@ __device__ __forceinline__ unsigned long long device_timer() * in kernel signature matches number of args passed to this * method. */ -template -void RPlaunchCudaKernel(F kernel, +template +void RPlaunchCudaKernel(void (*kernel)(KernArgs...), const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t sharedMemBytes, cudaStream_t stream, - Args const&... args) + Args const&... args) { + static_assert(sizeof...(KernArgs) == sizeof...(Args), + "Argument count mismatch between kernel and call to this method"); + constexpr size_t count = sizeof...(Args); - checkArgsCount(kernel, args...); void* arg_arr[count]{(void*)&args...}; auto k = reinterpret_cast(kernel); diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index fc0920b06..8d6012a6d 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -19,19 +19,6 @@ namespace rajaperf { -/*! 
- * \brief Routine to check whether number of args in signature of the - * given kernel matches the number of items in the given tuple. - */ -template -void checkArgsCount(void (*kernel)(Formals...), - Actuals...) -{ - (void) kernel; // to prevent compiler warning - static_assert(sizeof...(Formals) == sizeof...(Actuals), - "Argument Count Mismatch"); -} - namespace gpu_block_size { diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index f14a7aae2..76e87774e 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -36,14 +36,16 @@ namespace rajaperf * in kernel signature matches number of args passed to this * method. */ -template -void RPlaunchHipKernel(F kernel, +template +void RPlaunchHipKernel(void (*kernel)(KernArgs...), const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t sharedMemBytes, hipStream_t stream, Args const&... args) { + static_assert(sizeof...(KernArgs) == sizeof...(Args), + "Argument count mismatch between kernel and call to this method"); + constexpr size_t count = sizeof...(Args); - checkArgsCount(kernel, args...); void* arg_arr[count]{(void*)&args...}; auto k = reinterpret_cast(kernel); From 3fa4f29adae88385d08bbc732381de53f7a32d35 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 1 Dec 2023 12:18:54 -0800 Subject: [PATCH 171/454] Add kernel arg type checking --- src/common/CudaDataUtils.hpp | 14 ++++++++++---- src/common/HipDataUtils.hpp | 15 +++++++++++---- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 083068b0c..b8cad7555 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -45,9 +45,8 @@ __device__ __forceinline__ unsigned long long device_timer() /*! * \brief Method for launching a CUDA kernel with given configuration. * - * Note: method includes a call to check whether number of args - * in kernel signature matches number of args passed to this - * method. + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. */ template void RPlaunchCudaKernel(void (*kernel)(KernArgs...), @@ -56,7 +55,14 @@ void RPlaunchCudaKernel(void (*kernel)(KernArgs...), Args const&... args) { static_assert(sizeof...(KernArgs) == sizeof...(Args), - "Argument count mismatch between kernel and call to this method"); + "Number of kernel args doesn't match what's passed to method"); + + using int_array = int[]; + int_array ia = {[](){ + static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); + return 0; + }()...}; + RAJA_UNUSED_VAR(ia); constexpr size_t count = sizeof...(Args); void* arg_arr[count]{(void*)&args...}; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 76e87774e..08a0746d2 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -32,9 +32,8 @@ namespace rajaperf /*! * \brief Method for launching a HIP kernel with given configuration. * - * Note: method includes a call to check whether number of args - * in kernel signature matches number of args passed to this - * method. + * Note: method checks whether number of args and their types in + * kernel signature matches args passed to this method. */ template void RPlaunchHipKernel(void (*kernel)(KernArgs...), @@ -43,7 +42,15 @@ void RPlaunchHipKernel(void (*kernel)(KernArgs...), Args const&... 
args) { static_assert(sizeof...(KernArgs) == sizeof...(Args), - "Argument count mismatch between kernel and call to this method"); + "Number of kernel args doesn't match what's passed to method"); + + using int_array = int[]; + int_array ia = {[](){ + static_assert(std::is_same, std::decay_t>::value, + "Kernel arg types don't match what's passed to method"); + return 0; + }()...}; + RAJA_UNUSED_VAR(ia); constexpr size_t count = sizeof...(Args); void* arg_arr[count]{(void*)&args...}; From 47ab1a68afc90fda16028e75ef4bbb647ff3d339 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 1 Dec 2023 13:03:56 -0800 Subject: [PATCH 172/454] Fix gcc issue about missing argument packs --- src/common/CudaDataUtils.hpp | 2 +- src/common/HipDataUtils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index b8cad7555..2f76ae5d8 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -58,7 +58,7 @@ void RPlaunchCudaKernel(void (*kernel)(KernArgs...), "Number of kernel args doesn't match what's passed to method"); using int_array = int[]; - int_array ia = {[](){ + int_array ia = {[](...){ static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); return 0; }()...}; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 08a0746d2..0c99cddce 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -45,7 +45,7 @@ void RPlaunchHipKernel(void (*kernel)(KernArgs...), "Number of kernel args doesn't match what's passed to method"); using int_array = int[]; - int_array ia = {[](){ + int_array ia = {[](...){ static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); return 0; From 0210b6bb59e05f2e601e7654eff5900d470af7a5 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 1 Dec 2023 13:25:16 -0800 Subject: [PATCH 173/454] revert change committed accidentally --- src/common/CudaDataUtils.hpp | 3 +-- src/common/HipDataUtils.hpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 2f76ae5d8..f01a46d3d 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -57,8 +57,7 @@ void RPlaunchCudaKernel(void (*kernel)(KernArgs...), static_assert(sizeof...(KernArgs) == sizeof...(Args), "Number of kernel args doesn't match what's passed to method"); - using int_array = int[]; - int_array ia = {[](...){ + int ia[] = {[](){ static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); return 0; }()...}; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 0c99cddce..aa6710dfe 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -44,8 +44,7 @@ void RPlaunchHipKernel(void (*kernel)(KernArgs...), static_assert(sizeof...(KernArgs) == sizeof...(Args), "Number of kernel args doesn't match what's passed to method"); - using int_array = int[]; - int_array ia = {[](...){ + int ia[] = {[](){ static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); return 0; From 34e134b9a42b4781eb5f7e80c802d2dae4055901 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 1 Dec 2023 14:07:39 -0800 Subject: [PATCH 174/454] Use conjunction instead of lambda pack expansion --- src/common/CudaDataUtils.hpp | 9 +++------ src/common/HipDataUtils.hpp | 10 +++------- 
src/rajaperf_config.hpp.in | 12 ++++++++++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index f01a46d3d..de6ea1e4e 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -45,7 +45,7 @@ __device__ __forceinline__ unsigned long long device_timer() /*! * \brief Method for launching a CUDA kernel with given configuration. * - * Note: method checks whether number of args and their types in + * Note: method checks whether number of args and their types in * kernel signature matches args passed to this method. */ template @@ -57,11 +57,8 @@ void RPlaunchCudaKernel(void (*kernel)(KernArgs...), static_assert(sizeof...(KernArgs) == sizeof...(Args), "Number of kernel args doesn't match what's passed to method"); - int ia[] = {[](){ - static_assert(std::is_same, std::decay_t>::value, "Kernel arg types don't match what's passed to method"); - return 0; - }()...}; - RAJA_UNUSED_VAR(ia); + static_assert(conjunction, std::decay_t>...>::value, + "Kernel arg types don't match what's passed to method"); constexpr size_t count = sizeof...(Args); void* arg_arr[count]{(void*)&args...}; diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index aa6710dfe..20dde8f4e 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -32,7 +32,7 @@ namespace rajaperf /*! * \brief Method for launching a HIP kernel with given configuration. * - * Note: method checks whether number of args and their types in + * Note: method checks whether number of args and their types in * kernel signature matches args passed to this method. */ template @@ -44,12 +44,8 @@ void RPlaunchHipKernel(void (*kernel)(KernArgs...), static_assert(sizeof...(KernArgs) == sizeof...(Args), "Number of kernel args doesn't match what's passed to method"); - int ia[] = {[](){ - static_assert(std::is_same, std::decay_t>::value, - "Kernel arg types don't match what's passed to method"); - return 0; - }()...}; - RAJA_UNUSED_VAR(ia); + static_assert(conjunction, std::decay_t>...>::value, + "Kernel arg types don't match what's passed to method"); constexpr size_t count = sizeof...(Args); void* arg_arr[count]{(void*)&args...}; diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index d545c0b93..4ba0d0994 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -23,6 +23,7 @@ #include "RAJA/config.hpp" #include "camp/number.hpp" +#include #include #cmakedefine RAJA_PERFSUITE_ENABLE_MPI @@ -110,6 +111,17 @@ std::string machine_run; }; +#if __cplusplus < 201703L +// Implement std::conjunction from https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type {}; +template struct conjunction : B1 {}; +template +struct conjunction + : std::conditional_t, B1> {}; +#else +using std::conjunction; +#endif + } // closing brace for rajaperf namespace // Squash compiler warnings about unused variables From 344827b2a2fe6286b200d9ecb6f44c5e23bcc66c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 1 Dec 2023 16:45:45 -0800 Subject: [PATCH 175/454] increase poodle time limits to get intel classic to complete --- .gitlab/custom-jobs-and-variables.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 9c845e6a6..409574769 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -29,9 +29,9 @@ variables: # Optimization 
notes: We have 4 jobs lasting at max 5 minutes and using 28 # cores out of 112 available (see -j in scripts/gitlab/build_and_test.sh). # We allow allocation overlapping. - POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=12 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=14 --nodes=1" # Arguments for job level allocation - POODLE_JOB_ALLOC: "--overlap --time=10 --nodes=1" + POODLE_JOB_ALLOC: "--overlap --time=12 --nodes=1" # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle From 43c5e2c31f69903ba99da83922152c38ef267d5b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 09:34:31 -0800 Subject: [PATCH 176/454] Update toss4_mvapich2_icpx.sh --- scripts/lc-builds/toss4_mvapich2_icpx.sh | 27 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh index 9c8cd5b97..7b0daa339 100755 --- a/scripts/lc-builds/toss4_mvapich2_icpx.sh +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -7,10 +7,15 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -if [ "$1" == "" ]; then +if [[ $# -lt 2 ]]; then echo - echo "You must pass a compiler version number to script. For example," - echo " toss4_mvapich2_icpx.sh 2.3.7 2022.1.0" + echo "You must pass 2 or more arguments to the script (in this order): " + echo " 1) mvapich2 compiler version number" + echo " 2) icpx compiler version number" + echo " 3...) optional arguments to cmake" + echo + echo "For example: " + echo " toss4_mvapich2_icpx.sh 2022.1.0" exit fi @@ -30,15 +35,25 @@ echo rm -rf build_${BUILD_SUFFIX} 2>/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} -module load cmake/3.21.1 +module load cmake/3.23.1 + +## +# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile +# times at a potential cost of slower 'forall' execution. +## + +source /usr/tce/packages/intel/intel-${COMP_VER}/setvars.sh cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DMPI_CXX_COMPILER=/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpic++ \ + -DMPI_C_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicc" \ + -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icpx \ + -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/compiler/${COMP_VER}/linux/bin/icx \ -DBLT_CXX_STD=c++14 \ -C ${RAJA_HOSTCONFIG} \ - -DENABLE_MPI=On \ + -DENABLE_MPI=ON \ + -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ "$@" \ From ed9e1735dac86b320437770e4ff954e10ee5cdc9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 09:50:20 -0800 Subject: [PATCH 177/454] Fix BUILD_SUFFIX in build scripts Add '-' to the nvcc scripts to be consistent with the non-nvcc scripts. 
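[To make the rename concrete with illustrative version numbers (10.2.89 for nvcc, compute architecture 70, and a made-up clang 10.0.1), a build directory previously named build_lc_blueos-nvcc10.2.89-70-clang10.0.1 becomes build_lc_blueos-nvcc-10.2.89-70-clang-10.0.1, so each component is '-'-separated from its version the same way the non-nvcc scripts already do it.]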
--- scripts/lc-builds/blueos_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang_caliper.sh | 2 +- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_xl.sh | 2 +- scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index a6332fa54..526c4c763 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_CLANG_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index f36715c21..de2fb5548 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -28,7 +28,7 @@ CALI_DIR=$4 ADIAK_DIR=$5 shift 5 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index 3ca718cb2..f194e8121 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_GCC_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake echo diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index ead4d5a7c..6d30da64c 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -24,7 +24,7 @@ COMP_ARCH=$2 COMP_XL_VER=$3 shift 3 -BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-xl${COMP_XL_VER} +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-xl-${COMP_XL_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_xl_X.cmake echo diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index 957b1eb2e..c5fa74cb2 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -26,7 +26,7 @@ COMP_ARCH=$3 COMP_CLANG_VER=$4 shift 4 -BUILD_SUFFIX=lc_blueos-spectrum${COMP_MPI_VER}-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER} +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-${COMP_CLANG_VER} RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake echo From 643a3e5f04bd8fda9a54662115742410ef304c13 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 09:50:30 -0800 Subject: [PATCH 178/454] Add blueos_spectrum_nvcc_gcc.sh --- scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh new file mode 100755 index 000000000..4e1a68318 --- /dev/null +++ b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh @@ -0,0 +1,70 @@ 
+#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 4 ]]; then + echo + echo "You must pass 4 arguments to the script (in this order): " + echo " 1) compiler version number for spectrum mpi" + echo " 2) compiler version number for nvcc (number only, not 'sm_70' for example)" + echo " 3) CUDA compute architecture" + echo " 4) compiler version number for gcc. " + echo + echo "For example: " + echo " blueos_spectrum_nvcc_gcc.sh rolling-release 10.2.89 70 8.3.1" + exit +fi + +COMP_MPI_VER=$1 +COMP_NVCC_VER=$2 +COMP_ARCH=$3 +COMP_GCC_VER=$4 +shift 4 + +BUILD_SUFFIX=lc_blueos-spectrum-${COMP_MPI_VER}-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER} +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMPI_CXX_COMPILER=/usr/tce/packages/spectrum-mpi/spectrum-mpi-${COMP_MPI_VER}-gcc-${COMP_GCC_VER}/bin/mpig++ \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=On \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + "$@" \ + .. 
+ +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to run with mpi when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -n4 ./bin/raja-perf.exe" +echo +echo "***********************************************************************" From 0f693c5d4d90715aff90f9b381b995826ec8f713 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 15:49:33 -0800 Subject: [PATCH 179/454] fixup merge --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 5f5f70412..2cfb3fc8c 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -586,7 +586,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("--omptarget-reduction-data-space") || opt == std::string("--cuda-reduction-data-space") || opt == std::string("--hip-reduction-data-space") || - opt == std::string("--kokkos-reduction-data-space") + opt == std::string("--kokkos-reduction-data-space") || opt == std::string("--seq-mpi-data-space") || opt == std::string("--omp-mpi-data-space") || opt == std::string("--omptarget-mpi-data-space") || From 06d35c084a8d65499d56e54ae0c7355999c6a85c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 15:47:10 -0800 Subject: [PATCH 180/454] Fixup MPI data space help message --- src/common/RunParams.cpp | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 2cfb3fc8c..fb839b470 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -159,6 +159,13 @@ void RunParams::print(std::ostream& str) const str << "\n hip reduction data space = " << getDataSpaceName(hipReductionDataSpace); str << "\n kokkos reduction data space = " << getDataSpaceName(kokkosReductionDataSpace); + str << "\n seq MPI data space = " << getDataSpaceName(seqMPIDataSpace); + str << "\n omp MPI data space = " << getDataSpaceName(ompMPIDataSpace); + str << "\n omp target MPI data space = " << getDataSpaceName(ompTargetMPIDataSpace); + str << "\n cuda MPI data space = " << getDataSpaceName(cudaMPIDataSpace); + str << "\n hip MPI data space = " << getDataSpaceName(hipMPIDataSpace); + str << "\n kokkos MPI data space = " << getDataSpaceName(kokkosMPIDataSpace); + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; @@ -1152,41 +1159,35 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; - str << "\t --seq-mpi-data-space, -smpids [Default is Host]\n" + str << "\t --seq-mpi-data-space [Default is Host]\n" << "\t (name of data space to use with MPI and sequential execution)\n"; str << "\t\t Examples...\n" - << "\t\t --seq-mpi-data-space Host (run sequential variants with Host memory for MPI buffers)\n" - << "\t\t -smpids Copy (run sequential variants and copy to Host memory for MPI buffers)\n\n"; + << "\t\t --seq-mpi-data-space Host (run sequential variants with Host memory for MPI buffers)\n\n"; - str << "\t --omp-mpi-data-space, -ompids [Default is Omp]\n" + str << "\t --omp-mpi-data-space [Default is Omp]\n" << "\t (name of data space to use with MPI and OpenMP 
execution)\n"; str << "\t\t Examples...\n" - << "\t\t --omp-mpi-data-space Omp (run Omp variants with Omp memory for MPI buffers)\n" - << "\t\t -ompids Host (run Omp variants with Host memory for MPI buffers)\n\n"; + << "\t\t --omp-mpi-data-space Omp (run Omp variants with Omp memory for MPI buffers)\n\n"; - str << "\t --omptarget-mpi-data-space, -otmpids [Default is Copy]\n" + str << "\t --omptarget-mpi-data-space [Default is Copy]\n" << "\t (name of data space to use with MPI and OpenMP target execution)\n"; str << "\t\t Examples...\n" - << "\t\t --omptarget-mpi-data-space Copy (run Omp Target variants and copy to Host memory for MPI buffers)\n" - << "\t\t -otmpids OmpTarget (run Omp Target variants with OmpTarget memory for MPI buffers (assumes MPI can access OmpTarget memory))\n\n"; + << "\t\t --omptarget-mpi-data-space Copy (run Omp Target variants and copy to Host memory for MPI buffers)\n\n"; - str << "\t --cuda-mpi-data-space, -cmpids [Default is CudaPinned]\n" + str << "\t --cuda-mpi-data-space [Default is CudaPinned]\n" << "\t (name of data space to use with MPI and cuda execution)\n"; str << "\t\t Examples...\n" - << "\t\t --cuda-mpi-data-space CudaPinned (run CUDA variants with Cuda Pinned memory for MPI buffers)\n" - << "\t\t -cmpids CudaDevice (run CUDA variants with Cuda Device memory for MPI buffers (assumes MPI is cuda/gpu aware))\n\n"; + << "\t\t --cuda-mpi-data-space CudaPinned (run CUDA variants with Cuda Pinned memory for MPI buffers)\n\n"; - str << "\t --hip-mpi-data-space, -hmpids [Default is HipPinned]\n" + str << "\t --hip-mpi-data-space [Default is HipPinned]\n" << "\t (name of data space to use with MPI and hip execution)\n"; str << "\t\t Examples...\n" - << "\t\t --hip-mpi-data-space Copy (run HIP variants and copy to Host memory for MPI buffers)\n" - << "\t\t -hmpids hipDevice (run HIP variants with Hip Device memory for MPI buffers (assumes MPI is hip/gpu aware))\n\n"; + << "\t\t --hip-mpi-data-space Copy (run HIP variants and copy to Host memory for MPI buffers)\n\n"; - str << "\t --kokkos-mpi-data-space, -kmpids [Default is Copy]\n" + str << "\t --kokkos-mpi-data-space [Default is Copy]\n" << "\t (name of data space to use with MPI and kokkos execution)\n"; str << "\t\t Examples...\n" - << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n" - << "\t\t -kmpids HipPinned (run KOKKOS variants with Hip Pinned memory for MPI buffers)\n\n"; + << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n\n"; #if defined(RAJA_PERFSUITE_USE_CALIPER) str << "\t --add-to-spot-config, -atsc [Default is none]\n" From 27d299b6005861ba24c3317370381289385176ed Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 15:47:23 -0800 Subject: [PATCH 181/454] Add reduction data space help message --- src/common/RunParams.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index fb839b470..7b3dcef2e 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1159,6 +1159,42 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + str << "\t --seq-reduction-data-space [Default is Host]\n" + << "\t (name of data space to use with reductions for sequential variants)\n" + << "\t Valid data space names are 'Host' or 
'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --seq-reduction-data-space Host (run sequential variants with Host memory)\n\n"; + + str << "\t --omp-reduction-data-space [Default is Omp]\n" + << "\t (names of data space to use with reductions for OpenMP variants)\n" + << "\t Valid data space names are 'Host' or 'Omp'\n"; + str << "\t\t Examples...\n" + << "\t\t --omp-reduction-data-space Omp (run Omp variants with Omp memory)\n\n"; + + str << "\t --omptarget-reduction-data-space [Default is OmpTarget]\n" + << "\t (names of data space to use with reductions for OpenMP Target variants)\n" + << "\t Valid data space names are 'OmpTarget' or 'CudaPinned'\n"; + str << "\t\t Examples...\n" + << "\t\t --omptarget-reduction-data-space OmpTarget (run Omp Target variants with Omp Target memory)\n\n"; + + str << "\t --cuda-reduction-data-space [Default is CudaManagedDevicePreferredHostAccessed]\n" + << "\t (names of data space to use with reductions for CUDA variants)\n" + << "\t Valid data space names are 'CudaDevice', 'CudaPinned', or 'CudaManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --cuda-reduction-data-space CudaManaged (run CUDA variants with Cuda Managed memory)\n\n"; + + str << "\t --hip-reduction-data-space [Default is HipDevice]\n" + << "\t (names of data space to use with reductions for HIP variants)\n" + << "\t Valid data space names are 'HipDevice', 'HipPinned', or 'HipManaged'\n"; + str << "\t\t Examples...\n" + << "\t\t --hip-reduction-data-space HipManaged (run HIP variants with Hip Managed memory)\n\n"; + + str << "\t --kokkos-reduction-data-space [Default is Host]\n" + << "\t (names of data space to use with reductions)\n"; + str << "\t\t Examples...\n" + << "\t\t --kokkos-data-space Host (run KOKKOS variants with Host memory)\n" + << "\t\t -kds HipPinned (run KOKKOS variants with Hip Pinned memory)\n\n"; + str << "\t --seq-mpi-data-space [Default is Host]\n" << "\t (name of data space to use with MPI and sequential execution)\n"; str << "\t\t Examples...\n" From de86749e8d2131eba867cb495cb7677878067261 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 16:02:37 -0800 Subject: [PATCH 182/454] Sync mpi amdclang script --- scripts/lc-builds/toss4_cray-mpich_amdclang.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 4e83671f0..36c4353b7 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -52,6 +52,12 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i echo "Configuration extra arguments:" echo " $@" echo +echo "To get cmake to work you may have to configure with" +echo " -DHIP_PLATFORM=amd" +echo +echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" +echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" +echo rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} From e5d283bb13e9d3e428478fd8334ea362b399228e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 4 Dec 2023 16:18:11 -0800 Subject: [PATCH 183/454] Let cmake/BLT know when test uses MPI --- CMakeLists.txt | 6 ++++++ test/CMakeLists.txt | 21 +++++++++++++++------ tpl/RAJA | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812d339b0..146158e26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,12 @@ if (ENABLE_TESTS) endif() 
cmake_dependent_option(RAJA_PERFSUITE_ENABLE_MPI "Build with MPI" On "ENABLE_MPI" Off) +if (RAJA_PERFSUITE_ENABLE_MPI) +set(RAJA_PERFSUITE_NUM_MPI_TASKS 4 CACHE STRING "Number of MPI tasks in tests") +else() +set(RAJA_PERFSUITE_NUM_MPI_TASKS 0 CACHE INTERNAL "Number of MPI tasks in tests") +endif() +message(STATUS "Using RAJA_PERFSUITE_NUM_MPI_TASKS: ${RAJA_PERFSUITE_NUM_MPI_TASKS}") cmake_dependent_option(RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN "Build OpenMP scan variants" Off "ENABLE_OPENMP" Off) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c2d21d81d..38329f98f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,11 +16,20 @@ set(RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS algorithm comm) list(APPEND RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) - -raja_add_test( - NAME test-raja-perf-suite - SOURCES test-raja-perf-suite.cpp - DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} - ) + +if (RAJA_PERFSUITE_ENABLE_MPI) + raja_add_test( + NAME test-raja-perf-suite + SOURCES test-raja-perf-suite.cpp + DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} + NUM_MPI_TASKS ${RAJA_PERFSUITE_NUM_MPI_TASKS} + ) +else() + raja_add_test( + NAME test-raja-perf-suite + SOURCES test-raja-perf-suite.cpp + DEPENDS_ON ${RAJA_PERFSUITE_TEST_EXECUTABLE_DEPENDS} + ) +endif() target_include_directories(test-raja-perf-suite.exe PRIVATE ${PROJECT_SOURCE_DIR}/src) diff --git a/tpl/RAJA b/tpl/RAJA index e00f05675..668476510 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit e00f05675b7e633c8bfdde583e25efd3a50bf267 +Subproject commit 668476510d61b0f58ac71ed0c8c54de601c8355c From b8bd8dd10dbd27785d8d93c59dbc0d184aa1efa7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Dec 2023 08:13:44 -0800 Subject: [PATCH 184/454] Add more GPU macros This is needed because the use case in FIRST_MIN needs to initialize multiple values with a single value and needs to do its own finalization and output --- src/common/GPUUtils.hpp | 47 +++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 003efd858..ecadbe7ca 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -181,37 +181,52 @@ inline void seq_for(camp::int_seq const&, Func&& func) } // -#define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \ +#define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \ deallocData(reduction_data_space, device_ptr_name); \ if (reduction_data_space != host_data_space) { \ deallocData(host_data_space, host_ptr_name); \ } // +#define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length) \ + if (device_ptr_name != host_ptr_name) { \ + for (size_t i = 0; i < static_cast(length); ++i) { \ + host_ptr_name[i] = (init_value); \ + } \ + gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \ + (length)*sizeof(device_ptr_name[0]), \ + gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \ + } else { \ + for (size_t i = 0; i < static_cast(length); ++i) { \ + device_ptr_name[i] = (init_value); \ + } \ + } #define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length) \ if (device_ptr_name != host_ptr_name) { \ - for (int i = 0; i < (length); ++i) { \ - host_ptr_name[i] = (init_ptr)[i]; \ + for (size_t i = 0; i < static_cast(length); ++i) { \ + host_ptr_name[i] = (init_ptr)[i]; \ } \ gpu_type##Errchk( gpu_type##MemcpyAsync( 
device_ptr_name, host_ptr_name, \ (length)*sizeof(device_ptr_name[0]), \ gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \ } else { \ - for (int i = 0; i < (length); ++i) { \ - device_ptr_name[i] = (init_ptr)[i]; \ + for (size_t i = 0; i < static_cast(length); ++i) { \ + device_ptr_name[i] = (init_ptr)[i]; \ } \ } // -#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, final_ptr, device_ptr_name, host_ptr_name, length) \ +#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length) \ if (device_ptr_name != host_ptr_name) { \ gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \ (length)*sizeof(device_ptr_name[0]), \ gpu_type##MemcpyDeviceToHost, res.get_stream() ) ); \ } \ - gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); \ - for (int i = 0; i < (length); ++i) { \ - (final_ptr)[i] = host_ptr_name[i]; \ + gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); + +#define RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) \ + for (size_t i = 0; i < static_cast(length); ++i) { \ + (final_ptr)[i] = host_ptr_name[i]; \ } @@ -219,19 +234,29 @@ inline void seq_for(camp::int_seq const&, Func&& func) RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) #define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length) #define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length) \ RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length) #define RAJAPERF_CUDA_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, final_ptr, device_ptr_name, host_ptr_name, length) + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) #define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) #define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) +#define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length) #define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length) \ RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(device_ptr_name, host_ptr_name, length) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length) #define RAJAPERF_HIP_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, final_ptr, device_ptr_name, host_ptr_name, length) + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length) \ + 
RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) // #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \ From 0af86633422cc8c076d26270173ef16019204335 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Dec 2023 08:24:10 -0800 Subject: [PATCH 185/454] Use GPU reducer macros in FIRST_MIN --- src/lcals/FIRST_MIN-Cuda.cpp | 31 ++++++++----------------------- src/lcals/FIRST_MIN-Hip.cpp | 31 ++++++++----------------------- 2 files changed, 16 insertions(+), 46 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 0efac6950..1c5da48e7 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -72,27 +72,21 @@ void FIRST_MIN::runCudaVariantBlock(VariantID vid) if ( vid == Base_CUDA ) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value - MyMinLoc* dminloc; - cudaErrchk( cudaMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); constexpr size_t shmem = sizeof(MyMinLoc)*block_size; first_min<<>>(x, dminloc, mymin, iend); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - + RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -103,8 +97,7 @@ void FIRST_MIN::runCudaVariantBlock(VariantID vid) } stopTimer(); - cudaErrchk( cudaFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block); } else if ( vid == RAJA_CUDA ) { @@ -149,26 +142,19 @@ void FIRST_MIN::runCudaVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value - - MyMinLoc* dminloc; - cudaErrchk( cudaMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); first_min<<>>(x, dminloc, mymin, iend); cudaErrchk( cudaGetLastError() ); - cudaErrchk( cudaMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - cudaMemcpyDeviceToHost, res.get_stream() ) ); - cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); - + RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -179,8 +165,7 @@ void FIRST_MIN::runCudaVariantOccGS(VariantID vid) } stopTimer(); - cudaErrchk( cudaFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block); } else if ( vid == RAJA_CUDA ) { diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index a9fb41c74..991ef8f77 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ 
b/src/lcals/FIRST_MIN-Hip.cpp @@ -72,16 +72,14 @@ void FIRST_MIN::runHipVariantBlock(VariantID vid) if ( vid == Base_HIP ) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value - MyMinLoc* dminloc; - hipErrchk( hipMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); constexpr size_t shmem = sizeof(MyMinLoc)*block_size; hipLaunchKernelGGL( (first_min), grid_size, block_size, @@ -91,11 +89,7 @@ void FIRST_MIN::runHipVariantBlock(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - + RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -106,8 +100,7 @@ void FIRST_MIN::runHipVariantBlock(VariantID vid) } stopTimer(); - hipErrchk( hipFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); } else if ( vid == RAJA_HIP ) { @@ -152,16 +145,13 @@ void FIRST_MIN::runHipVariantOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - MyMinLoc* mymin_block = new MyMinLoc[grid_size]; //per-block min value - - MyMinLoc* dminloc; - hipErrchk( hipMalloc( (void**)&dminloc, - grid_size * sizeof(MyMinLoc) ) ); + RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; + RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); hipLaunchKernelGGL( (first_min), grid_size, block_size, shmem, res.get_stream(), x, @@ -170,11 +160,7 @@ void FIRST_MIN::runHipVariantOccGS(VariantID vid) iend ); hipErrchk( hipGetLastError() ); - hipErrchk( hipMemcpyAsync( mymin_block, dminloc, - grid_size * sizeof(MyMinLoc), - hipMemcpyDeviceToHost, res.get_stream() ) ); - hipErrchk( hipStreamSynchronize( res.get_stream() ) ); - + RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; @@ -185,8 +171,7 @@ void FIRST_MIN::runHipVariantOccGS(VariantID vid) } stopTimer(); - hipErrchk( hipFree( dminloc ) ); - delete[] mymin_block; + RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); } else if ( vid == RAJA_HIP ) { From 218ba419687421bdbb36b650014ba5f1a1e75eab Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Dec 2023 08:39:25 -0800 Subject: [PATCH 186/454] Split GPU FIRST_MIN into comparable tunings --- src/lcals/FIRST_MIN-Cuda.cpp | 104 +++++++++++++++++++++++++++++------ src/lcals/FIRST_MIN-Hip.cpp | 103 ++++++++++++++++++++++++++++------ src/lcals/FIRST_MIN.hpp | 16 ++++-- 3 files changed, 185 insertions(+), 38 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 1c5da48e7..e50c4c7a8 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -59,10 +59,10 @@ __global__ void 
first_min(Real_ptr x,

 template < size_t block_size >
-void FIRST_MIN::runCudaVariantBlock(VariantID vid)
+void FIRST_MIN::runCudaVariantBlockHost(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
+  // const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();

   auto res{getCudaResource()};
@@ -99,7 +99,23 @@ void FIRST_MIN::runCudaVariantBlock(VariantID vid)

     RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block);

-  } else if ( vid == RAJA_CUDA ) {
+  } else {
+    getCout() << "\n  FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size >
+void FIRST_MIN::runCudaVariantBlockDevice(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  FIRST_MIN_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -123,10 +139,10 @@
 }

 template < size_t block_size >
-void FIRST_MIN::runCudaVariantOccGS(VariantID vid)
+void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
+  // const Index_type ibegin = 0;
   const Index_type iend = getActualProblemSize();

   auto res{getCudaResource()};
@@ -167,7 +183,23 @@ void FIRST_MIN::runCudaVariantOccGS(VariantID vid)

     RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block);

-  } else if ( vid == RAJA_CUDA ) {
+  } else {
+    getCout() << "\n  FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+template < size_t block_size >
+void FIRST_MIN::runCudaVariantBlockDeviceOccGS(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  FIRST_MIN_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {

     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -201,23 +233,49 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx)
     if (run_params.numValidGPUBlockSize() == 0u ||
         run_params.validGPUBlockSize(block_size)) {

-      if (tune_idx == t) {
+      if ( vid == Base_CUDA ) {

-        setBlockSize(block_size);
-        runCudaVariantBlock(vid);
+        if (tune_idx == t) {

-      }
+          setBlockSize(block_size);
+          runCudaVariantBlockHost(vid);
+
+        }

-      t += 1;
+        t += 1;

-      if (tune_idx == t) {
+        if (tune_idx == t) {

-        setBlockSize(block_size);
-        runCudaVariantOccGS(vid);
+          setBlockSize(block_size);
+          runCudaVariantBlockHostOccGS(vid);
+
+        }
+
+        t += 1;

       }

-      t += 1;
+      if ( vid == RAJA_CUDA ) {
+
+        if (tune_idx == t) {
+
+          setBlockSize(block_size);
+          runCudaVariantBlockDevice(vid);
+
+        }
+
+        t += 1;
+
+        if (tune_idx == t) {
+
+          setBlockSize(block_size);
+          runCudaVariantBlockDeviceOccGS(vid);
+
+        }
+
+        t += 1;
+
+      }

     }
@@ -240,9 +298,21 @@ void FIRST_MIN::setCudaTuningDefinitions(VariantID vid)
     if (run_params.numValidGPUBlockSize() == 0u ||
         run_params.validGPUBlockSize(block_size)) {

-      addVariantTuningName(vid, "block_"+std::to_string(block_size));
+      if ( vid == Base_CUDA ) {
+
+        addVariantTuningName(vid, "blkhst"+std::to_string(block_size));
+
+        addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size));

-      addVariantTuningName(vid, "occgs_"+std::to_string(block_size));
+      }
+
+      if ( vid == RAJA_CUDA ) {
+
+        addVariantTuningName(vid, "blkdev"+std::to_string(block_size));
+
+        addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size));
+
+      }

     }

diff --git
a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 991ef8f77..9724e6875 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -59,10 +59,10 @@ __global__ void first_min(Real_ptr x, template < size_t block_size > -void FIRST_MIN::runHipVariantBlock(VariantID vid) +void FIRST_MIN::runHipVariantBlockHost(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; + // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -102,7 +102,23 @@ void FIRST_MIN::runHipVariantBlock(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void FIRST_MIN::runHipVariantBlockDevice(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -126,10 +142,10 @@ void FIRST_MIN::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void FIRST_MIN::runHipVariantOccGS(VariantID vid) +void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; + // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -173,7 +189,23 @@ void FIRST_MIN::runHipVariantOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); - } else if ( vid == RAJA_HIP ) { + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void FIRST_MIN::runHipVariantBlockDeviceOccGS(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -207,24 +239,49 @@ void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + if ( vid == Base_HIP ) { - setBlockSize(block_size); - runHipVariantBlock(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantBlockHost(vid); + + } - t += 1; + t += 1; - if (tune_idx == t) { + if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantOccGS(vid); + setBlockSize(block_size); + runHipVariantBlockHostOccGS(vid); + + } + + t += 1; } - t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockDevice(vid); + + } + + t += 1; + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantBlockDeviceOccGS(vid); + + } + + t += 1; + + } } }); @@ -246,9 +303,21 @@ void FIRST_MIN::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + if ( vid == Base_HIP ) { + + addVariantTuningName(vid, "blkhst"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size)); - addVariantTuningName(vid, "occgs_"+std::to_string(block_size)); + } + + if ( vid == RAJA_HIP ) { + 
+ addVariantTuningName(vid, "blkdev"+std::to_string(block_size)); + + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + + } } diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 1431dad62..28a7e760e 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -84,13 +84,21 @@ class FIRST_MIN : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockHost(VariantID vid); template < size_t block_size > - void runCudaVariantOccGS(VariantID vid); + void runHipVariantBlockHost(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantOccGS(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockHostOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockHostOccGS(VariantID vid); + template < size_t block_size > + void runCudaVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size > + void runHipVariantBlockDeviceOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 10d9e73e7ea11b197cfed3fcaeb9a0c66941337f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 5 Dec 2023 08:46:01 -0800 Subject: [PATCH 187/454] Rename other Reducer tunings from block to blkdev This was used in FIRST_MIN to differentiate when we finalize the per block values on the host or the device --- src/algorithm/REDUCE_SUM-Cuda.cpp | 12 ++++++------ src/algorithm/REDUCE_SUM-Hip.cpp | 12 ++++++------ src/algorithm/REDUCE_SUM.hpp | 8 ++++---- src/basic/PI_REDUCE-Cuda.cpp | 12 ++++++------ src/basic/PI_REDUCE-Hip.cpp | 12 ++++++------ src/basic/PI_REDUCE.hpp | 8 ++++---- src/basic/REDUCE3_INT-Cuda.cpp | 12 ++++++------ src/basic/REDUCE3_INT-Hip.cpp | 12 ++++++------ src/basic/REDUCE3_INT.hpp | 8 ++++---- src/basic/REDUCE_STRUCT-Cuda.cpp | 12 ++++++------ src/basic/REDUCE_STRUCT-Hip.cpp | 12 ++++++------ src/basic/REDUCE_STRUCT.hpp | 8 ++++---- src/basic/TRAP_INT-Cuda.cpp | 12 ++++++------ src/basic/TRAP_INT-Hip.cpp | 12 ++++++------ src/basic/TRAP_INT.hpp | 8 ++++---- src/stream/DOT-Cuda.cpp | 12 ++++++------ src/stream/DOT-Hip.cpp | 12 ++++++------ src/stream/DOT.hpp | 8 ++++---- 18 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 07b2c1c81..2a8bf57ee 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -251,7 +251,7 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlock(VariantID vid) +void REDUCE_SUM::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -287,7 +287,7 @@ void REDUCE_SUM::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlockOccGS(VariantID vid) +void REDUCE_SUM::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -368,7 +368,7 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -377,7 +377,7 @@ void REDUCE_SUM::runCudaVariant(VariantID 
vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -417,9 +417,9 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } } diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index b148979e9..a3520ddd9 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -276,7 +276,7 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runHipVariantBlock(VariantID vid) +void REDUCE_SUM::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -312,7 +312,7 @@ void REDUCE_SUM::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_SUM::runHipVariantBlockOccGS(VariantID vid) +void REDUCE_SUM::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -393,7 +393,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -402,7 +402,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlockOccGS(vid); + runHipVariantBlockDeviceOccGS(vid); } @@ -446,9 +446,9 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 63a77898d..4a9db2831 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -72,13 +72,13 @@ class REDUCE_SUM : public KernelBase template < size_t block_size > void runHipVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); template < size_t block_size > - void runCudaVariantBlockOccGS(VariantID vid); + void runCudaVariantBlockDeviceOccGS(VariantID vid); template < size_t block_size > - void runHipVariantBlockOccGS(VariantID vid); + void runHipVariantBlockDeviceOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 0442131ef..df7ef4e11 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -178,7 +178,7 @@ void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) template < size_t block_size > -void PI_REDUCE::runCudaVariantBlock(VariantID vid) +void PI_REDUCE::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -211,7 +211,7 @@ void PI_REDUCE::runCudaVariantBlock(VariantID vid) } template < size_t block_size > 
-void PI_REDUCE::runCudaVariantBlockOccGS(VariantID vid) +void PI_REDUCE::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -277,7 +277,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -286,7 +286,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -321,9 +321,9 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 62464702b..095ca8e16 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -176,7 +176,7 @@ void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) template < size_t block_size > -void PI_REDUCE::runHipVariantBlock(VariantID vid) +void PI_REDUCE::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -209,7 +209,7 @@ void PI_REDUCE::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void PI_REDUCE::runHipVariantBlockOccGS(VariantID vid) +void PI_REDUCE::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -275,7 +275,7 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -284,7 +284,7 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlockOccGS(vid); + runHipVariantBlockDeviceOccGS(vid); } @@ -319,9 +319,9 @@ void PI_REDUCE::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 303d0b6fd..2f993f8e7 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -68,13 +68,13 @@ class PI_REDUCE : public KernelBase template < size_t block_size > void runHipVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); template < size_t block_size > - void runCudaVariantBlockOccGS(VariantID vid); + void runCudaVariantBlockDeviceOccGS(VariantID vid); template < size_t block_size > - void runHipVariantBlockOccGS(VariantID vid); + void runHipVariantBlockDeviceOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 1089bd36a..0272af043 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp 
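// ---------------------------------------------------------------------------
// A hedged sketch of what the "blkhst"/"blkdev" split in this series means:
// each GPU block reduces its slice to one partial value, and those per-block
// partials are finalized either on the host (after a copy back) or on the
// device. Everything below (names, launch shape) is illustrative only and is
// not the suite's actual kernel code.
#include <cuda_runtime.h>
#include <numeric>
#include <vector>

template <size_t block_size>
__global__ void block_partial_sums(const double* x, double* block_sums, int n)
{
  __shared__ double s[block_size];
  const int i = static_cast<int>(blockIdx.x * block_size + threadIdx.x);
  s[threadIdx.x] = (i < n) ? x[i] : 0.0;
  __syncthreads();
  // Tree-reduce the block's values in shared memory.
  for (unsigned stride = block_size / 2; stride > 0; stride /= 2) {
    if (threadIdx.x < stride) { s[threadIdx.x] += s[threadIdx.x + stride]; }
    __syncthreads();
  }
  if (threadIdx.x == 0) { block_sums[blockIdx.x] = s[0]; }
}

// "blkhst"-style finalization: copy the partials back and combine on the host.
double finalize_on_host(const double* d_block_sums, int grid_size)
{
  std::vector<double> h(grid_size);
  cudaMemcpy(h.data(), d_block_sums, grid_size * sizeof(double),
             cudaMemcpyDeviceToHost);
  return std::accumulate(h.begin(), h.end(), 0.0);
}

// "blkdev"-style finalization: combine the partials on the device instead
// (shown here as a trivial second kernel; the suite folds this into the
// reduction kernel itself) and copy back a single value.
__global__ void finalize_on_device(const double* block_sums, double* out, int m)
{
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    double sum = 0.0;
    for (int b = 0; b < m; ++b) { sum += block_sums[b]; }
    *out = sum;
  }
}
// ---------------------------------------------------------------------------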
@@ -208,7 +208,7 @@ void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlock(VariantID vid) +void REDUCE3_INT::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -245,7 +245,7 @@ void REDUCE3_INT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlockOccGS(VariantID vid) +void REDUCE3_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -315,7 +315,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -324,7 +324,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -359,9 +359,9 @@ void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 98f82e13e..73726baef 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -209,7 +209,7 @@ void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runHipVariantBlock(VariantID vid) +void REDUCE3_INT::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -246,7 +246,7 @@ void REDUCE3_INT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE3_INT::runHipVariantBlockOccGS(VariantID vid) +void REDUCE3_INT::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -316,7 +316,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -325,7 +325,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlockOccGS(vid); + runHipVariantBlockDeviceOccGS(vid); } @@ -360,9 +360,9 @@ void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index c3fccd588..bced8f059 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -83,13 +83,13 @@ class REDUCE3_INT : public KernelBase template < size_t block_size > void runHipVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); template < size_t block_size > - void 
runCudaVariantBlockOccGS(VariantID vid); + void runCudaVariantBlockDeviceOccGS(VariantID vid); template < size_t block_size > - void runHipVariantBlockOccGS(VariantID vid); + void runHipVariantBlockDeviceOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index ad7db4622..d21258647 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -263,7 +263,7 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) +void REDUCE_STRUCT::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -308,7 +308,7 @@ void REDUCE_STRUCT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlockOccGS(VariantID vid) +void REDUCE_STRUCT::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -386,7 +386,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -395,7 +395,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -430,9 +430,9 @@ void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index e104fcef5..489c60411 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -266,7 +266,7 @@ void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) +void REDUCE_STRUCT::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -310,7 +310,7 @@ void REDUCE_STRUCT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlockOccGS(VariantID vid) +void REDUCE_STRUCT::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -388,7 +388,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -397,7 +397,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlockOccGS(vid); + runHipVariantBlockDeviceOccGS(vid); } @@ -432,9 +432,9 @@ void REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 
2bfa10412..ef70a50ca 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -98,13 +98,13 @@ class REDUCE_STRUCT : public KernelBase template < size_t block_size > void runHipVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); template < size_t block_size > - void runCudaVariantBlockOccGS(VariantID vid); + void runCudaVariantBlockDeviceOccGS(VariantID vid); template < size_t block_size > - void runHipVariantBlockOccGS(VariantID vid); + void runHipVariantBlockDeviceOccGS(VariantID vid); struct PointsType { Int_type N; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 6b3ded2bc..5bafd38e1 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -200,7 +200,7 @@ void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void TRAP_INT::runCudaVariantBlock(VariantID vid) +void TRAP_INT::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -233,7 +233,7 @@ void TRAP_INT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void TRAP_INT::runCudaVariantBlockOccGS(VariantID vid) +void TRAP_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -299,7 +299,7 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -308,7 +308,7 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -343,9 +343,9 @@ void TRAP_INT::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index e79c0d5ce..fe6d5a5f9 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -199,7 +199,7 @@ void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void TRAP_INT::runHipVariantBlock(VariantID vid) +void TRAP_INT::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -232,7 +232,7 @@ void TRAP_INT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void TRAP_INT::runHipVariantBlockOccGS(VariantID vid) +void TRAP_INT::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -298,7 +298,7 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -307,7 +307,7 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlockOccGS(vid); + runHipVariantBlockDeviceOccGS(vid); } @@ -342,9 +342,9 @@ void TRAP_INT::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP 
) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 1a77131f5..107ed67d8 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -80,13 +80,13 @@ class TRAP_INT : public KernelBase template < size_t block_size > void runHipVariantBlockAtomicOccGS(VariantID vid); template < size_t block_size > - void runCudaVariantBlock(VariantID vid); + void runCudaVariantBlockDevice(VariantID vid); template < size_t block_size > - void runHipVariantBlock(VariantID vid); + void runHipVariantBlockDevice(VariantID vid); template < size_t block_size > - void runCudaVariantBlockOccGS(VariantID vid); + void runCudaVariantBlockDeviceOccGS(VariantID vid); template < size_t block_size > - void runHipVariantBlockOccGS(VariantID vid); + void runHipVariantBlockDeviceOccGS(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 7398b58ff..61157d22e 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -172,7 +172,7 @@ void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void DOT::runCudaVariantBlock(VariantID vid) +void DOT::runCudaVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -205,7 +205,7 @@ void DOT::runCudaVariantBlock(VariantID vid) } template < size_t block_size > -void DOT::runCudaVariantBlockOccGS(VariantID vid) +void DOT::runCudaVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -271,7 +271,7 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlock(vid); + runCudaVariantBlockDevice(vid); } @@ -280,7 +280,7 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantBlockOccGS(vid); + runCudaVariantBlockDeviceOccGS(vid); } @@ -315,9 +315,9 @@ void DOT::setCudaTuningDefinitions(VariantID vid) if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); - addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index db68e7685..124fcd5c2 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -174,7 +174,7 @@ void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) } template < size_t block_size > -void DOT::runHipVariantBlock(VariantID vid) +void DOT::runHipVariantBlockDevice(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -207,7 +207,7 @@ void DOT::runHipVariantBlock(VariantID vid) } template < size_t block_size > -void DOT::runHipVariantBlockOccGS(VariantID vid) +void DOT::runHipVariantBlockDeviceOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -273,7 +273,7 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantBlock(vid); + runHipVariantBlockDevice(vid); } @@ -282,7 +282,7 @@ void 
DOT::runHipVariant(VariantID vid, size_t tune_idx)
       if (tune_idx == t) {

         setBlockSize(block_size);
-        runHipVariantBlockOccGS(vid);
+        runHipVariantBlockDeviceOccGS(vid);

       }

@@ -317,9 +317,9 @@ void DOT::setHipTuningDefinitions(VariantID vid)

   if ( vid == RAJA_HIP ) {

-      addVariantTuningName(vid, "block_"+std::to_string(block_size));
+      addVariantTuningName(vid, "blkdev_"+std::to_string(block_size));

-      addVariantTuningName(vid, "block_occgs_"+std::to_string(block_size));
+      addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size));

   }

diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp
index 3247b1f5a..50391939b 100644
--- a/src/stream/DOT.hpp
+++ b/src/stream/DOT.hpp
@@ -64,13 +64,13 @@ class DOT : public KernelBase
   template < size_t block_size >
   void runHipVariantBlockAtomicOccGS(VariantID vid);
   template < size_t block_size >
-  void runCudaVariantBlock(VariantID vid);
+  void runCudaVariantBlockDevice(VariantID vid);
   template < size_t block_size >
-  void runHipVariantBlock(VariantID vid);
+  void runHipVariantBlockDevice(VariantID vid);
   template < size_t block_size >
-  void runCudaVariantBlockOccGS(VariantID vid);
+  void runCudaVariantBlockDeviceOccGS(VariantID vid);
   template < size_t block_size >
-  void runHipVariantBlockOccGS(VariantID vid);
+  void runHipVariantBlockDeviceOccGS(VariantID vid);

 private:
   static const size_t default_gpu_block_size = 256;

From 2528677920bcb4592421a16b1f1d39a2ecf59f21 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Tue, 5 Dec 2023 09:18:56 -0800
Subject: [PATCH 188/454] Add comments to the GPU reduction macros

---
 src/common/GPUUtils.hpp | 34 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index ecadbe7ca..2fb3c41ec 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -168,7 +168,10 @@ inline void seq_for(camp::int_seq<IdxType, Is...> const&, Func&& func)

 } // closing brace for rajaperf namespace

-//
+// Allocate a pointer of pointer_type with the given length.
+// device_ptr_name gets memory in the reduction data space for the current variant.
+// host_ptr_name is set to device_ptr_name if the reduction data space is
+// host accessible, or to a new allocation in a host accessible data space otherwise.
 #define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) \
   DataSpace reduction_data_space = getReductionDataSpace(vid); \
   DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \
   \
   pointer_type device_ptr_name; \
   allocData(reduction_data_space, device_ptr_name, (length)); \
   pointer_type host_ptr_name = device_ptr_name; \
   if (reduction_data_space != host_data_space) { \
     allocData(host_data_space, host_ptr_name, (length)); \
   }

-//
+// Deallocate device_ptr_name and host_ptr_name.
+// Must be used in the same scope as RAJAPERF_GPU_REDUCER_SETUP_IMPL.
 #define RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) \
   deallocData(reduction_data_space, device_ptr_name); \
   if (reduction_data_space != host_data_space) { \
     deallocData(host_data_space, host_ptr_name); \
   }

-//
+// Initialize device_ptr_name with length copies of init_value.
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible.
 #define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length) \
   if (device_ptr_name != host_ptr_name) { \
-    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
       host_ptr_name[i] = (init_value); \
     } \
     gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
                         (length)*sizeof(device_ptr_name[0]), \
                         gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
   } else { \
-    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
       device_ptr_name[i] = (init_value); \
     } \
   }
+
+// Initialize device_ptr_name with the values in init_ptr.
+// host_ptr_name will be used as an intermediary with an explicit copy
+// if the reduction data space is not host accessible.
 #define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length) \
   if (device_ptr_name != host_ptr_name) { \
-    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
       host_ptr_name[i] = (init_ptr)[i]; \
     } \
     gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \
                         (length)*sizeof(device_ptr_name[0]), \
                         gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \
   } else { \
-    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+    for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
       device_ptr_name[i] = (init_ptr)[i]; \
     } \
   }

-//
+// Copy back data from device_ptr_name into host_ptr_name
+// if the reduction data space is not host accessible.
 #define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length) \
   if (device_ptr_name != host_ptr_name) { \
     gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \
@@ -224,15 +235,16 @@ inline void seq_for(camp::int_seq<IdxType, Is...> const&, Func&& func)
   } \
   gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) );

+// Copy data into final_ptr from host_ptr_name.
 #define RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) \
-  for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
+  for (size_t i = 0; i < static_cast<size_t>(length); ++i) { \
     (final_ptr)[i] = host_ptr_name[i]; \
   }

 #define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \
   RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length)
-#define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \
+#define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \
   RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name)
 #define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \
   RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length)
@@ -246,7 +258,7 @@ inline void seq_for(camp::int_seq<IdxType, Is...> const&, Func&& func)

 #define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \
   RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length)
-#define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \
+#define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \
   RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name)
 #define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \
   RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length)

From 2ff4f078ec95223fa9d01f236c2cd9c39c839597 Mon Sep 17 00:00:00 2001
From: Brian Homerding
Date: Tue, 5 Dec 2023 20:20:57 +0000
Subject: [PATCH 189/454] Add remaining basic kernels for memory space usage

---
 src/basic/IF_QUAD-Sycl.cpp            | 28 ---------------------------
 src/basic/INIT3-Sycl.cpp              | 25 ------------------------
 src/basic/INIT_VIEW1D-Sycl.cpp        | 17 +---------------
 src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 15 --------------
src/basic/MULADDSUB-Sycl.cpp | 25 ------------------------ src/basic/NESTED_INIT-Sycl.cpp | 15 -------------- src/basic/REDUCE3_INT-Sycl.cpp | 5 +++-- src/basic/TRAP_INT-Sycl.cpp | 12 ------------ 8 files changed, 4 insertions(+), 138 deletions(-) diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index e52e1714a..bb7f8c010 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -29,22 +29,6 @@ namespace rajaperf namespace basic { -#define IF_QUAD_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); \ - allocAndInitSyclDeviceData(b, m_b, iend, qu); \ - allocAndInitSyclDeviceData(c, m_c, iend, qu); \ - allocAndInitSyclDeviceData(x1, m_x1, iend, qu); \ - allocAndInitSyclDeviceData(x2, m_x2, iend, qu); - -#define IF_QUAD_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_x1, x1, iend, qu); \ - getSyclDeviceData(m_x2, x2, iend, qu); \ - deallocSyclDeviceData(a, qu); \ - deallocSyclDeviceData(b, qu); \ - deallocSyclDeviceData(c, qu); \ - deallocSyclDeviceData(x1, qu); \ - deallocSyclDeviceData(x2, qu); - template void IF_QUAD::runSyclVariantImpl(VariantID vid) { @@ -57,8 +41,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { if (work_group_size > 0) { - IF_QUAD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -79,12 +61,8 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - IF_QUAD_DATA_TEARDOWN_SYCL; - } else { - IF_QUAD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -101,8 +79,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - IF_QUAD_DATA_TEARDOWN_SYCL; - } } else if ( vid == RAJA_SYCL ) { @@ -112,8 +88,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) return; } - IF_QUAD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -126,8 +100,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - IF_QUAD_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n IF_QUAD : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 6ea52b4dd..9761fc639 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -28,23 +28,6 @@ namespace rajaperf namespace basic { -#define INIT3_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(out1, m_out1, iend, qu); \ - allocAndInitSyclDeviceData(out2, m_out2, iend, qu); \ - allocAndInitSyclDeviceData(out3, m_out3, iend, qu); \ - allocAndInitSyclDeviceData(in1, m_in1, iend, qu); \ - allocAndInitSyclDeviceData(in2, m_in2, iend, qu); - -#define INIT3_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_out1, out1, iend, qu); \ - getSyclDeviceData(m_out2, out2, iend, qu); \ - getSyclDeviceData(m_out3, out3, iend, qu); \ - deallocSyclDeviceData(out1, qu); \ - deallocSyclDeviceData(out2, qu); \ - deallocSyclDeviceData(out3, qu); \ - deallocSyclDeviceData(in1, qu); \ - deallocSyclDeviceData(in2, qu); - template void INIT3::runSyclVariantImpl(VariantID vid) { @@ -56,8 +39,6 @@ void INIT3::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - INIT3_DATA_SETUP_SYCL; - if (work_group_size > 0) { startTimer(); @@ -101,8 +82,6 @@ void INIT3::runSyclVariantImpl(VariantID vid) stopTimer(); } - - INIT3_DATA_TEARDOWN_SYCL; } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { @@ -110,8 +89,6 @@ void 
INIT3::runSyclVariantImpl(VariantID vid) return; } - INIT3_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -124,8 +101,6 @@ void INIT3::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - INIT3_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n INIT3 : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index 699255499..47de6058a 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -29,13 +29,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); - -#define INIT_VIEW1D_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_a, a, iend, qu); \ - deallocSyclDeviceData(a, qu); - template void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) { @@ -47,8 +40,6 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - INIT_VIEW1D_DATA_SETUP_SYCL; - if (work_group_size > 0) { startTimer(); @@ -90,17 +81,13 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) } - INIT_VIEW1D_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { - std::cout << "\n INIT3 : RAJA_SYCL does not support auto work group size" << std::endl; + std::cout << "\n INIT_VIEW1D : RAJA_SYCL does not support auto work group size" << std::endl; return; } - INIT_VIEW1D_DATA_SETUP_SYCL; - INIT_VIEW1D_VIEW_RAJA; startTimer(); @@ -115,8 +102,6 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - INIT_VIEW1D_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n INIT_VIEW1D : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index a3d2317bc..53cc292f0 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -28,13 +28,6 @@ namespace rajaperf namespace basic { -#define INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_a, a, iend, qu); \ - deallocSyclDeviceData(a, qu); - template void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) { @@ -46,8 +39,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL; - if (work_group_size > 0) { startTimer(); @@ -91,8 +82,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) stopTimer(); } - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL; } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { @@ -100,8 +89,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) return; } - INIT_VIEW1D_OFFSET_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,8 +101,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index 9fbea9d8f..f4f13c681 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -28,23 +28,6 @@ namespace rajaperf namespace basic { -#define MULADDSUB_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(out1, m_out1, iend, qu); \ - allocAndInitSyclDeviceData(out2, m_out2, iend, qu); \ - allocAndInitSyclDeviceData(out3, m_out3, iend, qu); \ - allocAndInitSyclDeviceData(in1, 
m_in1, iend, qu); \ - allocAndInitSyclDeviceData(in2, m_in2, iend, qu); - -#define MULADDSUB_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_out1, out1, iend, qu); \ - getSyclDeviceData(m_out2, out2, iend, qu); \ - getSyclDeviceData(m_out3, out3, iend, qu); \ - deallocSyclDeviceData(out1, qu); \ - deallocSyclDeviceData(out2, qu); \ - deallocSyclDeviceData(out3, qu); \ - deallocSyclDeviceData(in1, qu); \ - deallocSyclDeviceData(in2, qu); - template void MULADDSUB::runSyclVariantImpl(VariantID vid) { @@ -56,8 +39,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - MULADDSUB_DATA_SETUP_SYCL; - if (work_group_size > 0) { startTimer(); @@ -101,8 +82,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) stopTimer(); } - - MULADDSUB_DATA_TEARDOWN_SYCL; } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { @@ -110,8 +89,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) return; } - MULADDSUB_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -124,8 +101,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - MULADDSUB_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n MULADDSUB : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index d5028b403..b7351cedb 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -28,13 +28,6 @@ namespace rajaperf namespace basic { -#define NESTED_INIT_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(array, m_array, m_array_length, qu); - -#define NESTED_INIT_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_array, array, m_array_length, qu); \ - deallocSyclDeviceData(array, qu); - template void NESTED_INIT::runSyclVariantImpl(VariantID vid) { @@ -44,8 +37,6 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - NESTED_INIT_DATA_SETUP_SYCL; - if (work_group_size > 0) { startTimer(); @@ -97,8 +88,6 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) } - NESTED_INIT_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { @@ -106,8 +95,6 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) return; } - NESTED_INIT_DATA_SETUP_SYCL; - using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< @@ -135,8 +122,6 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - NESTED_INIT_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n NESTED_INIT : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index 6a108cd7f..2cea55afc 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -29,7 +29,6 @@ namespace basic { #define REDUCE3_INT_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(vec, m_vec, iend, qu); \ Int_ptr hsum; \ allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu); \ Int_ptr hmin; \ @@ -38,7 +37,9 @@ namespace basic allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu); #define REDUCE3_INT_DATA_TEARDOWN_SYCL \ - deallocSyclDeviceData(vec, qu); + deallocSyclDeviceData(hsum, qu); \ + deallocSyclDeviceData(hmin, qu); \ + deallocSyclDeviceData(hmax, qu); template void REDUCE3_INT::runSyclVariantImpl(VariantID vid) diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index 08393b784..8671c10b2 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -43,10 +43,6 @@ Real_type trap_int_func(Real_type x, return denom; } -#define 
TRAP_INT_DATA_SETUP_SYCL // nothing to do here...
-
-#define TRAP_INT_DATA_TEARDOWN_SYCL // nothing to do here...
-
 template <size_t work_group_size >
 void TRAP_INT::runSyclVariantImpl(VariantID vid)
 {
@@ -58,8 +54,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid)

   if ( vid == Base_SYCL ) {

-    TRAP_INT_DATA_SETUP_SYCL;
-
     if (work_group_size > 0) {

       startTimer();
@@ -127,8 +121,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid)
       stopTimer();

     }
-
-    TRAP_INT_DATA_TEARDOWN_SYCL;

   } else if ( vid == RAJA_SYCL ) {

    if ( work_group_size == 0 ) {
@@ -136,8 +128,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid)
      return;
    }

-    TRAP_INT_DATA_SETUP_SYCL;
-
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

@@ -154,8 +144,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid)
     qu->wait();
     stopTimer();

-    TRAP_INT_DATA_TEARDOWN_SYCL;
-
   } else {
      std::cout << "\n  TRAP_INT : Unknown Sycl variant id = " << vid << std::endl;
   }

From bc8e756e569bfbb113d6e4366ba978d0382506dc Mon Sep 17 00:00:00 2001
From: Brian Homerding
Date: Tue, 5 Dec 2023 20:54:19 +0000
Subject: [PATCH 190/454] Add files from prior branch; they still need to be
 wired into the base classes and CMake

---
 src/apps/DEL_DOT_VEC_2D-Sycl.cpp     | 131 +++++++++++++++
 src/apps/ENERGY-Sycl.cpp             | 228 +++++++++++++++++++++++
 src/apps/FIR-Sycl.cpp                | 117 ++++++++++++++
 src/apps/LTIMES-Sycl.cpp             | 123 +++++++++++++++
 src/apps/LTIMES_NOVIEW-Sycl.cpp      | 123 +++++++++++++++
 src/apps/PRESSURE-Sycl.cpp           | 138 ++++++++++++++++
 src/apps/VOL3D-Sycl.cpp              | 123 +++++++++++++++
 src/lcals/DIFF_PREDICT-Sycl.cpp      | 107 +++++++++++++
 src/lcals/EOS-Sycl.cpp               | 110 +++++++++++++
 src/lcals/FIRST_DIFF-Sycl.cpp        | 107 +++++++++++++
 src/lcals/GEN_LIN_RECUR-Sycl.cpp     | 118 ++++++++++++++
 src/lcals/HYDRO_1D-Sycl.cpp          | 109 +++++++++++++
 src/lcals/HYDRO_2D-Sycl.cpp          | 212 +++++++++++++++++++++++
 src/lcals/INT_PREDICT-Sycl.cpp       | 105 ++++++++++++
 src/lcals/PLANCKIAN-Sycl.cpp         | 116 ++++++++++++++
 src/lcals/TRIDIAG_ELIM-Sycl.cpp      | 102 ++++++++++++
 src/polybench/POLYBENCH_2MM-Sycl.cpp | 199 +++++++++++++++++++++
 src/stream/ADD-Sycl.cpp              | 109 +++++++++++++
 src/stream/COPY-Sycl.cpp             | 108 +++++++++++++
 src/stream/DOT-Sycl.cpp              | 114 ++++++++++++++
 src/stream/MUL-Sycl.cpp              | 106 +++++++++++++
 src/stream/TRIAD-Sycl.cpp            | 109 +++++++++++++
 22 files changed, 2814 insertions(+)
 create mode 100644 src/apps/DEL_DOT_VEC_2D-Sycl.cpp
 create mode 100644 src/apps/ENERGY-Sycl.cpp
 create mode 100644 src/apps/FIR-Sycl.cpp
 create mode 100644 src/apps/LTIMES-Sycl.cpp
 create mode 100644 src/apps/LTIMES_NOVIEW-Sycl.cpp
 create mode 100644 src/apps/PRESSURE-Sycl.cpp
 create mode 100644 src/apps/VOL3D-Sycl.cpp
 create mode 100644 src/lcals/DIFF_PREDICT-Sycl.cpp
 create mode 100644 src/lcals/EOS-Sycl.cpp
 create mode 100644 src/lcals/FIRST_DIFF-Sycl.cpp
 create mode 100644 src/lcals/GEN_LIN_RECUR-Sycl.cpp
 create mode 100644 src/lcals/HYDRO_1D-Sycl.cpp
 create mode 100644 src/lcals/HYDRO_2D-Sycl.cpp
 create mode 100644 src/lcals/INT_PREDICT-Sycl.cpp
 create mode 100644 src/lcals/PLANCKIAN-Sycl.cpp
 create mode 100644 src/lcals/TRIDIAG_ELIM-Sycl.cpp
 create mode 100644 src/polybench/POLYBENCH_2MM-Sycl.cpp
 create mode 100644 src/stream/ADD-Sycl.cpp
 create mode 100644 src/stream/COPY-Sycl.cpp
 create mode 100644 src/stream/DOT-Sycl.cpp
 create mode 100644 src/stream/MUL-Sycl.cpp
 create mode 100644 src/stream/TRIAD-Sycl.cpp

diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp
new file mode 100644
index 000000000..27a6a3cfd
--- /dev/null
+++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp
@@ -0,0 +1,131 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "AppsData.hpp" + +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for SYCL execution + // + const size_t block_size = 256; + + +#define DEL_DOT_VEC_2D_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \ + allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \ + allocAndInitSyclDeviceData(xdot, m_xdot, m_array_length, qu); \ + allocAndInitSyclDeviceData(ydot, m_ydot, m_array_length, qu); \ + allocAndInitSyclDeviceData(div, m_div, m_array_length, qu); \ + allocAndInitSyclDeviceData(real_zones, m_domain->real_zones, iend, qu); + +#define DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_div, div, m_array_length, qu); \ + deallocSyclDeviceData(x, qu); \ + deallocSyclDeviceData(y, qu); \ + deallocSyclDeviceData(xdot, qu); \ + deallocSyclDeviceData(ydot, qu); \ + deallocSyclDeviceData(div, qu); \ + deallocSyclDeviceData(real_zones, qu); + +void DEL_DOT_VEC_2D::runSyclVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = m_domain->n_real_zones; + + DEL_DOT_VEC_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + DEL_DOT_VEC_2D_DATA_SETUP_SYCL; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type ii = item.get_global_id(0); + if (ii < iend) { + DEL_DOT_VEC_2D_BODY_INDEX + DEL_DOT_VEC_2D_BODY + } + + }); + }); + + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + DEL_DOT_VEC_2D_DATA_SETUP_SYCL; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones, sycl_res); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( + zones, [=] (Index_type i) { + DEL_DOT_VEC_2D_BODY; + }); + + } + qu->wait(); + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL; + + + } else { + std::cout << "\n DEL_DOT_VEC_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp new file mode 100644 index 000000000..713b80256 --- /dev/null +++ b/src/apps/ENERGY-Sycl.cpp @@ -0,0 +1,228 @@ 
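// ---------------------------------------------------------------------------
// A minimal standalone sketch of the Base_SYCL launch pattern these new files
// share: the nd_range global size must be a multiple of the work-group size,
// so it is rounded up (RAJA_DIVIDE_CEILING_INT in the kernels) and the kernel
// body guards against the overhang. The queue and buffer here are local
// assumptions; the suite reuses its shared queue `qu`.
#include <CL/sycl.hpp>

int main()
{
  constexpr size_t block_size = 256;
  const size_t iend = 1000;   // problem size; deliberately not a multiple of 256
  sycl::queue q;
  double* a = sycl::malloc_device<double>(iend, q);

  // Round the global size up to the next multiple of block_size.
  const size_t grid_size = block_size * ((iend + block_size - 1) / block_size);

  q.submit([&] (sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
                   [=] (sycl::nd_item<1> item) {
      const size_t i = item.get_global_id(0);
      if (i < iend) {          // guard the rounded-up tail work-items
        a[i] = 1.0;
      }
    });
  });
  q.wait();                    // mirrors the qu->wait() before stopTimer()

  sycl::free(a, q);
  return 0;
}
// ---------------------------------------------------------------------------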
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for SYCL execution + // + const size_t block_size = 256; + + +#define ENERGY_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(e_new, m_e_new, iend, qu); \ + allocAndInitSyclDeviceData(e_old, m_e_old, iend, qu); \ + allocAndInitSyclDeviceData(delvc, m_delvc, iend, qu); \ + allocAndInitSyclDeviceData(p_new, m_p_new, iend, qu); \ + allocAndInitSyclDeviceData(p_old, m_p_old, iend, qu); \ + allocAndInitSyclDeviceData(q_new, m_q_new, iend, qu); \ + allocAndInitSyclDeviceData(q_old, m_q_old, iend, qu); \ + allocAndInitSyclDeviceData(work, m_work, iend, qu); \ + allocAndInitSyclDeviceData(compHalfStep, m_compHalfStep, iend, qu); \ + allocAndInitSyclDeviceData(pHalfStep, m_pHalfStep, iend, qu); \ + allocAndInitSyclDeviceData(bvc, m_bvc, iend, qu); \ + allocAndInitSyclDeviceData(pbvc, m_pbvc, iend, qu); \ + allocAndInitSyclDeviceData(ql_old, m_ql_old, iend, qu); \ + allocAndInitSyclDeviceData(qq_old, m_qq_old, iend, qu); \ + allocAndInitSyclDeviceData(vnewc, m_vnewc, iend, qu); + +#define ENERGY_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_e_new, e_new, iend, qu); \ + getSyclDeviceData(m_q_new, q_new, iend, qu); \ + deallocSyclDeviceData(e_new, qu); \ + deallocSyclDeviceData(e_old, qu); \ + deallocSyclDeviceData(delvc, qu); \ + deallocSyclDeviceData(p_new, qu); \ + deallocSyclDeviceData(p_old, qu); \ + deallocSyclDeviceData(q_new, qu); \ + deallocSyclDeviceData(q_old, qu); \ + deallocSyclDeviceData(work, qu); \ + deallocSyclDeviceData(compHalfStep, qu); \ + deallocSyclDeviceData(pHalfStep, qu); \ + deallocSyclDeviceData(bvc, qu); \ + deallocSyclDeviceData(pbvc, qu); \ + deallocSyclDeviceData(ql_old, qu); \ + deallocSyclDeviceData(qq_old, qu); \ + deallocSyclDeviceData(vnewc, qu); + +void ENERGY::runSyclVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ENERGY_DATA_SETUP; + + using sycl::sqrt; + using sycl::fabs; + + if ( vid == Base_SYCL ) { + + ENERGY_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY1 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY2 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY3 + } + }); + }); + + 
qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY4 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY5 + } + + }); + }); + + qu->submit([&] (sycl::handler& h) + { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if(i < iend) { + ENERGY_BODY6 + } + + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + ENERGY_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + ENERGY_DATA_SETUP_SYCL; + + const bool async = true; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY2; + }); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY3; + }); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY4; + }); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY5; + }); + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ENERGY_BODY6; + }); + + }); // end sequential region (for single-source code) + + } + qu->wait(); + stopTimer(); + + ENERGY_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n ENERGY : Unknown Sycl variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp new file mode 100644 index 000000000..6ced4d5cf --- /dev/null +++ b/src/apps/FIR-Sycl.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for SYCL execution + // + const size_t block_size = 256; + +#define FIR_DATA_SETUP_SYCL \ + Real_ptr coeff; \ +\ + allocAndInitSyclDeviceData(in, m_in, getActualProblemSize(), qu); \ + allocAndInitSyclDeviceData(out, m_out, getActualProblemSize(), qu); \ + Real_ptr tcoeff = &coeff_array[0]; \ + allocAndInitSyclDeviceData(coeff, tcoeff, FIR_COEFFLEN, qu); + + +#define FIR_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_out, out, getActualProblemSize(), qu); \ + deallocSyclDeviceData(in, qu); \ + deallocSyclDeviceData(out, qu); \ + deallocSyclDeviceData(coeff, qu); + +void FIR::runSyclVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize() - m_coefflen; + + FIR_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + FIR_BODY + } + + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + FIR_COEFF; + + FIR_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIR_BODY; + }); + + } + qu->wait(); + stopTimer(); + + FIR_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n FIR : Unknown Sycl variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp new file mode 100644 index 000000000..799e247b1 --- /dev/null +++ b/src/apps/LTIMES-Sycl.cpp @@ -0,0 +1,123 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
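// ---------------------------------------------------------------------------
// A hedged sketch of the 3D dispatch LTIMES uses below: the z/g/m loops map
// onto a three-dimensional nd_range while the d loop stays serial inside each
// work-item. The (1,1,1) work-group is the simplest legal choice, at the cost
// of leaving work-group-level parallelism unused. The function name and array
// layout here are assumptions for illustration only.
#include <CL/sycl.hpp>

void ltimes_like(sycl::queue& q, double* phi, const double* ell,
                 const double* psi, size_t num_z, size_t num_g,
                 size_t num_m, size_t num_d)
{
  q.submit([&] (sycl::handler& h) {
    h.parallel_for(sycl::nd_range<3>(sycl::range<3>(num_z, num_g, num_m),
                                     sycl::range<3>(1, 1, 1)),
                   [=] (sycl::nd_item<3> item) {
      const size_t z = item.get_global_id(0);
      const size_t g = item.get_global_id(1);
      const size_t m = item.get_global_id(2);
      for (size_t d = 0; d < num_d; ++d) {
        // phi[z][g][m] += ell[m][d] * psi[z][g][d], flattened row-major.
        phi[(z * num_g + g) * num_m + m] +=
            ell[m * num_d + d] * psi[(z * num_g + g) * num_d + d];
      }
    });
  }).wait();
}
// ---------------------------------------------------------------------------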
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +#define LTIMES_DATA_SETUP_SYCL \ + allocAndInitSyclDeviceData(phidat, m_phidat, m_philen, qu); \ + allocAndInitSyclDeviceData(elldat, m_elldat, m_elllen, qu); \ + allocAndInitSyclDeviceData(psidat, m_psidat, m_psilen, qu); + +#define LTIMES_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_phidat, phidat, m_philen, qu); \ + deallocSyclDeviceData(phidat, qu); \ + deallocSyclDeviceData(elldat, qu); \ + deallocSyclDeviceData(psidat, qu); + +void LTIMES::runSyclVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + LTIMES_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + LTIMES_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( + sycl::range<3>(num_z, num_g, num_m), + sycl::range<3>(1,1,1)), + [=] (sycl::nd_item<3> item) { + + Index_type z = item.get_global_id(0); + Index_type g = item.get_global_id(1); + Index_type m = item.get_global_id(2); + + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_BODY + } + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + LTIMES_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + LTIMES_DATA_SETUP_SYCL; + + LTIMES_VIEWS_RANGES_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernel< + RAJA::statement::For<1, RAJA::sycl_global_2<1>, //z + RAJA::statement::For<2, RAJA::sycl_global_1<1>, //g + RAJA::statement::For<3, RAJA::sycl_global_0<1>, //m + RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=] (ID d, IZ z, IG g, IM m) { + LTIMES_BODY_RAJA; + }); + + } + qu->wait(); + stopTimer(); + + LTIMES_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n LTIMES : Unknown Sycl variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp new file mode 100644 index 000000000..310d7cd60 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -0,0 +1,123 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include + +#include +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace apps +{ + +#define LTIMES_NOVIEW_DATA_SETUP_SYCL \ +\ + allocAndInitSyclDeviceData(phidat, m_phidat, m_philen, qu); \ + allocAndInitSyclDeviceData(elldat, m_elldat, m_elllen, qu); \ + allocAndInitSyclDeviceData(psidat, m_psidat, m_psilen, qu); + +#define LTIMES_NOVIEW_DATA_TEARDOWN_SYCL \ + getSyclDeviceData(m_phidat, phidat, m_philen, qu); \ + deallocSyclDeviceData(phidat, qu); \ + deallocSyclDeviceData(elldat, qu); \ + deallocSyclDeviceData(psidat, qu); + +void LTIMES_NOVIEW::runSyclVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + LTIMES_NOVIEW_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + LTIMES_NOVIEW_DATA_SETUP_SYCL; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( + sycl::range<3>(num_m, num_g, num_z), + sycl::range<3>(1, 1, 1)), + [=] (sycl::nd_item<3> item) { + + Index_type z = item.get_global_id(2); + Index_type g = item.get_global_id(1); + Index_type m = item.get_global_id(0); + + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_NOVIEW_BODY + } + + }); + }); + } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_SYCL; + + } else if ( vid == RAJA_SYCL ) { + + LTIMES_NOVIEW_DATA_SETUP_SYCL; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernel< + RAJA::statement::For<1, RAJA::sycl_global_2<1>, //z + RAJA::statement::For<2, RAJA::sycl_global_1<1>, //g + RAJA::statement::For<3, RAJA::sycl_global_0<1>, //m + RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + [=] (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + qu->wait(); + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_SYCL; + + } else { + std::cout << "\n LTIMES_NOVIEW : Unknown Sycl variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp new file mode 100644 index 000000000..269709fc7 --- /dev/null +++ b/src/apps/PRESSURE-Sycl.cpp @@ -0,0 +1,138 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read RAJAPerf/LICENSE. 
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "PRESSURE.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define PRESSURE_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(compression, m_compression, iend, qu); \
+  allocAndInitSyclDeviceData(bvc, m_bvc, iend, qu); \
+  allocAndInitSyclDeviceData(p_new, m_p_new, iend, qu); \
+  allocAndInitSyclDeviceData(e_old, m_e_old, iend, qu); \
+  allocAndInitSyclDeviceData(vnewc, m_vnewc, iend, qu);
+
+#define PRESSURE_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_p_new, p_new, iend, qu); \
+  deallocSyclDeviceData(compression, qu); \
+  deallocSyclDeviceData(bvc, qu); \
+  deallocSyclDeviceData(p_new, qu); \
+  deallocSyclDeviceData(e_old, qu); \
+  deallocSyclDeviceData(vnewc, qu);
+
+void PRESSURE::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  PRESSURE_DATA_SETUP;
+  using sycl::fabs;
+
+  if ( vid == Base_SYCL ) {
+
+    PRESSURE_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PRESSURE_BODY1
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PRESSURE_BODY2
+          }
+
+        });
+      });
+
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    PRESSURE_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    PRESSURE_DATA_SETUP_SYCL;
+
+    const bool async = true;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::region<RAJA::seq_region>( [=]() {
+
+        RAJA::forall< RAJA::sycl_exec<block_size, async> >(
+          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+          PRESSURE_BODY1;
+        });
+
+        RAJA::forall< RAJA::sycl_exec<block_size, async> >(
+          RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+          PRESSURE_BODY2;
+        });
+
+      }); // end sequential region (for single-source code)
+
+    }
+    qu->wait();
+    stopTimer();
+
+    PRESSURE_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n PRESSURE : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace apps
+} // end namespace rajaperf
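+
+// PRESSURE_BODY2 reads bvc[i] written by the first kernel, and the two
+// submits above carry no explicit dependency, so the Base_SYCL variant
+// relies on the shared queue `qu` being in-order (as the suite's camp SYCL
+// resource is expected to provide). On an out-of-order queue the second
+// submit would need an event dependency, along these lines:
+//
+//   sycl::event e1 = qu->submit([&] (sycl::handler& h) { /* PRESSURE_BODY1 kernel */ });
+//   qu->submit([&] (sycl::handler& h) { h.depends_on(e1); /* PRESSURE_BODY2 kernel */ });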
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp
new file mode 100644
index 000000000..a61cef626
--- /dev/null
+++ b/src/apps/VOL3D-Sycl.cpp
@@ -0,0 +1,123 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "VOL3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "AppsData.hpp"
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define VOL3D_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \
+  allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \
+  allocAndInitSyclDeviceData(z, m_z, m_array_length, qu); \
+  allocAndInitSyclDeviceData(vol, m_vol, m_array_length, qu);
+
+#define VOL3D_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_vol, vol, m_array_length, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu); \
+  deallocSyclDeviceData(z, qu); \
+  deallocSyclDeviceData(vol, qu);
+
+void VOL3D::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = m_domain->fpz;
+  const Index_type iend = m_domain->lpz+1;
+
+  VOL3D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    VOL3D_DATA_SETUP_SYCL;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+    NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ;
+    NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ;
+
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend - ibegin, block_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          i += ibegin;
+          if(i < iend) {
+            VOL3D_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    VOL3D_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    VOL3D_DATA_SETUP_SYCL;
+
+    NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ;
+    NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ;
+    NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        VOL3D_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    VOL3D_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n VOL3D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace apps
+} // end namespace rajaperf
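+
+// Launch-sizing note: a SYCL nd_range takes the *global* work-item count,
+// so grid_size above is the iteration count rounded up to a whole number
+// of work-groups. A CUDA-style launch expresses the same thing as a block
+// count instead (illustrative sketch only, not this suite's CUDA variant):
+//
+//   const size_t num_blocks = RAJA_DIVIDE_CEILING_INT(iend - ibegin, block_size);
+//   // kernel<<<num_blocks, block_size>>>(...);  global size = num_blocks * block_size
+//
+// Either way the padded range is why the kernel body guards with i < iend.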
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp
new file mode 100644
index 000000000..dbb33c171
--- /dev/null
+++ b/src/lcals/DIFF_PREDICT-Sycl.cpp
@@ -0,0 +1,107 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "DIFF_PREDICT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define DIFF_PREDICT_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(px, m_px, m_array_length, qu); \
+  allocAndInitSyclDeviceData(cx, m_cx, m_array_length, qu);
+
+#define DIFF_PREDICT_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_px, px, m_array_length, qu); \
+  deallocSyclDeviceData(px, qu); \
+  deallocSyclDeviceData(cx, qu);
+
+void DIFF_PREDICT::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  DIFF_PREDICT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    DIFF_PREDICT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item ) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            DIFF_PREDICT_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    DIFF_PREDICT_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    DIFF_PREDICT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        DIFF_PREDICT_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    DIFF_PREDICT_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  DIFF_PREDICT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp
new file mode 100644
index 000000000..c81cd163b
--- /dev/null
+++ b/src/lcals/EOS-Sycl.cpp
@@ -0,0 +1,110 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "EOS.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+#define EOS_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \
+  allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \
+  allocAndInitSyclDeviceData(z, m_z, m_array_length, qu); \
+  allocAndInitSyclDeviceData(u, m_u, m_array_length, qu);
+
+#define EOS_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_x, x, m_array_length, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu); \
+  deallocSyclDeviceData(z, qu); \
+  deallocSyclDeviceData(u, qu);
+
+void EOS::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  EOS_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    EOS_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item ) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            EOS_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    EOS_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    EOS_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        EOS_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    EOS_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  EOS : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp
new file mode 100644
index 000000000..fa32b4e35
--- /dev/null
+++ b/src/lcals/FIRST_DIFF-Sycl.cpp
@@ -0,0 +1,107 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "FIRST_DIFF.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define FIRST_DIFF_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, m_N, qu); \
+  allocAndInitSyclDeviceData(y, m_y, m_N, qu);
+
+#define FIRST_DIFF_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_x, x, m_N, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu);
+
+void FIRST_DIFF::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  FIRST_DIFF_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    FIRST_DIFF_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            FIRST_DIFF_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    FIRST_DIFF_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    FIRST_DIFF_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        FIRST_DIFF_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    FIRST_DIFF_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  FIRST_DIFF : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp
new file mode 100644
index 000000000..9c78c87b1
--- /dev/null
+++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp
@@ -0,0 +1,118 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "GEN_LIN_RECUR.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define GEN_LIN_RECUR_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(b5, m_b5, m_N, qu); \
+  allocAndInitSyclDeviceData(stb5, m_stb5, m_N, qu); \
+  allocAndInitSyclDeviceData(sa, m_sa, m_N, qu); \
+  allocAndInitSyclDeviceData(sb, m_sb, m_N, qu);
+
+#define GEN_LIN_RECUR_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_b5, b5, m_N, qu); \
+  deallocSyclDeviceData(b5, qu); \
+  deallocSyclDeviceData(stb5, qu); \
+  deallocSyclDeviceData(sa, qu); \
+  deallocSyclDeviceData(sb, qu);
+
+void GEN_LIN_RECUR::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  GEN_LIN_RECUR_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    GEN_LIN_RECUR_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size1 = block_size * RAJA_DIVIDE_CEILING_INT(N, block_size);
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size1, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type k = item.get_global_id(0);
+          if (k < N) {
+            GEN_LIN_RECUR_BODY1;
+          }
+
+        });
+      });
+
+      const size_t grid_size2 = block_size * RAJA_DIVIDE_CEILING_INT(N+1, block_size);
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size2, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < N+1) {
+            GEN_LIN_RECUR_BODY2;
+          }
+
+        });
+      });
+    }
+    qu->wait();
+    stopTimer();
+
+    GEN_LIN_RECUR_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    GEN_LIN_RECUR_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(0, N), [=] (Index_type k) {
+        GEN_LIN_RECUR_BODY1;
+      });
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(1, N+1), [=] (Index_type i) {
+        GEN_LIN_RECUR_BODY2;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    GEN_LIN_RECUR_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  GEN_LIN_RECUR : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
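+
+// The two kernels intentionally use different extents: BODY1 covers
+// k in [0, N) while BODY2 covers i in [1, N+1), so grid_size2 is padded
+// from N+1 and the body guards with (i > 0 && i < N+1), mirroring
+// RAJA::RangeSegment(1, N+1) in the RAJA_SYCL variant.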
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp
new file mode 100644
index 000000000..ab2ab8899
--- /dev/null
+++ b/src/lcals/HYDRO_1D-Sycl.cpp
@@ -0,0 +1,109 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HYDRO_1D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define HYDRO_1D_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \
+  allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \
+  allocAndInitSyclDeviceData(z, m_z, m_array_length, qu);
+
+#define HYDRO_1D_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_x, x, m_array_length, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu); \
+  deallocSyclDeviceData(z, qu);
+
+void HYDRO_1D::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  HYDRO_1D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    HYDRO_1D_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item ) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            HYDRO_1D_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    HYDRO_1D_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    HYDRO_1D_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        HYDRO_1D_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    HYDRO_1D_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  HYDRO_1D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp
new file mode 100644
index 000000000..3e605ed49
--- /dev/null
+++ b/src/lcals/HYDRO_2D-Sycl.cpp
@@ -0,0 +1,212 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "HYDRO_2D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  constexpr size_t j_block_sz = 32;
+  constexpr size_t k_block_sz = 8;
+
+#define HYDRO_2D_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(zadat, m_za, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zbdat, m_zb, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zmdat, m_zm, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zpdat, m_zp, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zqdat, m_zq, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zrdat, m_zr, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zudat, m_zu, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zvdat, m_zv, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zzdat, m_zz, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zroutdat, m_zrout, m_array_length, qu); \
+  allocAndInitSyclDeviceData(zzoutdat, m_zzout, m_array_length, qu);
+
+#define HYDRO_2D_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_zrout, zroutdat, m_array_length, qu); \
+  getSyclDeviceData(m_zzout, zzoutdat, m_array_length, qu); \
+  deallocSyclDeviceData(zadat, qu); \
+  deallocSyclDeviceData(zbdat, qu); \
+  deallocSyclDeviceData(zmdat, qu); \
+  deallocSyclDeviceData(zpdat, qu); \
+  deallocSyclDeviceData(zqdat, qu); \
+  deallocSyclDeviceData(zrdat, qu); \
+  deallocSyclDeviceData(zudat, qu); \
+  deallocSyclDeviceData(zvdat, qu); \
+  deallocSyclDeviceData(zzdat, qu); \
+  deallocSyclDeviceData(zroutdat, qu); \
+  deallocSyclDeviceData(zzoutdat, qu);
+
+void HYDRO_2D::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type kbeg = 1;
+  const Index_type kend = m_kn - 1;
+  const Index_type jbeg = 1;
+  const Index_type jend = m_jn - 1;
+
+  HYDRO_2D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    HYDRO_2D_DATA_SETUP_SYCL;
+
+    auto kn_grid_size = k_block_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_block_sz);
+    auto jn_grid_size = j_block_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      qu->submit([&] (sycl::handler& h) {
+
+        h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size),
+                                         sycl::range<2>(k_block_sz,j_block_sz)),
+                       [=] (sycl::nd_item<2> item) {
+
+          int j = item.get_global_id(1) + 1;
+          int k = item.get_global_id(0) + 1;
+
+          if (j < jn-1 && k < kn-1) {
+            HYDRO_2D_BODY1
+          }
+
+        });
+      });
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size),
+                                         sycl::range<2>(k_block_sz,j_block_sz)),
+                       [=] (sycl::nd_item<2> item) {
+
+          int j = item.get_global_id(1) + 1;
+          int k = item.get_global_id(0) + 1;
+
+          if (j < jn-1 && k < kn-1) {
+            HYDRO_2D_BODY2
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size),
+                                         sycl::range<2>(k_block_sz,j_block_sz)),
+                       [=] (sycl::nd_item<2> item) {
+
+          int j = item.get_global_id(1) + 1;
+          int k = item.get_global_id(0) + 1;
+
+          if (j < jn-1 && k < kn-1) {
+            HYDRO_2D_BODY3
+          }
+
+        });
+      });
+/*    qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::range<2>(kn-2, jn-2),
+                       sycl::id<2>(1, 1), // offset to start at idx 1
+                       [=] (sycl::item<2> item ) {
+          int j = item.get_id(1);
+          int k = item.get_id(0);
+          HYDRO_2D_BODY2
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::range<2>(kn-2, jn-2),
+                       sycl::id<2>(1, 1), // offset to start at idx 1
+                       [=] (sycl::item<2> item ) {
+          int j = item.get_id(1);
+          int k = item.get_id(0);
+          HYDRO_2D_BODY3
+
+        });
+      });*/
+
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    HYDRO_2D_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    HYDRO_2D_DATA_SETUP_SYCL;
+
+    HYDRO_2D_VIEWS_RAJA;
+
+    using EXECPOL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernel<
+          RAJA::statement::For<0, RAJA::sycl_global_1<8>,    // k
+            RAJA::statement::For<1, RAJA::sycl_global_2<32>, // j
+              RAJA::statement::Lambda<0>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel<EXECPOL>(
+        RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend),
+                          RAJA::RangeSegment(jbeg, jend)),
+        [=] (Index_type k, Index_type j) {
+          HYDRO_2D_BODY1_RAJA;
+      });
+
+      RAJA::kernel<EXECPOL>(
+        RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend),
+                          RAJA::RangeSegment(jbeg, jend)),
+        [=] (Index_type k, Index_type j) {
+          HYDRO_2D_BODY2_RAJA;
+      });
+
+      RAJA::kernel<EXECPOL>(
+        RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend),
+                          RAJA::RangeSegment(jbeg, jend)),
+        [=] (Index_type k, Index_type j) {
+          HYDRO_2D_BODY3_RAJA;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    HYDRO_2D_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  HYDRO_2D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
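+
+// Work-group shape note: the 2D launches use a (k, j) local range of
+// (k_block_sz, j_block_sz) = (8, 32), i.e. 256 work-items per work-group
+// with j in the faster-varying dimension. The RAJA policy encodes the same
+// shape: sycl_global_1<8> drives k and sycl_global_2<32> drives j. The
+// commented-out sycl::range/sycl::id form above is an offset-based
+// alternative that would avoid the explicit +1 and the bounds guard.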
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp
new file mode 100644
index 000000000..371be9ff7
--- /dev/null
+++ b/src/lcals/INT_PREDICT-Sycl.cpp
@@ -0,0 +1,105 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "INT_PREDICT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define INT_PREDICT_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(px, m_px, m_array_length, qu);
+
+#define INT_PREDICT_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_px, px, m_array_length, qu); \
+  deallocSyclDeviceData(px, qu);
+
+void INT_PREDICT::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  INT_PREDICT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    INT_PREDICT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            INT_PREDICT_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    INT_PREDICT_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    INT_PREDICT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        INT_PREDICT_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    INT_PREDICT_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  INT_PREDICT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp
new file mode 100644
index 000000000..e3f71316a
--- /dev/null
+++ b/src/lcals/PLANCKIAN-Sycl.cpp
@@ -0,0 +1,116 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "PLANCKIAN.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <cmath>
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define PLANCKIAN_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(x, m_x, iend, qu); \
+  allocAndInitSyclDeviceData(y, m_y, iend, qu); \
+  allocAndInitSyclDeviceData(u, m_u, iend, qu); \
+  allocAndInitSyclDeviceData(v, m_v, iend, qu); \
+  allocAndInitSyclDeviceData(w, m_w, iend, qu);
+
+#define PLANCKIAN_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_w, w, iend, qu); \
+  deallocSyclDeviceData(x, qu); \
+  deallocSyclDeviceData(y, qu); \
+  deallocSyclDeviceData(u, qu); \
+  deallocSyclDeviceData(v, qu); \
+  deallocSyclDeviceData(w, qu);
+
+void PLANCKIAN::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  PLANCKIAN_DATA_SETUP;
+
+  using sycl::exp;
+
+  if ( vid == Base_SYCL ) {
+
+    PLANCKIAN_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h)
+      {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PLANCKIAN_BODY
+          }
+
+        });
+      });
+    }
+    qu->wait(); // Wait for computation to finish before stopping timer
+    stopTimer();
+
+    PLANCKIAN_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    PLANCKIAN_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        PLANCKIAN_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    PLANCKIAN_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  PLANCKIAN : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
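+
+// The function-scope `using sycl::exp;` lets the unqualified exp() inside
+// PLANCKIAN_BODY resolve to the SYCL built-in in device code, while the
+// <cmath> include still serves host-side uses of the same kernel macro.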
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
new file mode 100644
index 000000000..06b513d03
--- /dev/null
+++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp
@@ -0,0 +1,102 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIDIAG_ELIM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define TRIDIAG_ELIM_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(xout, m_xout, m_N, qu); \
+  allocAndInitSyclDeviceData(xin, m_xin, m_N, qu); \
+  allocAndInitSyclDeviceData(y, m_y, m_N, qu); \
+  allocAndInitSyclDeviceData(z, m_z, m_N, qu);
+
+#define TRIDIAG_ELIM_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_xout, xout, m_N, qu); \
+  deallocSyclDeviceData(xout, qu); \
+  deallocSyclDeviceData(xin, qu); \
+  deallocSyclDeviceData(y, qu); \
+  deallocSyclDeviceData(z, qu);
+
+void TRIDIAG_ELIM::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 1;
+  const Index_type iend = m_N;
+
+  TRIDIAG_ELIM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    TRIDIAG_ELIM_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1>(grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i > 0 && i < iend) {
+            TRIDIAG_ELIM_BODY;
+          }
+
+        });
+      });
+    }
+    qu->wait();
+    stopTimer();
+
+    TRIDIAG_ELIM_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    TRIDIAG_ELIM_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIDIAG_ELIM_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    TRIDIAG_ELIM_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  TRIDIAG_ELIM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp
new file mode 100644
index 000000000..bd051daf0
--- /dev/null
+++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp
@@ -0,0 +1,199 @@
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_2MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+#include <cstring>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 16;
+
+#define POLYBENCH_2MM_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(tmp, m_tmp, m_ni * m_nj, qu); \
+  allocAndInitSyclDeviceData(A, m_A, m_ni * m_nk, qu); \
+  allocAndInitSyclDeviceData(B, m_B, m_nk * m_nj, qu); \
+  allocAndInitSyclDeviceData(C, m_C, m_nj * m_nl, qu); \
+  allocAndInitSyclDeviceData(D, m_D, m_ni * m_nl, qu); \
+\
+  Real_type alpha = m_alpha; \
+  Real_type beta = m_beta; \
+
+
+#define POLYBENCH_2MM_TEARDOWN_SYCL \
+  getSyclDeviceData(m_D, D, m_ni * m_nl, qu); \
+  deallocSyclDeviceData(tmp, qu); \
+  deallocSyclDeviceData(A, qu); \
+  deallocSyclDeviceData(B, qu); \
+  deallocSyclDeviceData(C, qu); \
+  deallocSyclDeviceData(D, qu);
+
+void POLYBENCH_2MM::runSyclVariant(VariantID vid)
+{
+  const unsigned long run_reps = getRunReps();
+
+  POLYBENCH_2MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+    {
+      POLYBENCH_2MM_DATA_SETUP_SYCL;
+
+      const size_t ni_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_ni, block_size);
+      const size_t nj_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_nj, block_size);
+      const size_t nl_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_nl, block_size);
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+        qu->submit([&] (sycl::handler& h)
+        {
+
+          h.parallel_for(sycl::nd_range<2>
+                           {sycl::range<2> {ni_grid_size, nj_grid_size},
+                            sycl::range<2> {block_size, block_size}},
+                         [=] (sycl::nd_item<2> item) {
+
+            Index_type i = item.get_global_id(0);
+            Index_type j = item.get_global_id(1);
+
+            if (i < ni && j < nj) {
+              POLYBENCH_2MM_BODY1;
+              for (Index_type k=0; k < nk; ++k) {
+                POLYBENCH_2MM_BODY2;
+              }
+              POLYBENCH_2MM_BODY3;
+            }
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h)
+        {
+
+          h.parallel_for(sycl::nd_range<2>
+                           {sycl::range<2> {ni_grid_size, nl_grid_size},
+                            sycl::range<2> {block_size, block_size}},
+                         [=] (sycl::nd_item<2> item) {
+
+            Index_type i = item.get_global_id(0);
+            Index_type l = item.get_global_id(1);
+
+            if(i < ni && l < nl) {
+              POLYBENCH_2MM_BODY4;
+              for (Index_type j=0; j < nj; ++j) {
+                POLYBENCH_2MM_BODY5;
+              }
+              POLYBENCH_2MM_BODY6;
+            }
+          });
+        });
+      }
+      qu->wait(); // Wait for computation to finish before stopping timer
+      stopTimer();
+    }
+
+    POLYBENCH_2MM_TEARDOWN_SYCL;
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_2MM_DATA_SETUP_SYCL;
+
+    POLYBENCH_2MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelNonTrivial<
+          RAJA::statement::For<0, RAJA::sycl_global_0<16>,
+            RAJA::statement::For<1, RAJA::sycl_global_1<16>,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY1_RAJA;
+        },
+        [=] (Index_type i, Index_type j, Index_type k,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY2_RAJA;
+        },
+        [=] (Index_type i, Index_type j,
+             Real_type &dot) {
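+          // Lambda<2> in EXEC_POL: runs once per (i, j) after the
+          // sequential k loop and writes the accumulated dot product out.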
+          POLYBENCH_2MM_BODY3_RAJA;
+        }
+      );
+
+      RAJA::kernel_param<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nl},
+                         RAJA::RangeSegment{0, nj}),
+        RAJA::tuple<Real_type>{0.0},
+
+        [=] (Real_type &dot) {
+          POLYBENCH_2MM_BODY4_RAJA;
+        },
+        [=] (Index_type i, Index_type l, Index_type j,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY5_RAJA;
+        },
+        [=] (Index_type i, Index_type l,
+             Real_type &dot) {
+          POLYBENCH_2MM_BODY6_RAJA;
+        }
+      );
+
+    }
+    stopTimer();
+
+    POLYBENCH_2MM_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  POLYBENCH_2MM : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
+
diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp
new file mode 100644
index 000000000..071342647
--- /dev/null
+++ b/src/stream/ADD-Sycl.cpp
@@ -0,0 +1,109 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ADD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+#define ADD_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(a, m_a, iend, qu); \
+  allocAndInitSyclDeviceData(b, m_b, iend, qu); \
+  allocAndInitSyclDeviceData(c, m_c, iend, qu);
+
+#define ADD_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_c, c, iend, qu); \
+  deallocSyclDeviceData(a, qu); \
+  deallocSyclDeviceData(b, qu); \
+  deallocSyclDeviceData(c, qu);
+
+void ADD::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ADD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    ADD_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            ADD_BODY
+          }
+
+        });
+      });
+    }
+
+    qu->wait();
+    stopTimer();
+
+    ADD_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    ADD_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        ADD_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    ADD_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  ADD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace stream
+} // end namespace rajaperf
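+
+// Both variants launch asynchronously (the raw submit and
+// RAJA::sycl_exec<block_size, true>), so kernels from successive
+// repetitions can queue up back-to-back and the single qu->wait() after
+// the repetition loop is what closes the timing region.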
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp
new file mode 100644
index 000000000..1408bb1c0
--- /dev/null
+++ b/src/stream/COPY-Sycl.cpp
@@ -0,0 +1,108 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "COPY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+#define COPY_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(a, m_a, iend, qu); \
+  allocAndInitSyclDeviceData(c, m_c, iend, qu);
+
+#define COPY_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_c, c, iend, qu); \
+  deallocSyclDeviceData(a, qu); \
+  deallocSyclDeviceData(c, qu);
+
+void COPY::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  COPY_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    COPY_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            COPY_BODY
+          }
+
+        });
+      });
+    }
+
+    qu->wait();
+    stopTimer();
+
+    COPY_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    COPY_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        COPY_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    COPY_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  COPY : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp
new file mode 100644
index 000000000..cdc8605d4
--- /dev/null
+++ b/src/stream/DOT-Sycl.cpp
@@ -0,0 +1,114 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "DOT.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace stream
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+
+#define DOT_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(a, m_a, iend, qu); \
+  allocAndInitSyclDeviceData(b, m_b, iend, qu);
+
+#define DOT_DATA_TEARDOWN_SYCL \
+  deallocSyclDeviceData(a, qu); \
+  deallocSyclDeviceData(b, qu);
+
+void DOT::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  DOT_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    DOT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type dot = m_dot_init;
+
+      {
+        sycl::buffer<Real_type> buf_dot(&dot, 1);
+
+        const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+        qu->submit([&] (sycl::handler& h) {
+
+          auto sumReduction = sycl::reduction(buf_dot, h, sycl::plus<Real_type>());
+
+          h.parallel_for(sycl::nd_range<1>{grid_size, block_size},
+                         sumReduction,
+                         [=] (sycl::nd_item<1> item, auto& dot) {
+
+            Index_type i = item.get_global_id(0);
+            if (i < iend) {
+              DOT_BODY;
+            }
+
+          });
+        });
+      }
+
+      m_dot += dot;
+
+    }
+    stopTimer();
+
+    DOT_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    DOT_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::ReduceSum<RAJA::sycl_reduce, Real_type> dot(m_dot_init);
+
+      RAJA::forall< RAJA::sycl_exec_nontrivial<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        DOT_BODY;
+      });
+
+      m_dot += static_cast<Real_type>(dot.get());
+
+    }
+    stopTimer();
+
+    DOT_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  DOT : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace stream
+} // end namespace rajaperf
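+
+// The Base_SYCL reduction goes through a one-element sycl::buffer: the
+// result is committed back to `dot` when buf_dot is destroyed at the end of
+// the inner scope, which also synchronizes that repetition before m_dot is
+// updated on the host. A USM sketch of the same pattern (illustrative only,
+// not what this file does):
+//
+//   Real_type* ddot = sycl::malloc_shared<Real_type>(1, *qu);
+//   *ddot = m_dot_init;
+//   qu->parallel_for(sycl::range<1>(iend),
+//                    sycl::reduction(ddot, sycl::plus<Real_type>()),
+//                    [=] (sycl::id<1> i, auto& sum) { sum += a[i] * b[i]; }).wait();
+//   m_dot += *ddot;
+//   sycl::free(ddot, *qu);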
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp
new file mode 100644
index 000000000..e2c6aa0f7
--- /dev/null
+++ b/src/stream/MUL-Sycl.cpp
@@ -0,0 +1,106 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MUL.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+#define MUL_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(b, m_b, iend, qu); \
+  allocAndInitSyclDeviceData(c, m_c, iend, qu);
+
+#define MUL_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_b, b, iend, qu); \
+  deallocSyclDeviceData(b, qu); \
+  deallocSyclDeviceData(c, qu)
+
+void MUL::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  MUL_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    MUL_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            MUL_BODY
+          }
+        });
+      });
+    }
+
+    qu->wait();
+    stopTimer();
+
+    MUL_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    MUL_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        MUL_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    MUL_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  MUL : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp
new file mode 100644
index 000000000..45083881c
--- /dev/null
+++ b/src/stream/TRIAD-Sycl.cpp
@@ -0,0 +1,109 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC.
+//
+// Produced at the Lawrence Livermore National Laboratory
+//
+// LLNL-CODE-738930
+//
+// All rights reserved.
+//
+// This file is part of the RAJA Performance Suite.
+//
+// For details about use and distribution, please read RAJAPerf/LICENSE.
+//
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "TRIAD.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include <iostream>
+
+#include <CL/sycl.hpp>
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace stream
+{
+
+  //
+  // Define thread block size for SYCL execution
+  //
+  const size_t block_size = 256;
+
+#define TRIAD_DATA_SETUP_SYCL \
+  allocAndInitSyclDeviceData(a, m_a, iend, qu); \
+  allocAndInitSyclDeviceData(b, m_b, iend, qu); \
+  allocAndInitSyclDeviceData(c, m_c, iend, qu);
+
+#define TRIAD_DATA_TEARDOWN_SYCL \
+  getSyclDeviceData(m_a, a, iend, qu); \
+  deallocSyclDeviceData(a, qu); \
+  deallocSyclDeviceData(b, qu); \
+  deallocSyclDeviceData(c, qu);
+
+void TRIAD::runSyclVariant(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  TRIAD_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    TRIAD_DATA_SETUP_SYCL;
+
+    const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            TRIAD_BODY
+          }
+
+        });
+      });
+    }
+
+    qu->wait();
+    stopTimer();
+
+    TRIAD_DATA_TEARDOWN_SYCL;
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    TRIAD_DATA_SETUP_SYCL;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< RAJA::sycl_exec<block_size, true> >(
+        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
+        TRIAD_BODY;
+      });
+
+    }
+    qu->wait();
+    stopTimer();
+
+    TRIAD_DATA_TEARDOWN_SYCL;
+
+  } else {
+     std::cout << "\n  TRIAD : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+} // end namespace stream
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL

From 6aff454a58c931fda910ecefd4dad46ec684fd69 Mon Sep 17 00:00:00 2001
From: Thorsten Blass
Date: Thu, 7 Dec 2023 05:15:00 -0600
Subject: [PATCH 191/454] [OpenMP] Fixed 'undeclared identifier' error in
 COPY8-OMPTarget.cpp

---
 src/basic/COPY8-OMPTarget.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/basic/COPY8-OMPTarget.cpp b/src/basic/COPY8-OMPTarget.cpp
index 729449861..88d8e3cac 100644
--- a/src/basic/COPY8-OMPTarget.cpp
+++ b/src/basic/COPY8-OMPTarget.cpp
@@ -40,7 +40,8 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun
 
   startTimer();
   for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-    #pragma omp target is_device_ptr(x, y) device( did )
+    #pragma omp target is_device_ptr(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, \
+                                     y3, y4, y5, y6, y7) device( did )
     #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
     for (Index_type i = ibegin; i < iend; ++i ) {
       COPY8_BODY;
@@ -70,4 +71,4 @@ void COPY8::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun
 } // end namespace basic
 } // end namespace rajaperf
 
-#endif  // RAJA_ENABLE_TARGET_OPENMP
+#endif  // RAJA_ENABLE_TARGET_OPENMP

From dd3c7fb28a7d262174d07987a243ae3e32254f60 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 7 Dec 2023 10:25:19 -0800
Subject: [PATCH 192/454] add extra mpi CI jobs

Move ruby mpi CI job into list of extras
---
 .gitlab/jobs/corona.yml |  5 +++++
 .gitlab/jobs/lassen.yml |  7 +++++++
 .gitlab/jobs/poodle.yml |  6 ++++++
 .gitlab/jobs/ruby.yml   | 10 +++++-----
 .gitlab/jobs/tioga.yml  |  5 +++++
 5 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml
index b7c5fe5b7..519d70f47 100644
--- a/.gitlab/jobs/corona.yml
+++ b/.gitlab/jobs/corona.yml
@@ -25,6 +25,11 @@ rocmcc_5_6_0_hip:
 # ${PROJECT_<MACHINE>_DEPS} in the extra jobs. There is no reason not to fully
 # describe the spec here.
 
+rocmcc_5_6_0_hip_mpi:
+  variables:
+    SPEC: "~shared ~openmp +rocm +mpi amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^cray-mpich ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+  extends: .job_on_corona
+
 # With GitLab CI, included files cannot be empty.
 variables:
   INCLUDED_FILE_CANNOT_BE_EMPTY: "True"
diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml
index 00d9f1c36..2cc28e193 100644
--- a/.gitlab/jobs/lassen.yml
+++ b/.gitlab/jobs/lassen.yml
@@ -41,6 +41,13 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled:
     MODULE_LIST: "cuda/11.5.0"
     LASSEN_JOB_ALLOC: "1 --atsdisable -W 30"
 
+gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi:
+  extends: .job_on_lassen
+  variables:
+    SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+    MODULE_LIST: "cuda/11.5.0"
+    LASSEN_JOB_ALLOC: "1 --atsdisable -W 30"
+
 ##########
 # OTHERS
 ##########
diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml
index 9e56823e8..adcf25494 100644
--- a/.gitlab/jobs/poodle.yml
+++ b/.gitlab/jobs/poodle.yml
@@ -40,3 +40,9 @@ intel_2022_1_0:
 # We do not recommend using ${PROJECT_<MACHINE>_VARIANTS} and
 # ${PROJECT_<MACHINE>_DEPS} in the extra jobs. There is no reason not to fully
 # describe the spec here.
+
+intel_2022_1_0_mpi:
+  variables:
+    SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+  allow_failure: true
+  extends: .job_on_poodle
diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml
index c61ec161a..45a0a62f8 100644
--- a/.gitlab/jobs/ruby.yml
+++ b/.gitlab/jobs/ruby.yml
@@ -36,14 +36,14 @@ intel_2022_1_0:
     SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}"
   extends: .job_on_ruby
 
-intel_2022_1_0_mpi:
-  variables:
-    SPEC: "${PROJECT_RUBY_VARIANTS} +mpi %intel@2022.1.0 ${PROJECT_RUBY_DEPS} ^mvapich2"
-  extends: .job_on_ruby
-
 ############
 # Extra jobs
 ############
 # We do not recommend using ${PROJECT_<MACHINE>_VARIANTS} and
 # ${PROJECT_<MACHINE>_DEPS} in the extra jobs. There is no reason not to fully
 # describe the spec here.
+
+intel_2022_1_0_mpi:
+  variables:
+    SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+  extends: .job_on_ruby
diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml
index 1cf05e4e5..f8ce39da4 100644
--- a/.gitlab/jobs/tioga.yml
+++ b/.gitlab/jobs/tioga.yml
@@ -26,3 +26,8 @@ rocmcc_5_6_0_hip_openmp:
   variables:
     SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
   extends: .job_on_tioga
+
+rocmcc_5_6_0_hip_openmp_mpi:
+  variables:
+    SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^cray-mpich ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+  extends: .job_on_tioga

From 83209d182e368a78ffb25f82e0f3240859e6ce66 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 7 Dec 2023 11:17:23 -0800
Subject: [PATCH 193/454] Use new dispatch policies in HALOEXCHANGE_FUSED

---
 src/apps/HALOEXCHANGE_FUSED-Cuda.cpp      | 284 ++++++++++++++++++++-
 src/apps/HALOEXCHANGE_FUSED-Hip.cpp       | 292 +++++++++++++++++++++-
 src/apps/HALOEXCHANGE_FUSED-OMP.cpp       | 242 +++++++++++++++++-
 src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 224 ++++++++++++++++-
 src/apps/HALOEXCHANGE_FUSED-Seq.cpp       | 235 ++++++++++++++++-
 src/apps/HALOEXCHANGE_FUSED.hpp           |  45 +++-
 6 files changed, 1307 insertions(+), 15 deletions(-)

diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
index 791742b72..5c03be327 100644
--- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
+++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
@@ -89,7 +89,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr*
 
 template < size_t block_size >
-void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid)
+void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
 
   AllocatorHolder allocatorHolder;
 
+    using range_segment = RAJA::TypedRangeSegment<Index_type>;
+
     using workgroup_policy = RAJA::WorkGroupPolicy <
                                  RAJA::cuda_work_async<block_size>,
                                  RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-                                 RAJA::constant_stride_array_of_objects >;
+                                 RAJA::constant_stride_array_of_objects,
+                                 RAJA::direct_dispatch<camp::list<range_segment, Packer>,
+                                                       camp::list<range_segment, UnPacker>> >;
 
     using workpool = RAJA::WorkPool< workgroup_policy,
                                      Index_type,
                                      RAJA::xargs<>,
                                      Allocator >;
 
     using workgroup = RAJA::WorkGroup< workgroup_policy,
                                        Index_type,
                                        RAJA::xargs<>,
                                        Allocator >;
 
     using worksite = RAJA::WorkSite< workgroup_policy,
                                      Index_type,
                                      RAJA::xargs<>,
                                      Allocator >;
 
     workpool pool_pack  (allocatorHolder.template getAllocator<char>());
     workpool pool_unpack(allocatorHolder.template getAllocator<char>());
     pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
     pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       for (Index_type l = 0; l < num_neighbors; ++l) {
         Real_ptr buffer = buffers[l];
         Int_ptr list = pack_index_lists[l];
         Index_type len = pack_index_list_lengths[l];
         for (Index_type v = 0; v < num_vars; ++v) {
           Real_ptr var = vars[v];
+          pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list});
           buffer += len;
         }
       }
       workgroup group_pack = pool_pack.instantiate();
       worksite site_pack = group_pack.run(res);
       res.wait();
 
       for (Index_type l = 0; l < num_neighbors; ++l) {
         Real_ptr buffer = buffers[l];
         Int_ptr list = unpack_index_lists[l];
unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void HALOEXCHANGE_FUSED::runCudaVariantFuncPtr(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -244,7 +329,200 @@ void HALOEXCHANGE_FUSED::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Cuda) +template < size_t block_size > +void HALOEXCHANGE_FUSED::runCudaVariantVirtFunc(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::cuda_work_async, + RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_FUSED_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + res.wait(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_FUSED_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + 
haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; + } +} + + +void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantDirect(vid); + + } + + t += 1; + + } + + }); + + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantFuncPtr(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runCudaVariantVirtFunc(vid); + + } + + t += 1; + + } + + }); + + } + +} + +void HALOEXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + + if (vid == RAJA_CUDA) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "funcptr_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "virtfunc_"+std::to_string(block_size)); + + } + + }); + + } + +} } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index bdc168359..72293495a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -89,7 +89,7 @@ __global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) +void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -170,14 +170,192 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + res.wait(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + stopTimer(); + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void HALOEXCHANGE_FUSED::runHipVariantFuncPtr(VariantID vid) +{ #if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_FUSED_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(res); + res.wait(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { + HALOEXCHANGE_FUSED_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(res); + res.wait(); + + } + 
stopTimer(); + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; + } #else - RAJA::ordered, + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; #endif - RAJA::constant_stride_array_of_objects >; +} + + +template < size_t block_size > +void HALOEXCHANGE_FUSED::runHipVariantVirtFunc(VariantID vid) +{ +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using AllocatorHolder = RAJAPoolAllocatorHolder; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -246,9 +424,115 @@ void HALOEXCHANGE_FUSED::runHipVariantImpl(VariantID vid) } else { getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } +#else + getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; +#endif +} + + +void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantDirect(vid); + + } + + t += 1; + + } + + }); + +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantFuncPtr(vid); + + } + + t += 1; + + } + + }); + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantVirtFunc(vid); + + } + + t += 1; + + } + + }); + + } +#endif } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HALOEXCHANGE_FUSED, Hip) +void HALOEXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) +{ + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); + + } + + }); + +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + if (vid == RAJA_HIP) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "funcptr_"+std::to_string(block_size)); + + } + + }); + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "virtfunc_"+std::to_string(block_size)); + + } + + }); + + } +#endif +} } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 6f228a8f6..2f80f9922 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void 
HALOEXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -222,6 +222,199 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ break; } + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + + case RAJA_OpenMP : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +void HALOEXCHANGE_FUSED::runOpenMPVariantVirtFunc(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + case RAJA_OpenMP : { using AllocatorHolder = RAJAPoolAllocatorHolder< @@ -233,7 +426,8 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -311,5 +505,49 @@ void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ #endif } + +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (tune_idx == t) { + + runOpenMPVariantDirect(vid); + + } + + t += 1; + + if (vid == RAJA_OpenMP) { + + if (tune_idx == t) { + + runOpenMPVariantFuncPtr(vid); + + } + + t += 1; + + if (tune_idx == t) { + + runOpenMPVariantVirtFunc(vid); + + } + + t += 1; + + } +} + +void HALOEXCHANGE_FUSED::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "direct"); + + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "funcptr"); + addVariantTuningName(vid, "virtfunc"); + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 4dd2dad31..d55b32bfd 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -63,7 +63,7 @@ namespace apps delete[] h_unpack_ptrs; -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -161,10 +161,186 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_target_work 
/**/, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { + + HALOEXCHANGE_FUSED_DATA_SETUP_OMP_TARGET; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::omp_target_work /**/, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); 
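// Aside (orientation only, not new suite code): every variant in this patch
// follows the same WorkGroup protocol, using the aliases defined just above:
//
//   workpool  pool(allocator);               // backed by the pool allocator
//   pool.enqueue(segment, loop_body);        // once per (neighbor, variable)
//   workgroup group = pool.instantiate();    // freeze the enqueued work
//   worksite  site  = group.run();           // execute as one fused launch
//
// The direct_dispatch policy resolves the Packer/UnPacker calls at compile
// time, while the funcptr and virtfunc tunings dispatch indirectly at run time.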
+ worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +void HALOEXCHANGE_FUSED::runOpenMPTargetVariantVirtFunc(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + if ( vid == RAJA_OpenMPTarget ) { + + HALOEXCHANGE_FUSED_DATA_SETUP_OMP_TARGET; + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_target_work /**/, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -233,6 +409,50 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_U } } + +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (tune_idx == t) { + + runOpenMPTargetVariantDirect(vid); + + } + + t += 1; + + if (vid == RAJA_OpenMPTarget) { + + if (tune_idx == t) { + + runOpenMPTargetVariantFuncPtr(vid); + + } + + t += 1; + + if (tune_idx == t) { + + runOpenMPTargetVariantVirtFunc(vid); + + } + + t += 1; + + } +} + +void HALOEXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "direct"); + + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "funcptr"); + addVariantTuningName(vid, "virtfunc"); + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index e6aa5fdbe..270958583 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -18,7 +18,7 @@ namespace apps { -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void HALOEXCHANGE_FUSED::runSeqVariantDirect(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -149,6 +149,96 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG break; } + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using range_segment = RAJA::TypedRangeSegment; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + 
Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + +#if defined(RUN_RAJA_SEQ) case RAJA_Seq : { using AllocatorHolder = RAJAPoolAllocatorHolder< @@ -160,7 +250,8 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -236,5 +327,145 @@ void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +void HALOEXCHANGE_FUSED::runSeqVariantVirtFunc(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_FUSED_DATA_SETUP; + + switch ( vid ) { + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq : { + + using AllocatorHolder = RAJAPoolAllocatorHolder< + RAJA::basic_mempool::MemPool>; + using Allocator = AllocatorHolder::Allocator; + + AllocatorHolder allocatorHolder; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::seq_work, + RAJA::ordered, + RAJA::constant_stride_array_of_objects, + RAJA::indirect_virtual_function_dispatch >; + + using workpool = RAJA::WorkPool< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + Index_type, + RAJA::xargs<>, + Allocator >; + + workpool pool_pack (allocatorHolder.template getAllocator()); + workpool pool_unpack(allocatorHolder.template getAllocator()); + pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; 
++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_PACK_BODY; + }; + pool_pack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_pack_base_lam ); + buffer += len; + } + } + workgroup group_pack = pool_pack.instantiate(); + worksite site_pack = group_pack.run(); + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_FUSED_UNPACK_BODY; + }; + pool_unpack.enqueue( + RAJA::TypedRangeSegment(0, len), + haloexchange_fused_unpack_base_lam ); + buffer += len; + } + } + workgroup group_unpack = pool_unpack.instantiate(); + worksite site_unpack = group_unpack.run(); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if (tune_idx == t) { + + runSeqVariantDirect(vid); + + } + + t += 1; + + if (vid == RAJA_Seq) { + + if (tune_idx == t) { + + runSeqVariantFuncPtr(vid); + + } + + t += 1; + + if (tune_idx == t) { + + runSeqVariantVirtFunc(vid); + + } + + t += 1; + + } +} + +void HALOEXCHANGE_FUSED::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "direct"); + + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "funcptr"); + addVariantTuningName(vid, "virtfunc"); + } +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index b0af7e60e..f1ce2db36 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -138,12 +138,53 @@ class HALOEXCHANGE_FUSED : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + + void runSeqVariantDirect(VariantID vid); + void runOpenMPVariantDirect(VariantID vid); + template < size_t block_size > + void runCudaVariantDirect(VariantID vid); + template < size_t block_size > + void runHipVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); + + void runSeqVariantFuncPtr(VariantID vid); + void runOpenMPVariantFuncPtr(VariantID vid); + template < size_t block_size > + void runCudaVariantFuncPtr(VariantID vid); + template < size_t block_size > + void runHipVariantFuncPtr(VariantID vid); + void runOpenMPTargetVariantFuncPtr(VariantID vid); + + void runSeqVariantVirtFunc(VariantID vid); + void runOpenMPVariantVirtFunc(VariantID vid); template < size_t block_size > - void runCudaVariantImpl(VariantID vid); + void runCudaVariantVirtFunc(VariantID vid); template < size_t block_size > - void runHipVariantImpl(VariantID vid); + void runHipVariantVirtFunc(VariantID vid); + void runOpenMPTargetVariantVirtFunc(VariantID vid); + + struct Packer { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; + RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALOEXCHANGE_FUSED_PACK_BODY; + } + }; + + struct UnPacker { + Real_ptr buffer; + Real_ptr var; + Int_ptr list; 
+ RAJA_HOST_DEVICE void operator()(Index_type i) const { + HALOEXCHANGE_FUSED_UNPACK_BODY; + } + }; private: static const size_t default_gpu_block_size = 1024; From ab7307fba45b82fc6d3c3677fa5cb2da47a93b00 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Dec 2023 12:17:56 -0800 Subject: [PATCH 194/454] Remove commented out vars --- src/lcals/FIRST_MIN-Cuda.cpp | 2 -- src/lcals/FIRST_MIN-Hip.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index e50c4c7a8..ce87cf949 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -62,7 +62,6 @@ template < size_t block_size > void FIRST_MIN::runCudaVariantBlockHost(VariantID vid) { const Index_type run_reps = getRunReps(); - // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -142,7 +141,6 @@ template < size_t block_size > void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); - // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 9724e6875..968370691 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -62,7 +62,6 @@ template < size_t block_size > void FIRST_MIN::runHipVariantBlockHost(VariantID vid) { const Index_type run_reps = getRunReps(); - // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -145,7 +144,6 @@ template < size_t block_size > void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid) { const Index_type run_reps = getRunReps(); - // const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; From f91e433cc8f530745ab64920f3b4c4a0d0963af1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 13 Dec 2023 10:06:57 -0800 Subject: [PATCH 195/454] Convert stream kernels to new GPU launch method --- src/stream/ADD-Cuda.cpp | 21 +++++++++++++++------ src/stream/ADD-Hip.cpp | 15 +++++++++++---- src/stream/COPY-Cuda.cpp | 21 +++++++++++++++------ src/stream/COPY-Hip.cpp | 15 +++++++++++---- src/stream/DOT-Cuda.cpp | 14 ++++++++++---- src/stream/DOT-Hip.cpp | 12 ++++++++---- src/stream/MUL-Cuda.cpp | 21 +++++++++++++++------ src/stream/MUL-Hip.cpp | 15 +++++++++++---- src/stream/TRIAD-Cuda.cpp | 21 +++++++++++++++------ src/stream/TRIAD-Hip.cpp | 15 +++++++++++---- 10 files changed, 122 insertions(+), 48 deletions(-) diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index e8e095665..482df0c9d 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -52,8 +52,11 @@ void ADD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - add<<>>( c, a, b, - iend ); + + RPlaunchCudaKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); cudaErrchk( cudaGetLastError() ); } @@ -64,12 +67,18 @@ void ADD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto add_lambda = [=] __device__ (Index_type i) { + ADD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - ADD_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + 
ibegin, iend, add_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 50ab42466..6f8dbe965 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -51,8 +51,11 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((add), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), c, a, b, - iend ); + + RPlaunchHipKernel( (add), + grid_size, block_size, + shmem, res.get_stream(), + c, a, b, iend ); hipErrchk( hipGetLastError() ); } @@ -69,8 +72,12 @@ void ADD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, add_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, add_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 3bea59764..2a5020e1e 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -51,8 +51,11 @@ void COPY::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - copy<<>>( c, a, - iend ); + + RPlaunchCudaKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); cudaErrchk( cudaGetLastError() ); } @@ -63,12 +66,18 @@ void COPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto copy_lambda = [=] __device__ (Index_type i) { + COPY_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - COPY_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 305892fdb..52f1be4d4 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -51,8 +51,11 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((copy), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - c, a, iend ); + + RPlaunchHipKernel( (copy), + grid_size, block_size, + shmem, res.get_stream(), + c, a, iend ); hipErrchk( hipGetLastError() ); } @@ -69,8 +72,12 @@ void COPY::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 7398b58ff..39e0f9302 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -75,8 +75,11 @@ void DOT::runCudaVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - dot<<>>( - a, b, dprod, m_dot_init, iend ); + + RPlaunchCudaKernel( (dot), + grid_size, block_size, + 
shmem, res.get_stream(), + a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); Real_type rdprod; @@ -136,8 +139,11 @@ void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - dot<<>>( - a, b, dprod, m_dot_init, iend ); + + RPlaunchCudaKernel( (dot), + grid_size, block_size, + shmem, res.get_stream(), + a, b, dprod, m_dot_init, iend ); cudaErrchk( cudaGetLastError() ); Real_type rdprod; diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index db68e7685..5e955dbdc 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -75,8 +75,10 @@ void DOT::runHipVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), + + RPlaunchHipKernel( (dot), + grid_size, block_size, + shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); @@ -137,8 +139,10 @@ void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((dot), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), + + RPlaunchHipKernel( (dot), + grid_size, block_size, + shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index adfebfd01..5c210f344 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -51,8 +51,11 @@ void MUL::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - mul<<>>( b, c, alpha, - iend ); + + RPlaunchCudaKernel( (mul), + grid_size, block_size, + shmem, res.get_stream(), + b, c, alpha, iend ); cudaErrchk( cudaGetLastError() ); } @@ -63,12 +66,18 @@ void MUL::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto mul_lambda = [=] __device__ (Index_type i) { + MUL_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - MUL_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, mul_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 8a2394612..f231d4bad 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -51,8 +51,11 @@ void MUL::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((mul), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), b, c, alpha, - iend ); + + RPlaunchHipKernel( (mul), + grid_size, block_size, + shmem, res.get_stream(), + b, c, alpha, iend ); hipErrchk( hipGetLastError() ); } @@ -69,8 +72,12 @@ void MUL::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, mul_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, 
res.get_stream(),
+                        ibegin, iend, mul_lambda );
       hipErrchk( hipGetLastError() );

     }
diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp
index af3af1c63..ac8cfc8f6 100644
--- a/src/stream/TRIAD-Cuda.cpp
+++ b/src/stream/TRIAD-Cuda.cpp
@@ -51,8 +51,11 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      triad<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>( a, b, c, alpha,
-                                                                             iend );
+
+      RPlaunchCudaKernel( (triad<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          a, b, c, alpha, iend );
       cudaErrchk( cudaGetLastError() );
 
     }
@@ -63,12 +66,18 @@ void TRIAD::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto triad_lambda = [=] __device__ (Index_type i) {
+        TRIAD_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-        TRIAD_BODY;
-      });
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size, decltype(triad_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, triad_lambda );
       cudaErrchk( cudaGetLastError() );
 
     }
diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp
index a8a5b9f99..667f67278 100644
--- a/src/stream/TRIAD-Hip.cpp
+++ b/src/stream/TRIAD-Hip.cpp
@@ -51,8 +51,11 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((triad<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), a, b, c, alpha,
-                                                                                 iend );
+
+      RPlaunchHipKernel( (triad<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         a, b, c, alpha, iend );
       hipErrchk( hipGetLastError() );
 
     }
@@ -69,8 +72,12 @@ void TRIAD::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(triad_lambda)>),
-        grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_lambda);
+
+      RPlaunchHipKernel( (lambda_hip_forall<block_size, decltype(triad_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, triad_lambda );
       hipErrchk( hipGetLastError() );
 
     }

From 5f4962ab3edacadbf2a31d1d0b3948f760b6513a Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 13 Dec 2023 10:44:14 -0800
Subject: [PATCH 196/454] Generalize rajaperf::seq_for

It now takes a camp::list of types and calls the function with a
default-initialized object of each type in order. GPU block sizes were
changed from integer sequences to lists of integral constants to work
with the changes.
---
 src/common/GPUUtils.hpp    | 80 +++++++++++++-------------------------
 src/rajaperf_config.hpp.in | 38 ++++++++++++------
 2 files changed, 54 insertions(+), 64 deletions(-)

diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 003efd858..1356be990 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -44,50 +44,40 @@ constexpr size_t lesser_of_squarest_factor_pair_helper(size_t n, size_t guess)
          : lesser_of_squarest_factor_pair_helper(n, guess - 1); // continue searching
 }
 
-// class to get the size of a camp::int_seq
-template < typename IntSeq >
-struct SizeOfIntSeq;
-///
-template < size_t... Is >
-struct SizeOfIntSeq<camp::int_seq<size_t, Is...>>
-{
-   static const size_t size = sizeof...(Is);
-};
-
 // class to help prepend integers to a list
-// this is used for the false case where I is not prepended to IntSeq
-template < bool B, size_t I, typename IntSeq >
+// this is used for the false case where I is not prepended to List
+template < bool B, typename T, typename List >
 struct conditional_prepend
 {
-  using type = IntSeq;
+  using type = List;
 };
-/// this is used for the true case where I is prepended to IntSeq
-template < size_t I, size_t... Is >
-struct conditional_prepend<true, I, camp::int_seq<size_t, Is...>>
+/// this is used for the true case where I is prepended to List
+template < typename T, typename... Ts >
+struct conditional_prepend<true, T, camp::list<Ts...>>
 {
-  using type = camp::int_seq<size_t, I, Is...>;
+  using type = camp::list<T, Ts...>;
 };
 
-// class to help create a sequence that is only the valid values in IntSeq
-template < typename validity_checker, typename IntSeq >
+// class to help create a sequence that is only the valid values in List
+template < typename validity_checker, typename List >
 struct remove_invalid;
 
 // base case where the list is empty, use the empty list
 template < typename validity_checker >
-struct remove_invalid<validity_checker, camp::int_seq<size_t>>
+struct remove_invalid<validity_checker, camp::list<>>
 {
-  using type = camp::int_seq<size_t>;
+  using type = camp::list<>;
 };
 
-// check validity of I and conditionally prepend I to a recursively generated
+// check validity of T and conditionally prepend T to a recursively generated
 // list of valid values
-template < typename validity_checker, size_t I, size_t... Is >
-struct remove_invalid<validity_checker, camp::int_seq<size_t, I, Is...>>
+template < typename validity_checker, typename T, typename... Ts >
+struct remove_invalid<validity_checker, camp::list<T, Ts...>>
 {
   using type = typename conditional_prepend<
-      validity_checker::template valid<I>(),
-      I,
-      typename remove_invalid<validity_checker, camp::int_seq<size_t, Is...>>::type
+      validity_checker::valid(T{}),
+      T,
+      typename remove_invalid<validity_checker, camp::list<Ts...>>::type
     >::type;
 };
 
@@ -119,36 +109,30 @@ constexpr size_t greater_of_squarest_factor_pair(size_t n)
 // always true
 struct AllowAny
 {
-  template < size_t I >
-  static constexpr bool valid() { return true; }
+  static constexpr bool valid(size_t RAJAPERF_UNUSED_ARG(i)) { return true; }
 };
 
-// true if of I is a multiple of N, false otherwise
+// true if of i is a multiple of N, false otherwise
 template < size_t N >
 struct MultipleOf
 {
-  template < size_t I >
-  static constexpr bool valid() { return (I/N)*N == I; }
+  static constexpr bool valid(size_t i) { return (i/N)*N == i; }
 };
 
-// true if the sqrt of I is representable as a size_t, false otherwise
+// true if the sqrt of i is representable as a size_t, false otherwise
 struct ExactSqrt
 {
-  template < size_t I >
-  static constexpr bool valid() { return sqrt(I)*sqrt(I) == I; }
+  static constexpr bool valid(size_t i) { return sqrt(i)*sqrt(i) == i; }
 };
 
-template < size_t... block_sizes >
-using list_type = camp::int_seq<size_t, block_sizes...>;
-
-// A camp::int_seq of size_t's that is rajaperf::configuration::gpu_block_sizes
-// if rajaperf::configuration::gpu_block_sizes is not empty
-// and a camp::int_seq of default_block_size otherwise
-// with invalid entries removed according to validity_checker
+// A camp::list of camp::integral_constant types.
+// If gpu_block_sizes from the configuration is not empty it is those gpu_block_sizes,
+// otherwise it is a list containing just default_block_size.
+// Invalid entries are removed according to validity_checker in either case.
 template < size_t default_block_size, typename validity_checker = AllowAny >
 using make_list_type =
       typename detail::remove_invalid<validity_checker,
-        typename std::conditional< (detail::SizeOfIntSeq<rajaperf::configuration::gpu_block_sizes>::size > 0),
+        typename std::conditional< (camp::size<rajaperf::configuration::gpu_block_sizes>::value > 0),
           rajaperf::configuration::gpu_block_sizes,
           list_type<default_block_size>
         >::type
@@ -156,16 +140,6 @@ using make_list_type =
 
 } // closing brace for gpu_block_size namespace
 
-//compile time loop over an integer sequence
-//this allows for creating a loop over a compile time constant variable
-template <size_t... Is, typename Func>
-inline void seq_for(camp::int_seq<size_t, Is...> const&, Func&& func)
-{
-  // braced init lists are evaluated in order
-  int seq_unused_array[] = {(func(camp::integral_constant<size_t, Is>{}), 0)...};
-  RAJAPERF_UNUSED_VAR(seq_unused_array);
-}
-
 } // closing brace for rajaperf namespace
 
 //
diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in
index d545c0b93..ff55f4feb 100644
--- a/src/rajaperf_config.hpp.in
+++ b/src/rajaperf_config.hpp.in
@@ -22,6 +22,7 @@
 
 #include "RAJA/config.hpp"
 #include "camp/number.hpp"
+#include "camp/list.hpp"
 
 #include
@@ -42,8 +43,23 @@
 #include
 #endif
 
+// Squash compiler warnings about unused variables
+template < typename ... Ts >
+inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+
+// Squash compiler warnings about unused arguments
+#define RAJAPERF_UNUSED_ARG(...)
+
 namespace rajaperf {
 
+namespace gpu_block_size {
+
+// helper alias to convert comma separated integer literals into list
+template < size_t... Is >
+using list_type = camp::list< camp::integral_constant<size_t, Is>... >;
+
+} // closing brace for gpu_block_size namespace
+
 struct configuration {
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
@@ -92,11 +108,8 @@ const adiak::catstring adiak_systype_build = std::string("@RAJAPERF_BUILD_SYSTYP
 const adiak::catstring adiak_machine_build = std::string("@RAJAPERF_BUILD_HOST@");
 #endif
 
-// helper alias to void trailing comma in no-arg case
-template < size_t... Is >
-using i_seq = camp::int_seq<size_t, Is...>;
 // List of GPU block sizes
-using gpu_block_sizes = i_seq<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
+using gpu_block_sizes = gpu_block_size::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>;
 
 // Name of user who ran code
 std::string user_run;
@@ -110,13 +123,16 @@ std::string machine_run;
 
 };
 
-} // closing brace for rajaperf namespace
-
-// Squash compiler warnings about unused variables
-template < typename ... Ts >
-inline void RAJAPERF_UNUSED_VAR(Ts&&...) { }
+//compile time loop over an integer sequence
+//this allows for creating a loop over a compile time constant variable
+template <typename... Ts, typename Func>
+inline void seq_for(camp::list<Ts...> const&, Func&& func)
+{
+  // braced init lists are evaluated in order
+  int seq_unused_array[] = {(func(Ts{}), 0)...};
+  RAJAPERF_UNUSED_VAR(seq_unused_array);
+}
 
-// Squash compiler warnings about unused arguments
-#define RAJAPERF_UNUSED_ARG(...)
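As a usage illustration of the generalized seq_for (a minimal sketch, not part
of the patch; it assumes only the definitions added in this diff and that
camp::integral_constant exposes a static `value` member like the std one):

    #include "camp/list.hpp"
    #include "camp/number.hpp"
    #include <cstddef>
    #include <cstdio>

    // a hand-written stand-in for the configured gpu_block_sizes list
    using block_sizes = rajaperf::gpu_block_size::list_type<128, 256, 512>;

    void print_block_sizes()
    {
      // seq_for default-initializes one integral_constant per list entry
      // and invokes the functor with it, in order
      rajaperf::seq_for(block_sizes{}, [&](auto block_size) {
        std::printf("block size: %zu\n", size_t(decltype(block_size)::value));
      });
    }

This is the same mechanism the per-kernel tuning code uses when it writes
seq_for(gpu_block_sizes_type{}, ...) and passes block_size on as a template
argument.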
+} // closing brace for rajaperf namespace #endif // closing endif for header file include guard From e7e120e5c24464f0e1c24406c7067213ba60b36b Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 13 Dec 2023 15:09:12 -0800 Subject: [PATCH 197/454] Convert more basic kernels to new GPU launch method --- src/basic/ARRAY_OF_PTRS-Cuda.cpp | 21 ++++++++++++------ src/basic/ARRAY_OF_PTRS-Hip.cpp | 15 +++++++++---- src/basic/COPY8-Cuda.cpp | 31 ++++++++++++++++++--------- src/basic/COPY8-Hip.cpp | 25 ++++++++++++++------- src/basic/DAXPY-Cuda.cpp | 6 +++--- src/basic/DAXPY_ATOMIC-Cuda.cpp | 21 ++++++++++++------ src/basic/IF_QUAD-Cuda.cpp | 22 ++++++++++++++----- src/basic/IF_QUAD-Hip.cpp | 17 +++++++++++---- src/basic/INDEXLIST-Cuda.cpp | 14 ++++++------ src/basic/INDEXLIST-Hip.cpp | 15 +++++++------ src/basic/INDEXLIST_3LOOP-Cuda.cpp | 13 +++++++---- src/basic/INDEXLIST_3LOOP-Hip.cpp | 13 +++++++---- src/basic/INIT3-Cuda.cpp | 23 ++++++++++++++------ src/basic/INIT3-Hip.cpp | 17 +++++++++++---- src/basic/INIT_VIEW1D-Cuda.cpp | 20 ++++++++++++----- src/basic/INIT_VIEW1D-Hip.cpp | 15 +++++++++---- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 23 ++++++++++++++------ src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 16 ++++++++++---- src/basic/MULADDSUB-Cuda.cpp | 23 ++++++++++++++------ src/basic/MULADDSUB-Hip.cpp | 17 +++++++++++---- src/basic/NESTED_INIT-Cuda.cpp | 27 +++++++++++++++-------- src/basic/NESTED_INIT-Hip.cpp | 29 +++++++++++++++---------- src/basic/PI_ATOMIC-Cuda.cpp | 24 +++++++++++++++------ src/basic/PI_ATOMIC-Hip.cpp | 19 +++++++++++----- src/basic/PI_REDUCE-Cuda.cpp | 22 ++++++++++++------- src/basic/PI_REDUCE-Hip.cpp | 20 +++++++++++------ src/basic/REDUCE3_INT-Cuda.cpp | 30 +++++++++++++++----------- src/basic/REDUCE3_INT-Hip.cpp | 31 ++++++++++++++++----------- src/basic/TRAP_INT-Cuda.cpp | 30 +++++++++++++++----------- src/basic/TRAP_INT-Hip.cpp | 29 +++++++++++++++---------- 30 files changed, 429 insertions(+), 199 deletions(-) diff --git a/src/basic/ARRAY_OF_PTRS-Cuda.cpp b/src/basic/ARRAY_OF_PTRS-Cuda.cpp index b6b53e249..1df18e4bb 100644 --- a/src/basic/ARRAY_OF_PTRS-Cuda.cpp +++ b/src/basic/ARRAY_OF_PTRS-Cuda.cpp @@ -54,8 +54,11 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - array_of_ptrs<<>>( - y, x_array, array_size, iend ); + + RPlaunchCudaKernel( (array_of_ptrs), + grid_size, block_size, + shmem, res.get_stream(), + y, x_array, array_size, iend ); cudaErrchk( cudaGetLastError() ); } @@ -66,12 +69,18 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto array_of_ptrs_lambda = [=] __device__ (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - ARRAY_OF_PTRS_BODY(x); - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, array_of_ptrs_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp index 26c0f8800..4c3f39d9f 100644 --- a/src/basic/ARRAY_OF_PTRS-Hip.cpp +++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp @@ -54,8 +54,11 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - 
diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp
index 26c0f8800..4c3f39d9f 100644
--- a/src/basic/ARRAY_OF_PTRS-Hip.cpp
+++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp
@@ -54,8 +54,11 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((array_of_ptrs<block_size>),dim3(grid_size), dim3(block_size), shmem, res.get_stream(),
-          y, x_array, array_size, iend );
+
+      RPlaunchHipKernel( (array_of_ptrs<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         y, x_array, array_size, iend );
       hipErrchk( hipGetLastError() );
 
     }
@@ -72,8 +75,12 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      hipLaunchKernelGGL((lambda_hip_forall<block_size, decltype(array_of_ptrs_lambda)>),
-         grid_size, block_size, shmem, res.get_stream(), ibegin, iend, array_of_ptrs_lambda);
+
+      RPlaunchHipKernel( (lambda_hip_forall<block_size,
+                                            decltype(array_of_ptrs_lambda)>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, array_of_ptrs_lambda );
       hipErrchk( hipGetLastError() );
 
     }
diff --git a/src/basic/COPY8-Cuda.cpp b/src/basic/COPY8-Cuda.cpp
index b5bfeafbc..b29360256 100644
--- a/src/basic/COPY8-Cuda.cpp
+++ b/src/basic/COPY8-Cuda.cpp
@@ -23,8 +23,10 @@ namespace basic
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7,
-                      Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7,
+__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3,
+                      Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7,
+                      Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3,
+                      Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7,
                       Index_type iend)
 {
   Index_type i = blockIdx.x * block_size + threadIdx.x;
@@ -52,10 +54,13 @@ void COPY8::runCudaVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      copy8<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-          y0, y1, y2, y3, y4, y5, y6, y7,
-          x0, x1, x2, x3, x4, x5, x6, x7,
-          iend );
+
+      RPlaunchCudaKernel( (copy8<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          y0, y1, y2, y3, y4, y5, y6, y7,
+                          x0, x1, x2, x3, x4, x5, x6, x7,
+                          iend );
       cudaErrchk( cudaGetLastError() );
 
     }
@@ -66,12 +71,18 @@ void COPY8::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+      auto copy8_lambda = [=] __device__ (Index_type i) {
+        COPY8_BODY;
+      };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-      lambda_cuda_forall<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(
-        ibegin, iend, [=] __device__ (Index_type i) {
-          COPY8_BODY;
-      });
+
+      RPlaunchCudaKernel( (lambda_cuda_forall<block_size,
+                                              decltype(copy8_lambda)>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, copy8_lambda );
       cudaErrchk( cudaGetLastError() );
 
     }
diff --git a/src/basic/COPY8-Hip.cpp b/src/basic/COPY8-Hip.cpp
index fe24822f5..2c6fa5781 100644
--- a/src/basic/COPY8-Hip.cpp
+++ b/src/basic/COPY8-Hip.cpp
@@ -23,8 +23,10 @@ namespace basic
 
 template < size_t block_size >
 __launch_bounds__(block_size)
-__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3, Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7,
-                      Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3, Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7,
+__global__ void copy8(Real_ptr y0, Real_ptr y1, Real_ptr y2, Real_ptr y3,
+                      Real_ptr y4, Real_ptr y5, Real_ptr y6, Real_ptr y7,
+                      Real_ptr x0, Real_ptr x1, Real_ptr x2, Real_ptr x3,
+                      Real_ptr x4, Real_ptr x5, Real_ptr x6, Real_ptr x7,
                       Index_type iend)
 {
   Index_type i = blockIdx.x * block_size + threadIdx.x;
@@ -53,10 +55,13 @@ void COPY8::runHipVariantImpl(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-
hipLaunchKernelGGL((copy8),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - y0, y1, y2, y3, y4, y5, y6, y7, - x0, x1, x2, x3, x4, x5, x6, x7, - iend ); + + RPlaunchHipKernel( (copy8), + grid_size, block_size, + shmem, res.get_stream(), + y0, y1, y2, y3, y4, y5, y6, y7, + x0, x1, x2, x3, x4, x5, x6, x7, + iend ); hipErrchk( hipGetLastError() ); } @@ -73,8 +78,12 @@ void COPY8::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy8_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, copy8_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index eb852bf49..707facc84 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -67,13 +67,13 @@ void DAXPY::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - auto daxpy_lambda = [=] __device__ (Index_type i) { DAXPY_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (lambda_cuda_forall), grid_size, block_size, diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 29a142d01..50b46bf8d 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -52,8 +52,11 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - daxpy_atomic<<>>( y, x, a, - iend ); + + RPlaunchCudaKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); cudaErrchk( cudaGetLastError() ); } @@ -64,12 +67,18 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto daxpy_atomic_lambda = [=] __device__ (Index_type i) { + DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - DAXPY_ATOMIC_RAJA_BODY(RAJA::cuda_atomic); - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 0702e7d2d..01354199b 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -53,7 +53,13 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - ifquad<<>>( x1, x2, a, b, c, iend ); + + RPlaunchCudaKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); cudaErrchk( cudaGetLastError() ); } @@ -63,12 +69,18 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ifquad_lambda = [=] __device__ (Index_type i) { + IF_QUAD_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - 
IF_QUAD_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 5b47d786b..4a0192094 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -53,8 +53,13 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x1, x2, a, b, c, - iend ); + + RPlaunchHipKernel( (ifquad), + grid_size, block_size, + shmem, res.get_stream(), + x1, x2, + a, b, c, + iend ); hipErrchk( hipGetLastError() ); } @@ -71,8 +76,12 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, ifquad_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, ifquad_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index de674e5a7..3a682cfe9 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -268,12 +268,14 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); + cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + RPlaunchCudaKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index def89b8c5..84c02c045 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -268,12 +268,15 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - indexlist - <<>>( - x+ibegin, list+ibegin, - block_counts, grid_counts, block_readys, - len, iend-ibegin ); + hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + + RPlaunchHipKernel( (indexlist), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, list+ibegin, + block_counts, grid_counts, block_readys, + len, iend-ibegin ); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 878e11d2b..8aff37e96 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -101,8 +101,11 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - indexlist_conditional<<>>( - x, counts, iend ); + + RPlaunchCudaKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); 
cudaErrchk( cudaGetLastError() ); cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, @@ -114,8 +117,10 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) scan_size, stream)); - indexlist_make_list<<>>( - list, counts, len, iend ); + RPlaunchCudaKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize(stream) ); diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 7c2751bc0..2e4d8aa66 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -112,8 +112,11 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((indexlist_conditional), grid_size, block_size, shmem, stream, - x, counts, iend ); + + RPlaunchHipKernel( (indexlist_conditional), + grid_size, block_size, + shmem, stream, + x, counts, iend ); hipErrchk( hipGetLastError() ); #if defined(__HIPCC__) @@ -136,8 +139,10 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) stream)); #endif - hipLaunchKernelGGL((indexlist_make_list), grid_size, block_size, shmem, stream, - list, counts, len, iend ); + RPlaunchHipKernel( (indexlist_make_list), + grid_size, block_size, + shmem, stream, + list, counts, len, iend ); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize(stream) ); diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index a6f61d73a..f30f74655 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -53,8 +53,13 @@ void INIT3::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - init3<<>>( out1, out2, out3, in1, in2, - iend ); + + RPlaunchCudaKernel( (init3), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); cudaErrchk( cudaGetLastError() ); } @@ -65,12 +70,18 @@ void INIT3::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto init3_lambda = [=] __device__ (Index_type i) { + INIT3_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT3_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, init3_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index 99f5eec2b..c22d0b419 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -53,8 +53,13 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out1, out2, out3, in1, in2, - iend ); + + RPlaunchHipKernel( (init3), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); hipErrchk( hipGetLastError() ); } @@ -71,8 +76,12 @@ void INIT3::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, init3_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, 
res.get_stream(), + ibegin, iend, init3_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index ca6fbdf3c..828c87fb4 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -53,7 +53,11 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - initview1d<<>>( a, v, iend ); + + RPlaunchCudaKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); cudaErrchk( cudaGetLastError() ); } @@ -64,12 +68,18 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 0951d954f..8ef65ddb1 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -53,8 +53,11 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, iend ); + + RPlaunchHipKernel( (initview1d), + grid_size, block_size, + shmem, res.get_stream(), + a, v, iend ); hipErrchk( hipGetLastError() ); } @@ -71,8 +74,12 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index 7d9bee43b..e973d6328 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -54,9 +54,12 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - initview1d_offset<<>>( a, v, - ibegin, - iend ); + + RPlaunchCudaKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); cudaErrchk( cudaGetLastError() ); } @@ -67,12 +70,18 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto initview1d_offset_lambda = [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp 
b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2fb16872f..2300fec69 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -54,8 +54,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - a, v, ibegin, iend ); + + RPlaunchHipKernel( (initview1d_offset), + grid_size, block_size, + shmem, res.get_stream(), + a, v, + ibegin, iend ); hipErrchk( hipGetLastError() ); } @@ -72,8 +76,12 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_offset_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, initview1d_offset_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index 3f0dec4dd..f999394a8 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -53,8 +53,13 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - muladdsub<<>>( out1, out2, out3, in1, in2, - iend ); + + RPlaunchCudaKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); cudaErrchk( cudaGetLastError() ); } @@ -65,12 +70,18 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto muladdsub_lambda = [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index 9d292001f..cc199ef17 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -53,8 +53,13 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - out1, out2, out3, in1, in2, iend ); + + RPlaunchHipKernel( (muladdsub), + grid_size, block_size, + shmem, res.get_stream(), + out1, out2, out3, + in1, in2, + iend ); hipErrchk( hipGetLastError() ); } @@ -71,8 +76,12 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, muladdsub_lambda ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 605778eb7..3b5053dcf 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -89,9 +89,11 @@ 
void NESTED_INIT::runCudaVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init - <<>>(array, - ni, nj, nk); + RPlaunchCudaKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); cudaErrchk( cudaGetLastError() ); } @@ -102,16 +104,23 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + NESTED_INIT_BODY; + }; + NESTED_INIT_THREADS_PER_BLOCK_CUDA; NESTED_INIT_NBLOCKS_CUDA; constexpr size_t shmem = 0; - nested_init_lam - <<>>(ni, nj, nk, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - } - ); + RPlaunchCudaKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index b7d023d7f..7a58485ee 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -89,9 +89,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) NESTED_INIT_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((nested_init), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - array, ni, nj, nk); + RPlaunchHipKernel( + (nested_init), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + array, ni, nj, nk ); hipErrchk( hipGetLastError() ); } @@ -102,18 +104,23 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - NESTED_INIT_THREADS_PER_BLOCK_HIP; - NESTED_INIT_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto nested_init_lambda = [=] __device__ (Index_type i, Index_type j, + auto nested_init_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { NESTED_INIT_BODY; }; - hipLaunchKernelGGL((nested_init_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, nk, nested_init_lambda); + NESTED_INIT_THREADS_PER_BLOCK_HIP; + NESTED_INIT_NBLOCKS_HIP; + constexpr size_t shmem = 0; + + RPlaunchHipKernel( + (nested_init_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, nk, + nested_init_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 7446618fa..0ffebe002 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -58,7 +58,13 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - pi_atomic<<>>( pi, dx, iend ); + + RPlaunchCudaKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); cudaErrchk( cudaGetLastError() ); Real_type rpi; @@ -75,13 +81,19 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + auto pi_atomic_lambda = [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, 
pi_atomic_lambda ); cudaErrchk( cudaGetLastError() ); Real_type rpi; diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1e6fd2e7a..f61e5f86b 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -58,7 +58,13 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((atomic_pi),grid_size, block_size, shmem, res.get_stream(), pi, dx, iend ); + + RPlaunchHipKernel( (pi_atomic), + grid_size, block_size, + shmem, res.get_stream(), + pi, + dx, + iend ); hipErrchk( hipGetLastError() ); Real_type rpi; @@ -75,16 +81,19 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); - auto atomic_pi_lambda = [=] __device__ (Index_type i) { + auto pi_atomic_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, atomic_pi_lambda); - hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, pi_atomic_lambda ); Real_type rpi; RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index df7ef4e11..4a8ae56d2 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -76,10 +76,13 @@ void PI_REDUCE::runCudaVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - pi_reduce<<>>( dx, - pi, m_pi_init, - iend ); + + RPlaunchCudaKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); cudaErrchk( cudaGetLastError() ); Real_type rpi; @@ -139,10 +142,13 @@ void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - pi_reduce<<>>( dx, - pi, m_pi_init, - iend ); + + RPlaunchCudaKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); cudaErrchk( cudaGetLastError() ); Real_type rpi; diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 095ca8e16..e4e11c218 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -76,9 +76,13 @@ void PI_REDUCE::runHipVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - dx, pi, m_pi_init, iend ); + + RPlaunchHipKernel( (pi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); hipErrchk( hipGetLastError() ); Real_type rpi; @@ -138,9 +142,13 @@ void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL( (pi_reduce), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - dx, pi, m_pi_init, iend ); + + RPlaunchHipKernel( (pi_reduce), + grid_size, block_size, + 
shmem, res.get_stream(), + dx, + pi, m_pi_init, + iend ); hipErrchk( hipGetLastError() ); Real_type rpi; diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 0272af043..3e4781433 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -90,12 +90,15 @@ void REDUCE3_INT::runCudaVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - reduce3int<<>>(vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); + + RPlaunchCudaKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); cudaErrchk( cudaGetLastError() ); Int_type rvmem[3]; @@ -162,12 +165,15 @@ void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - reduce3int<<>>(vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); + + RPlaunchCudaKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); cudaErrchk( cudaGetLastError() ); Int_type rvmem[3]; diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 73726baef..b8b59a034 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -90,12 +90,15 @@ void REDUCE3_INT::runHipVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); + + RPlaunchHipKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); hipErrchk( hipGetLastError() ); Int_type rvmem[3]; @@ -162,13 +165,15 @@ void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); + + RPlaunchHipKernel( (reduce3int), + grid_size, block_size, + shmem, res.get_stream(), + vec, + vmem + 0, m_vsum_init, + vmem + 1, m_vmin_init, + vmem + 2, m_vmax_init, + iend ); hipErrchk( hipGetLastError() ); Int_type rvmem[3]; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 5bafd38e1..8bf85508d 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -95,12 +95,15 @@ void TRAP_INT::runCudaVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - trapint<<>>(x0, xp, - y, yp, - h, - sumx, - iend); + + RPlaunchCudaKernel( (trapint), + grid_size, block_size, + shmem, res.get_stream(), + x0, xp, + y, yp, + h, + sumx, + iend); cudaErrchk( cudaGetLastError() ); Real_type rsumx; @@ -160,12 +163,15 @@ void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) const 
size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
-      trapint<block_size><<<grid_size, block_size, shmem, res.get_stream()>>>(x0, xp,
-                                                   y, yp,
-                                                   h,
-                                                   sumx,
-                                                   iend);
+
+      RPlaunchCudaKernel( (trapint<block_size>),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          x0, xp,
+                          y, yp,
+                          h,
+                          sumx,
+                          iend);
       cudaErrchk( cudaGetLastError() );
 
       Real_type rsumx;
diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp
index fe6d5a5f9..94b791f04 100644
--- a/src/basic/TRAP_INT-Hip.cpp
+++ b/src/basic/TRAP_INT-Hip.cpp
@@ -95,11 +95,15 @@ void TRAP_INT::runHipVariantBlockAtomic(VariantID vid)
 
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = sizeof(Real_type)*block_size;
-      hipLaunchKernelGGL((trapint<block_size>), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x0, xp,
-                                                  y, yp,
-                                                  h,
-                                                  sumx,
-                                                  iend);
+
+      RPlaunchHipKernel( (trapint<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x0, xp,
+                         y, yp,
+                         h,
+                         sumx,
+                         iend);
       hipErrchk( hipGetLastError() );
 
       Real_type rsumx;
@@ -159,12 +163,15 @@ void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid)
 
       const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       const size_t grid_size = std::min(normal_grid_size, max_grid_size);
-      hipLaunchKernelGGL((trapint<block_size>), dim3(grid_size), dim3(block_size),
-                         shmem, res.get_stream(), x0, xp,
-                                                  y, yp,
-                                                  h,
-                                                  sumx,
-                                                  iend);
+
+      RPlaunchHipKernel( (trapint<block_size>),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x0, xp,
+                         y, yp,
+                         h,
+                         sumx,
+                         iend);
       hipErrchk( hipGetLastError() );
 
       Real_type rsumx;

From a2e0b75c7caa5031f7238e384a750fd835ad6c10 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 13 Dec 2023 15:47:16 -0800
Subject: [PATCH 198/454] Abstract FUSED workgroup implementations

Make a templated member function to handle different tunings.
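One way to picture the dispatch_helper parameter used throughout the diffs below: each helper is a small tag type that supplies the WorkGroup dispatch policy plus the name used to build tuning strings. The structs here are hypothetical stand-ins written against RAJA's public dispatch policies, not the Suite's actual definitions:

    #include <string>

    #include "RAJA/RAJA.hpp"
    #include "camp/list.hpp"

    // Hypothetical helpers mirroring how dispatch_helper is used below.
    struct direct_dispatch_helper
    {
      template < typename... invokable_lists >
      using dispatch_policy = RAJA::direct_dispatch<invokable_lists...>;
      static std::string get_name() { return "direct"; }
    };

    struct funcptr_dispatch_helper
    {
      template < typename... invokable_lists >
      using dispatch_policy = RAJA::indirect_function_call_dispatch;
      static std::string get_name() { return "funcptr"; }
    };

    struct virtfunc_dispatch_helper
    {
      template < typename... invokable_lists >
      using dispatch_policy = RAJA::indirect_virtual_function_dispatch;
      static std::string get_name() { return "virtfunc"; }
    };

    using workgroup_dispatch_helpers = camp::list<direct_dispatch_helper,
                                                  funcptr_dispatch_helper,
                                                  virtfunc_dispatch_helper>;

With this shape, a single runCudaVariantWorkGroup<block_size, dispatch_helper> can form its policy as typename dispatch_helper::template dispatch_policy<...>, and the tuning loops can register names like direct_256 via get_name(), replacing the three near-identical FuncPtr/VirtFunc/direct member functions removed below.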
---
 src/apps/HALOEXCHANGE_FUSED-Cuda.cpp      | 247 +++------------------
 src/apps/HALOEXCHANGE_FUSED-Hip.cpp       | 259 +++-------------------
 src/apps/HALOEXCHANGE_FUSED-OMP.cpp       | 229 +++----------------
 src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 219 +++---------------
 src/apps/HALOEXCHANGE_FUSED-Seq.cpp       | 225 +++----------------
 src/apps/HALOEXCHANGE_FUSED.hpp           |  61 +++--
 6 files changed, 206 insertions(+), 1034 deletions(-)

diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
index 5c03be327..e23abf424 100644
--- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
+++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp
@@ -163,83 +163,13 @@ void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)
 
       HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA;
 
-  } else if ( vid == RAJA_CUDA ) {
-
-    using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::cuda::device_mempool_type>;
-    using Allocator = AllocatorHolder::Allocator;
-
-    AllocatorHolder allocatorHolder;
-
-    using range_segment = RAJA::TypedRangeSegment<Index_type>;
-
-    using workgroup_policy = RAJA::WorkGroupPolicy <
-                                 RAJA::cuda_work_async<block_size>,
-                                 RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-                                 RAJA::constant_stride_array_of_objects,
-                                 RAJA::direct_dispatch<camp::list<range_segment, Packer>,
-                                                       camp::list<range_segment, UnPacker>> >;
-
-    using workpool = RAJA::WorkPool< workgroup_policy,
-                                     Index_type,
-                                     RAJA::xargs<>,
-                                     Allocator >;
-
-    using workgroup = RAJA::WorkGroup< workgroup_policy,
-                                       Index_type,
-                                       RAJA::xargs<>,
-                                       Allocator >;
-
-    using worksite = RAJA::WorkSite< workgroup_policy,
-                                     Index_type,
-                                     RAJA::xargs<>,
-                                     Allocator >;
-
-    workpool pool_pack  (allocatorHolder.template getAllocator<char>());
-    workpool pool_unpack(allocatorHolder.template getAllocator<char>());
-    pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
-    pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull);
-
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
-        Int_ptr list = pack_index_lists[l];
-        Index_type len = pack_index_list_lengths[l];
-        for (Index_type v = 0; v < num_vars; ++v) {
-          Real_ptr var = vars[v];
-          pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list});
-          buffer += len;
-        }
-      }
-      workgroup group_pack = pool_pack.instantiate();
-      worksite site_pack = group_pack.run(res);
-      res.wait();
-
-      for (Index_type l = 0; l < num_neighbors; ++l) {
-        Real_ptr buffer = buffers[l];
-        Int_ptr list = unpack_index_lists[l];
-        Index_type len = unpack_index_list_lengths[l];
-        for (Index_type v = 0; v < num_vars; ++v) {
-          Real_ptr var = vars[v];
-          pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list});
-          buffer += len;
-        }
-      }
-      workgroup group_unpack = pool_unpack.instantiate();
-      worksite site_unpack = group_unpack.run(res);
-      res.wait();
-
-    }
-    stopTimer();
-
   } else {
      getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl;
   }
 }
 
-template < size_t block_size >
-void HALOEXCHANGE_FUSED::runCudaVariantFuncPtr(VariantID vid)
+template < size_t block_size, typename dispatch_helper >
+void HALOEXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
 
@@ -254,102 +184,17 @@ void HALOEXCHANGE_FUSED::runCudaVariantFuncPtr(VariantID vid)
 
   AllocatorHolder allocatorHolder;
 
-  using workgroup_policy = RAJA::WorkGroupPolicy <
-                               RAJA::cuda_work_async<block_size>,
-                               RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-                               RAJA::constant_stride_array_of_objects,
-                               RAJA::indirect_function_call_dispatch >;
-
-  using workpool = RAJA::WorkPool< workgroup_policy,
-
Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(res); - res.wait(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(res); - res.wait(); - - } - stopTimer(); - - } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void HALOEXCHANGE_FUSED::runCudaVariantVirtFunc(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - auto res{getCudaResource()}; - - HALOEXCHANGE_FUSED_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder; - using Allocator = AllocatorHolder::Allocator; + using range_segment = RAJA::TypedRangeSegment; - AllocatorHolder allocatorHolder; + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, RAJA::constant_stride_array_of_objects, - RAJA::indirect_virtual_function_dispatch >; + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -380,12 +225,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantVirtFunc(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } @@ -399,12 +239,7 @@ void HALOEXCHANGE_FUSED::runCudaVariantVirtFunc(VariantID vid) Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - 
pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -420,30 +255,11 @@ void HALOEXCHANGE_FUSED::runCudaVariantVirtFunc(VariantID vid) } } - void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - if (tune_idx == t) { - - runCudaVariantDirect(vid); - - } - - t += 1; - - } - - }); - - - if (vid == RAJA_CUDA) { + if (vid == Base_CUDA || vid == Lambda_CUDA) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -452,7 +268,7 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { - runCudaVariantFuncPtr(vid); + runCudaVariantDirect(vid); } @@ -462,18 +278,26 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) }); + } + + if (vid == RAJA_CUDA) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - runCudaVariantVirtFunc(vid); + if (tune_idx == t) { - } + runCudaVariantWorkGroup(vid); - t += 1; + } + + t += 1; + + }); } @@ -485,36 +309,33 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) void HALOEXCHANGE_FUSED::setCudaTuningDefinitions(VariantID vid) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - addVariantTuningName(vid, "direct_"+std::to_string(block_size)); - - } - - }); - - if (vid == RAJA_CUDA) { + if (vid == Base_CUDA || vid == Lambda_CUDA) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); } }); + } + + if (vid == RAJA_CUDA) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "virtfunc_"+std::to_string(block_size)); + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); } diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 72293495a..38ac329b4 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -163,85 +163,14 @@ void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP; - } else if ( vid == RAJA_HIP ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using range_segment = RAJA::TypedRangeSegment; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, 
- RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(res); - res.wait(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(res); - res.wait(); - - } - stopTimer(); - } else { getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantFuncPtr(VariantID vid) +template < size_t block_size, typename dispatch_helper > +void HALOEXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid) { -#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) const Index_type run_reps = getRunReps(); auto res{getHipResource()}; @@ -255,107 +184,17 @@ void HALOEXCHANGE_FUSED::runHipVariantFuncPtr(VariantID vid) AllocatorHolder allocatorHolder; - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::hip_work_async, - RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(res); - res.wait(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = 
unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(res); - res.wait(); - - } - stopTimer(); - - } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; - } -#else - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; -#endif -} - - -template < size_t block_size > -void HALOEXCHANGE_FUSED::runHipVariantVirtFunc(VariantID vid) -{ -#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL - const Index_type run_reps = getRunReps(); - - auto res{getHipResource()}; - - HALOEXCHANGE_FUSED_DATA_SETUP; - - if ( vid == RAJA_HIP ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder; - using Allocator = AllocatorHolder::Allocator; + using range_segment = RAJA::TypedRangeSegment; - AllocatorHolder allocatorHolder; + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::hip_work_async, RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, RAJA::constant_stride_array_of_objects, - RAJA::indirect_virtual_function_dispatch >; + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -386,12 +225,7 @@ void HALOEXCHANGE_FUSED::runHipVariantVirtFunc(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } @@ -405,12 +239,7 @@ void HALOEXCHANGE_FUSED::runHipVariantVirtFunc(VariantID vid) Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -424,35 +253,13 @@ void HALOEXCHANGE_FUSED::runHipVariantVirtFunc(VariantID vid) } else { getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; } -#else - getCout() << "\n HALOEXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl; -#endif } - void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - if (tune_idx == t) { - - runHipVariantDirect(vid); - - } - - t += 1; - - } - - }); - -#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL - if (vid == RAJA_HIP) { + if (vid == Base_HIP || vid == Lambda_HIP) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -461,7 +268,7 @@ void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { - runHipVariantFuncPtr(vid); + runHipVariantDirect(vid); } @@ -471,67 +278,69 @@ 
void HALOEXCHANGE_FUSED::runHipVariant(VariantID vid, size_t tune_idx) }); + } + + if (vid == RAJA_HIP) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - runHipVariantVirtFunc(vid); + if (tune_idx == t) { - } + runHipVariantWorkGroup(vid); - t += 1; + } + + t += 1; + + }); } }); } -#endif } void HALOEXCHANGE_FUSED::setHipTuningDefinitions(VariantID vid) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { - - addVariantTuningName(vid, "direct_"+std::to_string(block_size)); - - } - - }); - -#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL - if (vid == RAJA_HIP) { + if (vid == Base_HIP || vid == Lambda_HIP) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "funcptr_"+std::to_string(block_size)); + addVariantTuningName(vid, "direct_"+std::to_string(block_size)); } }); + } + + if (vid == RAJA_HIP) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "virtfunc_"+std::to_string(block_size)); + seq_for(hip_workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { + + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()+"_"+std::to_string(block_size)); + + }); } }); } -#endif } } // end namespace apps diff --git a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp index 2f80f9922..43e04b471 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMP.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMP.cpp @@ -222,78 +222,6 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) break; } - case RAJA_OpenMP : { - - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using range_segment = RAJA::TypedRangeSegment; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_work, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr 
list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - } - stopTimer(); - - break; - } - default : { getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; } @@ -305,7 +233,8 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) #endif } -void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) +template < typename dispatch_helper > +void HALOEXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -323,11 +252,17 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -358,12 +293,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } @@ -376,12 +306,7 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -405,147 +330,55 @@ void HALOEXCHANGE_FUSED::runOpenMPVariantFuncPtr(VariantID vid) #endif } -void HALOEXCHANGE_FUSED::runOpenMPVariantVirtFunc(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) { -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - - HALOEXCHANGE_FUSED_DATA_SETUP; - - switch ( vid ) { + size_t t = 0; - case RAJA_OpenMP : { + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; + if (tune_idx == t) { - AllocatorHolder allocatorHolder; + runOpenMPVariantDirect(vid); - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_work, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_virtual_function_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + } - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + t += 1; - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + } - workpool pool_pack (allocatorHolder.template 
getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + if (vid == RAJA_OpenMP) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + if (tune_idx == t) { - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); + runOpenMPVariantWorkGroup(vid); } - stopTimer(); - break; - } + t += 1; - default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; - } + }); } - -#else - RAJA_UNUSED_VAR(vid); -#endif } - -void HALOEXCHANGE_FUSED::runOpenMPVariant(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::setOpenMPTuningDefinitions(VariantID vid) { - size_t t = 0; - - if (tune_idx == t) { + if (vid == Base_OpenMP || vid == Lambda_OpenMP) { - runOpenMPVariantDirect(vid); + addVariantTuningName(vid, "direct"); } - t += 1; - if (vid == RAJA_OpenMP) { - if (tune_idx == t) { + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - runOpenMPVariantFuncPtr(vid); + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); - } + }); - t += 1; - - if (tune_idx == t) { - - runOpenMPVariantVirtFunc(vid); - - } - - t += 1; - - } -} - -void HALOEXCHANGE_FUSED::setOpenMPTuningDefinitions(VariantID vid) -{ - addVariantTuningName(vid, "direct"); - - if (vid == RAJA_OpenMP) { - addVariantTuningName(vid, "funcptr"); - addVariantTuningName(vid, "virtfunc"); } } diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index d55b32bfd..8f06ebd2a 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -153,83 +153,13 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantDirect(VariantID vid) HALOEXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_OMP_TARGET; - } else if ( vid == RAJA_OpenMPTarget ) { - - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using range_segment = RAJA::TypedRangeSegment; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_target_work /**/, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - 
using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - } - stopTimer(); - - HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } -void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) +template < typename dispatch_helper > +void HALOEXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -237,19 +167,23 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) if ( vid == RAJA_OpenMPTarget ) { - HALOEXCHANGE_FUSED_DATA_SETUP_OMP_TARGET; - using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; using Allocator = AllocatorHolder::Allocator; AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_target_work /**/, RAJA::ordered, RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -280,12 +214,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } @@ -298,12 +227,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } 
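
A note on the pattern these HALOEXCHANGE_FUSED hunks converge on: each back end
keeps one run method templated on a dispatch helper, and the variant entry point
selects it by tuning index. A minimal sketch of that selection loop follows; the
explicit decltype template argument is an assumption about how the helper type
reaches the templated method, since the hunks themselves do not spell it out.

    // Sketch only. Tuning indices are assigned in the same order here and in
    // setOpenMPTargetTuningDefinitions(), so index t always pairs with the
    // tuning name returned by that helper's get_name().
    size_t t = 0;
    seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) {
      if (tune_idx == t) {
        runOpenMPTargetVariantWorkGroup<decltype(dispatch_helper)>(vid);
      }
      t += 1;
    });
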
@@ -320,136 +244,55 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantFuncPtr(VariantID vid) } } -void HALOEXCHANGE_FUSED::runOpenMPTargetVariantVirtFunc(VariantID vid) +void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { - const Index_type run_reps = getRunReps(); - - HALOEXCHANGE_FUSED_DATA_SETUP; + size_t t = 0; - if ( vid == RAJA_OpenMPTarget ) { + if (vid == Base_OpenMPTarget || vid == Lambda_OpenMPTarget) { - HALOEXCHANGE_FUSED_DATA_SETUP_OMP_TARGET; + if (tune_idx == t) { - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; + runOpenMPTargetVariantDirect(vid); - AllocatorHolder allocatorHolder; + } - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::omp_target_work /**/, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_virtual_function_dispatch >; + t += 1; - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + } - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + if (vid == RAJA_OpenMPTarget) { - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + if (tune_idx == t) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + runOpenMPTargetVariantWorkGroup(vid); - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); + t += 1; - } - stopTimer(); + }); - } else { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } } - -void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) { - size_t t = 0; + if (vid == Base_OpenMPTarget || vid == Lambda_OpenMPTarget) { - if (tune_idx == t) { - - runOpenMPTargetVariantDirect(vid); + addVariantTuningName(vid, "direct"); } - t += 1; - if (vid == RAJA_OpenMPTarget) { - if (tune_idx == t) { - - runOpenMPTargetVariantFuncPtr(vid); - - } - - t += 1; - - if (tune_idx == t) { - - runOpenMPTargetVariantVirtFunc(vid); - - } - - t += 1; + 
seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - } -} + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); -void HALOEXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) -{ - addVariantTuningName(vid, "direct"); + }); - if (vid == RAJA_OpenMPTarget) { - addVariantTuningName(vid, "funcptr"); - addVariantTuningName(vid, "virtfunc"); } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp index 270958583..8bcd52e7d 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Seq.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Seq.cpp @@ -148,78 +148,6 @@ void HALOEXCHANGE_FUSED::runSeqVariantDirect(VariantID vid) break; } - - case RAJA_Seq : { - - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; - - AllocatorHolder allocatorHolder; - - using range_segment = RAJA::TypedRangeSegment; - - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::seq_work, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::direct_dispatch, - camp::list> >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; - - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); - buffer += len; - } - } - workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); - - } - stopTimer(); - - break; - } #endif // RUN_RAJA_SEQ default : { @@ -230,7 +158,8 @@ void HALOEXCHANGE_FUSED::runSeqVariantDirect(VariantID vid) } -void HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) +template < typename dispatch_helper > +void HALOEXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -247,11 +176,17 @@ void HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) AllocatorHolder allocatorHolder; + using range_segment = RAJA::TypedRangeSegment; + + using dispatch_policy = typename dispatch_helper::template dispatch_policy< + camp::list, + camp::list>; + using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, RAJA::constant_stride_array_of_objects, - RAJA::indirect_function_call_dispatch >; + dispatch_policy >; using workpool = RAJA::WorkPool< workgroup_policy, Index_type, @@ -282,12 +217,7 @@ void 
HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); buffer += len; } } @@ -300,12 +230,7 @@ void HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) Index_type len = unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); + pool_unpack.enqueue(range_segment(0, len), UnPacker{buffer, var, list}); buffer += len; } } @@ -327,143 +252,55 @@ void HALOEXCHANGE_FUSED::runSeqVariantFuncPtr(VariantID vid) } -void HALOEXCHANGE_FUSED::runSeqVariantVirtFunc(VariantID vid) +void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) { - const Index_type run_reps = getRunReps(); - - HALOEXCHANGE_FUSED_DATA_SETUP; - - switch ( vid ) { + size_t t = 0; -#if defined(RUN_RAJA_SEQ) - case RAJA_Seq : { + if (vid == Base_Seq || vid == Lambda_Seq) { - using AllocatorHolder = RAJAPoolAllocatorHolder< - RAJA::basic_mempool::MemPool>; - using Allocator = AllocatorHolder::Allocator; + if (tune_idx == t) { - AllocatorHolder allocatorHolder; + runSeqVariantDirect(vid); - using workgroup_policy = RAJA::WorkGroupPolicy < - RAJA::seq_work, - RAJA::ordered, - RAJA::constant_stride_array_of_objects, - RAJA::indirect_virtual_function_dispatch >; - - using workpool = RAJA::WorkPool< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + } - using workgroup = RAJA::WorkGroup< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + t += 1; - using worksite = RAJA::WorkSite< workgroup_policy, - Index_type, - RAJA::xargs<>, - Allocator >; + } - workpool pool_pack (allocatorHolder.template getAllocator()); - workpool pool_unpack(allocatorHolder.template getAllocator()); - pool_pack.reserve(num_neighbors * num_vars, 1024ull*1024ull); - pool_unpack.reserve(num_neighbors * num_vars, 1024ull*1024ull); + if (vid == RAJA_Seq) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_PACK_BODY; - }; - pool_pack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_pack_base_lam ); - buffer += len; - } - } - workgroup group_pack = pool_pack.instantiate(); - worksite site_pack = group_pack.run(); + if (tune_idx == t) { - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_fused_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_FUSED_UNPACK_BODY; - }; - pool_unpack.enqueue( - RAJA::TypedRangeSegment(0, len), - haloexchange_fused_unpack_base_lam ); - buffer += len; - } - } 
- workgroup group_unpack = pool_unpack.instantiate(); - worksite site_unpack = group_unpack.run(); + runSeqVariantWorkGroup(vid); } - stopTimer(); - break; - } -#endif // RUN_RAJA_SEQ + t += 1; - default : { - getCout() << "\n HALOEXCHANGE_FUSED : Unknown variant id = " << vid << std::endl; - } + }); } - } -void HALOEXCHANGE_FUSED::runSeqVariant(VariantID vid, size_t tune_idx) +void HALOEXCHANGE_FUSED::setSeqTuningDefinitions(VariantID vid) { - size_t t = 0; - - if (tune_idx == t) { + if (vid == Base_Seq || vid == Lambda_Seq) { - runSeqVariantDirect(vid); + addVariantTuningName(vid, "direct"); } - t += 1; - if (vid == RAJA_Seq) { - if (tune_idx == t) { + seq_for(workgroup_dispatch_helpers{}, [&](auto dispatch_helper) { - runSeqVariantFuncPtr(vid); + addVariantTuningName(vid, decltype(dispatch_helper)::get_name()); - } + }); - t += 1; - - if (tune_idx == t) { - - runSeqVariantVirtFunc(vid); - - } - - t += 1; - - } -} - -void HALOEXCHANGE_FUSED::setSeqTuningDefinitions(VariantID vid) -{ - addVariantTuningName(vid, "direct"); - - if (vid == RAJA_Seq) { - addVariantTuningName(vid, "funcptr"); - addVariantTuningName(vid, "virtfunc"); } } diff --git a/src/apps/HALOEXCHANGE_FUSED.hpp b/src/apps/HALOEXCHANGE_FUSED.hpp index f1ce2db36..833588b53 100644 --- a/src/apps/HALOEXCHANGE_FUSED.hpp +++ b/src/apps/HALOEXCHANGE_FUSED.hpp @@ -117,6 +117,40 @@ namespace rajaperf { class RunParams; +struct direct_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::direct_dispatch; + static std::string get_name() { return "direct"; } +}; + +struct indirect_function_call_dispatch_helper +{ + template < typename... Ts > + using dispatch_policy = RAJA::indirect_function_call_dispatch; + static std::string get_name() { return "funcptr"; } +}; + +struct indirect_virtual_function_dispatch_helper +{ + template < typename... 
Ts > + using dispatch_policy = RAJA::indirect_virtual_function_dispatch; + static std::string get_name() { return "virtfunc"; } +}; + +using workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper, + indirect_function_call_dispatch_helper, + indirect_virtual_function_dispatch_helper >; + +using hip_workgroup_dispatch_helpers = camp::list< + direct_dispatch_helper +#ifdef RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL + ,indirect_function_call_dispatch_helper + ,indirect_virtual_function_dispatch_helper +#endif + >; + namespace apps { @@ -146,27 +180,22 @@ class HALOEXCHANGE_FUSED : public KernelBase void runSeqVariantDirect(VariantID vid); void runOpenMPVariantDirect(VariantID vid); + void runOpenMPTargetVariantDirect(VariantID vid); template < size_t block_size > void runCudaVariantDirect(VariantID vid); template < size_t block_size > void runHipVariantDirect(VariantID vid); - void runOpenMPTargetVariantDirect(VariantID vid); - void runSeqVariantFuncPtr(VariantID vid); - void runOpenMPVariantFuncPtr(VariantID vid); - template < size_t block_size > - void runCudaVariantFuncPtr(VariantID vid); - template < size_t block_size > - void runHipVariantFuncPtr(VariantID vid); - void runOpenMPTargetVariantFuncPtr(VariantID vid); - - void runSeqVariantVirtFunc(VariantID vid); - void runOpenMPVariantVirtFunc(VariantID vid); - template < size_t block_size > - void runCudaVariantVirtFunc(VariantID vid); - template < size_t block_size > - void runHipVariantVirtFunc(VariantID vid); - void runOpenMPTargetVariantVirtFunc(VariantID vid); + template < typename dispatch_policy > + void runSeqVariantWorkGroup(VariantID vid); + template < typename dispatch_policy > + void runOpenMPVariantWorkGroup(VariantID vid); + template < typename dispatch_policy > + void runOpenMPTargetVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_policy > + void runCudaVariantWorkGroup(VariantID vid); + template < size_t block_size, typename dispatch_policy > + void runHipVariantWorkGroup(VariantID vid); struct Packer { Real_ptr buffer; From 26ac87def6ff4cc8150062bbbfbbb4c85d266e1c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 10:39:51 -0800 Subject: [PATCH 199/454] Re-add conjunction impl for C++ std < 17. 
Removed in earlier merge of develop --- src/rajaperf_config.hpp.in | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 7d7c2561e..d0e41694f 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -124,6 +124,17 @@ std::string machine_run; }; +#if __cplusplus < 201703L +// Implement std::conjunction from https://en.cppreference.com/w/cpp/types/conjunction +template struct conjunction : std::true_type {}; +template struct conjunction : B1 {}; +template +struct conjunction + : std::conditional_t, B1> {}; +#else +using std::conjunction; +#endif + //compile time loop over an integer sequence //this allows for creating a loop over a compile time constant variable template From 36df3bbe64726f2bc81cd22eb2fc5ec4223a326d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 11:06:45 -0800 Subject: [PATCH 200/454] Fix compilation error and make function name consistent with Cuda --- src/basic/PI_ATOMIC-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index f61e5f86b..27e428c0c 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -23,7 +23,7 @@ namespace basic template < size_t block_size > __launch_bounds__(block_size) -__global__ void atomic_pi(Real_ptr pi, +__global__ void pi_atomic(Real_ptr pi, Real_type dx, Index_type iend) { From 1f6d740edd6e4f06a5fec096c3bca1f086ce0655 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 11:34:30 -0800 Subject: [PATCH 201/454] Convert REDUCE_STRUCT kernels, fix type mismatch --- src/basic/REDUCE_STRUCT-Cuda.cpp | 32 ++++++++++++++++++-------------- src/basic/REDUCE_STRUCT-Hip.cpp | 15 ++++++++------- src/basic/REDUCE_STRUCT.hpp | 2 +- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index d21258647..92f7fcb1d 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -121,13 +121,15 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - reduce_struct<<>>( - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N); + + RPlaunchCudaKernel( (reduce_struct), + grid_size, block_size, + shmem, res.get_stream(), + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N ); cudaErrchk( cudaGetLastError() ); Real_type rmem[6]; @@ -205,13 +207,15 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - reduce_struct<<>>( - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N); + + RPlaunchCudaKernel( (reduce_struct), + grid_size, block_size, + shmem, res.get_stream(), + points.x, points.y, + mem, mem+1, mem+2, // xcenter,xmin,xmax + mem+3, mem+4, mem+5, // ycenter,ymin,ymax + m_init_sum, m_init_min, m_init_max, + points.N ); cudaErrchk( cudaGetLastError() ); Real_type rmem[6]; diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp 
index 489c60411..9bede2bc1 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -123,14 +123,14 @@ void REDUCE_STRUCT::runHipVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - hipLaunchKernelGGL((reduce_struct), - dim3(grid_size), dim3(block_size), + RPlaunchHipKernel( (reduce_struct), + grid_size, block_size, shmem, res.get_stream(), - points.x, points.y, + points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, - points.N); + points.N ); hipErrchk( hipGetLastError() ); Real_type rmem[6]; @@ -207,14 +207,15 @@ void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL((reduce_struct), - dim3(grid_size), dim3(block_size), + + RPlaunchHipKernel( (reduce_struct), + grid_size, block_size, shmem, res.get_stream(), points.x, points.y, mem, mem+1, mem+2, // xcenter,xmin,xmax mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, - points.N); + points.N ); hipErrchk( hipGetLastError() ); Real_type rmem[6]; diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index ef70a50ca..10222f941 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -107,7 +107,7 @@ class REDUCE_STRUCT : public KernelBase void runHipVariantBlockDeviceOccGS(VariantID vid); struct PointsType { - Int_type N; + Index_type N; Real_ptr x, y; Real_ptr GetCenter(){return ¢er[0];}; From e929abe6cf0480c8621612e143437c9c74f3b782 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 13:08:33 -0800 Subject: [PATCH 202/454] Convert kernel to new launch methods --- src/basic/MAT_MAT_SHARED-Cuda.cpp | 16 ++++++++++++---- src/basic/MAT_MAT_SHARED-Hip.cpp | 17 ++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index f63af21d7..ee77d0b22 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -73,8 +73,10 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - mat_mat_shared<<>>(N, C, A, B); - + RPlaunchCudaKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -84,7 +86,8 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - lambda_cuda<<>>([=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { + auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { MAT_MAT_SHARED_BODY_0(tile_size) @@ -171,8 +174,13 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) Index_type by = blockIdx.y; if(by < Ny) outer_y(by); } - }); + }; + RPlaunchCudaKernel( (lambda_cuda), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index d548395e3..789698670 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -73,9 +73,10 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; 
irep < run_reps; ++irep) { - hipLaunchKernelGGL((mat_mat_shared), dim3(gridDim), dim3(blockDim), shmem, res.get_stream(), - N, C, A, B); - + RPlaunchHipKernel( (mat_mat_shared), + gridDim, blockDim, + shmem, res.get_stream(), + N, C, A, B ); hipErrchk( hipGetLastError() ); } stopTimer(); @@ -85,7 +86,7 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - auto mat_mat_shared_lam = [=] __device__() { + auto mat_mat_shared_lambda = [=] __device__() { auto outer_y = [&](Index_type by) { auto outer_x = [&](Index_type bx) { @@ -175,9 +176,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((lambda_hip), - gridDim, blockDim, shmem, res.get_stream(), mat_mat_shared_lam); - + RPlaunchHipKernel( (lambda_hip), + gridDim, blockDim, + shmem, res.get_stream(), + mat_mat_shared_lambda ); hipErrchk( hipGetLastError() ); } stopTimer(); From 3ad7beb473949a39e81dde0b2fef9765ab5c6212 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 13:43:47 -0800 Subject: [PATCH 203/454] Convert GPU variants to new launch method --- src/lcals/DIFF_PREDICT-Cuda.cpp | 8 ++++--- src/lcals/DIFF_PREDICT-Hip.cpp | 8 ++++--- src/lcals/EOS-Cuda.cpp | 10 ++++++--- src/lcals/EOS-Hip.cpp | 10 ++++++--- src/lcals/FIRST_DIFF-Cuda.cpp | 8 +++++-- src/lcals/FIRST_DIFF-Hip.cpp | 8 +++++-- src/lcals/FIRST_MIN-Cuda.cpp | 15 +++++++++---- src/lcals/FIRST_MIN-Hip.cpp | 21 +++++++++--------- src/lcals/FIRST_SUM-Cuda.cpp | 14 +++++++----- src/lcals/FIRST_SUM-Hip.cpp | 8 +++++-- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 22 +++++++++++++------ src/lcals/GEN_LIN_RECUR-Hip.cpp | 26 ++++++++++++++-------- src/lcals/HYDRO_1D-Cuda.cpp | 10 ++++++--- src/lcals/HYDRO_1D-Hip.cpp | 10 ++++++--- src/lcals/HYDRO_2D-Cuda.cpp | 37 ++++++++++++++++++++------------ src/lcals/HYDRO_2D-Hip.cpp | 24 +++++++++++++-------- src/lcals/INT_PREDICT-Cuda.cpp | 15 ++++++++----- src/lcals/INT_PREDICT-Hip.cpp | 15 ++++++++----- src/lcals/PLANCKIAN-Cuda.cpp | 10 ++++++--- src/lcals/PLANCKIAN-Hip.cpp | 10 ++++++--- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 15 ++++++++----- src/lcals/TRIDIAG_ELIM-Hip.cpp | 14 ++++++++---- 22 files changed, 212 insertions(+), 106 deletions(-) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index c66ca2598..e51370454 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -52,9 +52,11 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - diff_predict<<>>( px, cx, - offset, - iend ); + + RPlaunchCudaKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 7bd49a994..9c2967a1a 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -53,9 +53,11 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((diff_predict), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px, cx, - offset, - iend ); + + RPlaunchHipKernel( (diff_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, cx, offset, iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index a3583ca53..8483d71bc 100644 --- a/src/lcals/EOS-Cuda.cpp +++ 
b/src/lcals/EOS-Cuda.cpp @@ -52,9 +52,13 @@ void EOS::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - eos<<>>( x, y, z, u, - q, r, t, - iend ); + + RPlaunchCudaKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp index 2cbd78891..22610f813 100644 --- a/src/lcals/EOS-Hip.cpp +++ b/src/lcals/EOS-Hip.cpp @@ -52,9 +52,13 @@ void EOS::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((eos), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z, u, - q, r, t, - iend ); + + RPlaunchHipKernel( (eos), + grid_size, block_size, + shmem, res.get_stream(), + x, y, z, u, + q, r, t, + iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp index 05d73d38f..3731b1afe 100644 --- a/src/lcals/FIRST_DIFF-Cuda.cpp +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -51,8 +51,12 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - first_diff<<>>( x, y, - iend ); + + RPlaunchCudaKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp index 651590776..770c62081 100644 --- a/src/lcals/FIRST_DIFF-Hip.cpp +++ b/src/lcals/FIRST_DIFF-Hip.cpp @@ -51,8 +51,12 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((first_diff), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, - iend ); + + RPlaunchHipKernel( (first_diff), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index ce87cf949..1a29d3052 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -81,8 +81,12 @@ void FIRST_MIN::runCudaVariantBlockHost(VariantID vid) RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - first_min<<>>(x, dminloc, mymin, iend); + + RPlaunchCudaKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); @@ -164,8 +168,11 @@ void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) FIRST_MIN_MINLOC_INIT; RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); - first_min<<>>(x, dminloc, mymin, iend); + RPlaunchCudaKernel( (first_min), + grid_size, block_size, + shmem, res.get_stream(), + x, dminloc, mymin, + iend ); cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 968370691..f8592fdca 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -81,11 +81,12 @@ void FIRST_MIN::runHipVariantBlockHost(VariantID vid) RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); constexpr size_t shmem = 
sizeof(MyMinLoc)*block_size;
-    hipLaunchKernelGGL( (first_min), grid_size, block_size,
-                        shmem, res.get_stream(), x,
-                        dminloc,
-                        mymin,
-                        iend );
+
+    RPlaunchHipKernel( (first_min),
+                       grid_size, block_size,
+                       shmem, res.get_stream(),
+                       x, dminloc, mymin,
+                       iend );
     hipErrchk( hipGetLastError() );

     RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
@@ -167,11 +168,11 @@ void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid)
       FIRST_MIN_MINLOC_INIT;
       RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size);

-      hipLaunchKernelGGL( (first_min), grid_size, block_size,
-                          shmem, res.get_stream(), x,
-                          dminloc,
-                          mymin,
-                          iend );
+      RPlaunchHipKernel( (first_min),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x, dminloc, mymin,
+                         iend );
       hipErrchk( hipGetLastError() );

       RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp
index 2ac57e5a1..7a29a3154 100644
--- a/src/lcals/FIRST_SUM-Cuda.cpp
+++ b/src/lcals/FIRST_SUM-Cuda.cpp
@@ -49,11 +49,15 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = 0;
-      first_sum<<<grid_size, block_size, shmem, res.get_stream()>>>( x, y,
-                                      iend );
-      cudaErrchk( cudaGetLastError() );
+      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      constexpr size_t shmem = 0;
+
+      RPlaunchCudaKernel( (first_sum),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          x, y,
+                          iend );
+      cudaErrchk( cudaGetLastError() );

     }
     stopTimer();
diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp
index 5f48abe69..af7e35534 100644
--- a/src/lcals/FIRST_SUM-Hip.cpp
+++ b/src/lcals/FIRST_SUM-Hip.cpp
@@ -51,8 +51,12 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-    hipLaunchKernelGGL((first_sum),grid_size, block_size, shmem, res.get_stream(),  x, y,
-                                      iend );
+
+    RPlaunchHipKernel( (first_sum),
+                       grid_size, block_size,
+                       shmem, res.get_stream(),
+                       x, y,
+                       iend );
       hipErrchk( hipGetLastError() );

     }
diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
index 3790be5f5..a049f6f10 100644
--- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
@@ -65,15 +65,25 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;

       const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size);
-      genlinrecur1<<<grid_size1, block_size, shmem, res.get_stream()>>>( b5, stb5, sa, sb,
-                                           kb5i,
-                                           N );
+
+      RPlaunchCudaKernel( (genlinrecur1),
+                          grid_size1, block_size,
+                          shmem, res.get_stream(),
+                          b5, stb5,
+                          sa, sb,
+                          kb5i,
+                          N );
       cudaErrchk( cudaGetLastError() );

       const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size);
-      genlinrecur2<<<grid_size2, block_size, shmem, res.get_stream()>>>( b5, stb5, sa, sb,
-                                           kb5i,
-                                           N );
+
+      RPlaunchCudaKernel( (genlinrecur2),
+                          grid_size2, block_size,
+                          shmem, res.get_stream(),
+                          b5, stb5,
+                          sa, sb,
+                          kb5i,
+                          N );
       cudaErrchk( cudaGetLastError() );

     }
diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp
index b4dc1be54..b583c795b 100644
--- a/src/lcals/GEN_LIN_RECUR-Hip.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp
@@ -65,17 +65,25 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;

       const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(N, block_size);
-      hipLaunchKernelGGL((genlinrecur1), grid_size1, block_size, shmem, res.get_stream(),
-                         b5, stb5, sa, sb,
-                         kb5i,
-                         N );
-      hipErrchk( hipGetLastError() );
+
+      RPlaunchHipKernel( (genlinrecur1),
+                         grid_size1, block_size,
+                         shmem, res.get_stream(),
+                         b5, stb5,
+                         sa, sb,
+                         kb5i,
+                         N );
+      hipErrchk( hipGetLastError() );

       const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size);
-      hipLaunchKernelGGL((genlinrecur2), grid_size2, block_size, shmem, res.get_stream(),
-                         b5, stb5, sa, sb,
-                         kb5i,
-                         N );
+
+      RPlaunchHipKernel( (genlinrecur2),
+                         grid_size2, block_size,
+                         shmem, res.get_stream(),
+                         b5, stb5,
+                         sa, sb,
+                         kb5i,
+                         N );
       hipErrchk( hipGetLastError() );

     }
diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp
index 960f80c49..3d5a66f1e 100644
--- a/src/lcals/HYDRO_1D-Cuda.cpp
+++ b/src/lcals/HYDRO_1D-Cuda.cpp
@@ -52,9 +52,13 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-      hydro_1d<<<grid_size, block_size, shmem, res.get_stream()>>>( x, y, z,
-                                       q, r, t,
-                                       iend );
+
+      RPlaunchCudaKernel( (hydro_1d),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          x, y, z,
+                          q, r, t,
+                          iend );
       cudaErrchk( cudaGetLastError() );

     }
diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp
index c04da1eb2..078f6269f 100644
--- a/src/lcals/HYDRO_1D-Hip.cpp
+++ b/src/lcals/HYDRO_1D-Hip.cpp
@@ -52,9 +52,13 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-      hipLaunchKernelGGL((hydro_1d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, z,
-                                       q, r, t,
-                                       iend );
+
+      RPlaunchHipKernel( (hydro_1d),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         x, y, z,
+                         q, r, t,
+                         iend );
       hipErrchk( hipGetLastError() );

     }
diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp
index 2f46572b4..be8b37afa 100644
--- a/src/lcals/HYDRO_2D-Cuda.cpp
+++ b/src/lcals/HYDRO_2D-Cuda.cpp
@@ -111,24 +111,33 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid)
       HYDRO_2D_THREADS_PER_BLOCK_CUDA;
       HYDRO_2D_NBLOCKS_CUDA;

-      hydro_2d1
-        <<<nblocks, nthreads_per_block, shmem, res.get_stream()>>>(zadat, zbdat,
-                       zpdat, zqdat, zrdat, zmdat,
-                       jn, kn);
+      RPlaunchCudaKernel( (hydro_2d1),
+                          nblocks, nthreads_per_block,
+                          shmem, res.get_stream(),
+                          zadat, zbdat,
+                          zpdat, zqdat,
+                          zrdat, zmdat,
+                          jn, kn);
       cudaErrchk( cudaGetLastError() );

-      hydro_2d2
-        <<<nblocks, nthreads_per_block, shmem, res.get_stream()>>>(zudat, zvdat,
-                       zadat, zbdat, zzdat, zrdat,
-                       s,
-                       jn, kn);
+      RPlaunchCudaKernel( (hydro_2d2),
+                          nblocks, nthreads_per_block,
+                          shmem, res.get_stream(),
+                          zudat, zvdat,
+                          zadat, zbdat,
+                          zzdat, zrdat,
+                          s,
+                          jn, kn);
       cudaErrchk( cudaGetLastError() );

-      hydro_2d3
-        <<<nblocks, nthreads_per_block, shmem, res.get_stream()>>>(zroutdat, zzoutdat,
-                       zrdat, zudat, zzdat, zvdat,
-                       t,
-                       jn, kn);
+      RPlaunchCudaKernel( (hydro_2d3),
+                          nblocks, nthreads_per_block,
+                          shmem, res.get_stream(),
+                          zroutdat, zzoutdat,
+                          zrdat, zudat,
+                          zzdat, zvdat,
+                          t,
+                          jn, kn);
       cudaErrchk( cudaGetLastError() );

     }
diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp
index 0d65cb260..e737ff278 100644
--- a/src/lcals/HYDRO_2D-Hip.cpp
+++ b/src/lcals/HYDRO_2D-Hip.cpp
@@ -111,25 +111,31 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid)
       HYDRO_2D_THREADS_PER_BLOCK_HIP;
       HYDRO_2D_NBLOCKS_HIP;

-      hipLaunchKernelGGL((hydro_2d1),
-                         dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(),
+      RPlaunchHipKernel( (hydro_2d1),
+                         nblocks, nthreads_per_block,
+                         shmem, res.get_stream(),
                          zadat, zbdat,
-                         zpdat, zqdat, zrdat, zmdat,
+                         zpdat, zqdat,
+                         zrdat, zmdat,
                          jn, kn);
       hipErrchk( hipGetLastError() );

-      hipLaunchKernelGGL((hydro_2d2),
-                         dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(),
+      RPlaunchHipKernel( (hydro_2d2),
+                         
nblocks, nthreads_per_block, + shmem, res.get_stream(), zudat, zvdat, - zadat, zbdat, zzdat, zrdat, + zadat, zbdat, + zzdat, zrdat, s, jn, kn); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((hydro_2d3), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), + RPlaunchHipKernel( (hydro_2d3), + nblocks, nthreads_per_block, + shmem, res.get_stream(), zroutdat, zzoutdat, - zrdat, zudat, zzdat, zvdat, + zrdat, zudat, + zzdat, zvdat, t, jn, kn); hipErrchk( hipGetLastError() ); diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index 02b22cbb8..ba05ea81f 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -55,11 +55,16 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - int_predict<<>>( px, - dm22, dm23, dm24, dm25, - dm26, dm27, dm28, c0, - offset, - iend ); + + RPlaunchCudaKernel( (int_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, + dm22, dm23, dm24, + dm25, dm26, dm27, + dm28, c0, + offset, + iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index cc0c06477..301c54f94 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -55,11 +55,16 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((int_predict), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), px, - dm22, dm23, dm24, dm25, - dm26, dm27, dm28, c0, - offset, - iend ); + + RPlaunchHipKernel( (int_predict), + grid_size, block_size, + shmem, res.get_stream(), + px, + dm22, dm23, dm24, + dm25, dm26, dm27, + dm28, c0, + offset, + iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index 76c5082cd..56011beb5 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -53,9 +53,13 @@ void PLANCKIAN::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - planckian<<>>( x, y, - u, v, w, - iend ); + + RPlaunchCudaKernel( (planckian), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + u, v, w, + iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index 7d93b2dca..c73468bc2 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -53,9 +53,13 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((planckian), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), x, y, - u, v, w, - iend ); + + RPlaunchHipKernel( (planckian), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + u, v, w, + iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 8b6643d2b..21831fb9e 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -23,8 +23,9 @@ namespace lcals template < size_t block_size > __launch_bounds__(block_size) -__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, - Index_type N) +__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin, + Real_ptr y, Real_ptr z, + Index_type N) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 
0 && i < N) { @@ -51,9 +52,13 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - eos - <<>>( xout, xin, y, z, - iend ); + + RPlaunchCudaKernel( (tridiag_elim), + grid_size, block_size, + shmem, res.get_stream(), + xout, xin, + y, z, + iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index f6c4c9ebe..fa07a9f34 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -23,8 +23,9 @@ namespace lcals template < size_t block_size > __launch_bounds__(block_size) -__global__ void eos(Real_ptr xout, Real_ptr xin, Real_ptr y, Real_ptr z, - Index_type N) +__global__ void tridiag_elim(Real_ptr xout, Real_ptr xin, + Real_ptr y, Real_ptr z, + Index_type N) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i > 0 && i < N) { @@ -51,8 +52,13 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((eos), grid_size, block_size, shmem, res.get_stream(), xout, xin, y, z, - iend ); + + RPlaunchHipKernel( (tridiag_elim), + grid_size, block_size, + shmem, res.get_stream(), + xout, xin, + y, z, + iend ); hipErrchk( hipGetLastError() ); } From 06a31be195b941afba6a9772ac21e7c07913bf66 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Dec 2023 16:02:35 -0800 Subject: [PATCH 204/454] Convert more kernels to new GPU launch method + code cleanup --- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 40 +++++---- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 34 +++++--- src/apps/EDGE3D-Cuda.cpp | 25 ++++-- src/apps/EDGE3D-Hip.cpp | 29 ++++--- src/apps/ENERGY-Cuda.cpp | 108 ++++++++++++++---------- src/apps/ENERGY-Hip.cpp | 108 ++++++++++++++---------- src/apps/FIR-Cuda.cpp | 28 +++--- src/apps/FIR-Hip.cpp | 28 +++--- src/apps/LTIMES-Cuda.cpp | 32 ++++--- src/apps/LTIMES-Hip.cpp | 35 ++++---- src/apps/LTIMES_NOVIEW-Cuda.cpp | 32 ++++--- src/apps/LTIMES_NOVIEW-Hip.cpp | 34 ++++---- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 11 ++- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 11 ++- src/apps/PRESSURE-Cuda.cpp | 31 ++++--- src/apps/PRESSURE-Hip.cpp | 31 ++++--- src/apps/VOL3D-Cuda.cpp | 15 ++-- src/apps/VOL3D-Hip.cpp | 15 ++-- src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp | 11 ++- src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp | 11 ++- 20 files changed, 398 insertions(+), 271 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 64094c4ab..41a9db07a 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -52,6 +52,7 @@ template < size_t block_size > void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; auto res{getCudaResource()}; @@ -64,16 +65,19 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - deldotvec2d<<>>(div, - x1, x2, x3, x4, - y1, y2, y3, y4, - fx1, fx2, fx3, fx4, - fy1, fy2, fy3, fy4, - real_zones, - half, ptiny, - iend); + + RPlaunchCudaKernel( (deldotvec2d), + grid_size, block_size, + shmem, res.get_stream(), + div, + x1, x2, x3, x4, + y1, y2, y3, y4, + fx1, fx2, fx3, fx4, + fy1, fy2, fy3, fy4, + real_zones, + half, ptiny, + iend ); cudaErrchk( cudaGetLastError() ); } 
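
The RPlaunchCudaKernel and RPlaunchHipKernel calls introduced throughout this
commit share one shape: kernel, grid and block dimensions, shared-memory size,
stream, then the kernel arguments. The wrappers themselves live in the suite's
common headers and are not shown in this series; the following is a rough
sketch of the assumed forwarding behavior only, not the suite's actual code.

    #include <utility>

    // Assumed shape: forward the launch configuration and arguments to a
    // plain triple-chevron launch on the given stream.
    template < typename Kernel, typename... Args >
    void RPlaunchCudaKernel(Kernel kernel,
                            const dim3& grid_size, const dim3& block_size,
                            size_t shmem, cudaStream_t stream,
                            Args&&... args)
    {
      kernel<<<grid_size, block_size, shmem, stream>>>(
          std::forward<Args>(args)...);
    }

Keeping one wrapper per back end makes the Cuda and Hip call sites line up
token-for-token, which is what allows these conversions to stay mechanical.
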
@@ -84,16 +88,20 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + auto deldotvec2d_lambda = [=] __device__ (Index_type ii) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - 0, iend, - [=] __device__ (Index_type ii) { - DEL_DOT_VEC_2D_BODY_INDEX; - DEL_DOT_VEC_2D_BODY; - }); + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + deldotvec2d_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 590ea31b2..0ded3b56e 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -52,6 +52,7 @@ template < size_t block_size > void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; auto res{getHipResource()}; @@ -64,16 +65,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - hipLaunchKernelGGL((deldotvec2d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), div, - x1, x2, x3, x4, - y1, y2, y3, y4, - fx1, fx2, fx3, fx4, - fy1, fy2, fy3, fy4, - real_zones, - half, ptiny, - iend); + + RPlaunchHipKernel( (deldotvec2d), + grid_size, block_size, + shmem, res.get_stream(), + div, + x1, x2, x3, x4, + y1, y2, y3, y4, + fx1, fx2, fx3, fx4, + fy1, fy2, fy3, fy4, + real_zones, + half, ptiny, + iend ); hipErrchk( hipGetLastError() ); } @@ -85,17 +89,19 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { auto deldotvec2d_lambda = [=] __device__ (Index_type ii) { - DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - 0, iend, deldotvec2d_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, + deldotvec2d_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index 9136dc961..ae7c9e394 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -66,11 +66,14 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - edge3d<<>>(sum, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - ibegin, iend); + RPlaunchCudaKernel( (edge3d), + grid_size, block_size, + shmem, res.get_stream(), + sum, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + ibegin, iend ); cudaErrchk( cudaGetLastError() ); } @@ -81,13 +84,17 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; }; + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - auto edge3d_lam = [=] 
__device__ (Index_type i) { EDGE3D_BODY; };
-
-      lambda_cuda_forall<<<grid_size, block_size, shmem, res.get_stream()>>>(
-          ibegin, iend, edge3d_lam);
+      RPlaunchCudaKernel( (lambda_cuda_forall),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend,
+                          edge3d_lambda );
       cudaErrchk( cudaGetLastError() );

     }
diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp
index 5da3606c4..b8dace845 100644
--- a/src/apps/EDGE3D-Hip.cpp
+++ b/src/apps/EDGE3D-Hip.cpp
@@ -65,12 +65,15 @@ void EDGE3D::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;
-
-      hipLaunchKernelGGL((edge3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), sum,
-                         x0, x1, x2, x3, x4, x5, x6, x7,
-                         y0, y1, y2, y3, y4, y5, y6, y7,
-                         z0, z1, z2, z3, z4, z5, z6, z7,
-                         ibegin, iend);
+
+      RPlaunchHipKernel( (edge3d),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         sum,
+                         x0, x1, x2, x3, x4, x5, x6, x7,
+                         y0, y1, y2, y3, y4, y5, y6, y7,
+                         z0, z1, z2, z3, z4, z5, z6, z7,
+                         ibegin, iend );
       hipErrchk( hipGetLastError() );

     }
@@ -81,15 +84,17 @@
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

+      auto edge3d_lambda = [=] __device__ (Index_type i) { EDGE3D_BODY; };
+
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-      auto edge3d_lam = [=] __device__ (Index_type i) { EDGE3D_BODY; };
-
-      hipLaunchKernelGGL((lambda_hip_forall),
-                         grid_size, block_size, shmem, res.get_stream(),
-                         ibegin, iend, edge3d_lam);
-
+      RPlaunchHipKernel( (lambda_hip_forall),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend,
+                         edge3d_lambda );
       hipErrchk( hipGetLastError() );

     }
diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp
index a62974a15..d937826ae 100644
--- a/src/apps/ENERGY-Cuda.cpp
+++ b/src/apps/ENERGY-Cuda.cpp
@@ -123,51 +123,69 @@ void ENERGY::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = 0;
-
-      energycalc1<<<grid_size, block_size, shmem, res.get_stream()>>>( e_new, e_old, delvc,
-                                           p_old, q_old, work,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
-
-      energycalc2<<<grid_size, block_size, shmem, res.get_stream()>>>( delvc, q_new,
-                                           compHalfStep, pHalfStep,
-                                           e_new, bvc, pbvc,
-                                           ql_old, qq_old,
-                                           rho0,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
-
-      energycalc3<<<grid_size, block_size, shmem, res.get_stream()>>>( e_new, delvc,
-                                           p_old, q_old,
-                                           pHalfStep, q_new,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
-
-      energycalc4<<<grid_size, block_size, shmem, res.get_stream()>>>( e_new, work,
-                                           e_cut, emin,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
-
-      energycalc5<<<grid_size, block_size, shmem, res.get_stream()>>>( delvc,
-                                           pbvc, e_new, vnewc,
-                                           bvc, p_new,
-                                           ql_old, qq_old,
-                                           p_old, q_old,
-                                           pHalfStep, q_new,
-                                           rho0, e_cut, emin,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
-
-      energycalc6<<<grid_size, block_size, shmem, res.get_stream()>>>( delvc,
-                                           pbvc, e_new, vnewc,
-                                           bvc, p_new,
-                                           q_new,
-                                           ql_old, qq_old,
-                                           rho0, q_cut,
-                                           iend );
-      cudaErrchk( cudaGetLastError() );
+      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      constexpr size_t shmem = 0;
+
+      RPlaunchCudaKernel( (energycalc1),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          e_new, e_old, delvc,
+                          p_old, q_old, work,
+                          iend );
+      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (energycalc2),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          delvc, q_new,
+                          compHalfStep, pHalfStep,
+                          e_new, bvc, pbvc,
+                          ql_old, qq_old,
+                          rho0,
+                          iend );
+      cudaErrchk( cudaGetLastError() );
+
+      RPlaunchCudaKernel( (energycalc3),
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          e_new, delvc,
+                          p_old, 
q_old, + pHalfStep, q_new, + iend ); + cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (energycalc6), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); + cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index c7064e591..2e88a1921 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -123,51 +123,69 @@ void ENERGY::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - hipLaunchKernelGGL((energycalc1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, e_old, delvc, - p_old, q_old, work, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc3), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc4), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), e_new, work, - e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc5), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((energycalc6), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - hipErrchk( hipGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (energycalc1), + grid_size, block_size, + shmem, res.get_stream(), + e_new, e_old, delvc, + p_old, q_old, work, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (energycalc2), + grid_size, block_size, + shmem, res.get_stream(), + delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (energycalc3), + grid_size, block_size, + shmem, res.get_stream(), + e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (energycalc4), + grid_size, block_size, + shmem, res.get_stream(), + e_new, work, + e_cut, emin, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (energycalc5), + grid_size, block_size, + shmem, res.get_stream(), + delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (energycalc6), + grid_size, 
block_size,
+                        shmem, res.get_stream(),
+                        delvc,
+                        pbvc, e_new, vnewc,
+                        bvc, p_new,
+                        q_new,
+                        ql_old, qq_old,
+                        rho0, q_cut,
+                        iend );
+      hipErrchk( hipGetLastError() );

     }
     stopTimer();
diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp
index 01266c7d6..2a1725522 100644
--- a/src/apps/FIR-Cuda.cpp
+++ b/src/apps/FIR-Cuda.cpp
@@ -98,20 +98,26 @@ void FIR::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = 0;
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+    constexpr size_t shmem = 0;

 #if defined(USE_CUDA_CONSTANT_MEMORY)
-      fir<<>>( out, in,
-                coefflen,
-                iend );
-      cudaErrchk( cudaGetLastError() );
+    RPlaunchCudaKernel( (fir),
+                        grid_size, block_size,
+                        shmem, res.get_stream(),
+                        out, in,
+                        coefflen,
+                        iend );
+    cudaErrchk( cudaGetLastError() );
 #else
-      fir<<>>( out, in,
-                coeff,
-                coefflen,
-                iend );
-      cudaErrchk( cudaGetLastError() );
+    RPlaunchCudaKernel( (fir),
+                        grid_size, block_size,
+                        shmem, res.get_stream(),
+                        out, in,
+                        coeff,
+                        coefflen,
+                        iend );
+    cudaErrchk( cudaGetLastError() );
 #endif

     }
diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp
index 47dc40efb..f075b3c9c 100644
--- a/src/apps/FIR-Hip.cpp
+++ b/src/apps/FIR-Hip.cpp
@@ -96,20 +96,26 @@ void FIR::runHipVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = 0;
+    const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+    constexpr size_t shmem = 0;

 #if defined(USE_HIP_CONSTANT_MEMORY)
-      hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in,
-                          coefflen,
-                          iend );
-      hipErrchk( hipGetLastError() );
+    RPlaunchHipKernel( (fir),
+                       grid_size, block_size,
+                       shmem, res.get_stream(),
+                       out, in,
+                       coefflen,
+                       iend );
+    hipErrchk( hipGetLastError() );
 #else
-      hipLaunchKernelGGL((fir), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), out, in,
-                          coeff,
-                          coefflen,
-                          iend );
-      hipErrchk( hipGetLastError() );
+    RPlaunchHipKernel( (fir),
+                       grid_size, block_size,
+                       shmem, res.get_stream(),
+                       out, in,
+                       coeff,
+                       coefflen,
+                       iend );
+    hipErrchk( hipGetLastError() );
 #endif

     }
diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp
index 8fe91fbf7..1f40e840b 100644
--- a/src/apps/LTIMES-Cuda.cpp
+++ b/src/apps/LTIMES-Cuda.cpp
@@ -91,10 +91,12 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
       LTIMES_NBLOCKS_CUDA;
       constexpr size_t shmem = 0;

-      ltimes
-        <<>>(phidat, elldat, psidat,
-               num_d,
-               num_m, num_g, num_z);
+      RPlaunchCudaKernel(
+        (ltimes),
+        nblocks, nthreads_per_block,
+        shmem, res.get_stream(),
+        phidat, elldat, psidat,
+        num_d, num_m, num_g, num_z );

       cudaErrchk( cudaGetLastError() );

     }
@@ -105,18 +107,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

+      auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g,
+                                           Index_type m) {
+        for (Index_type d = 0; d < num_d; ++d ) {
+          LTIMES_BODY;
+        }
+      };
+
       LTIMES_THREADS_PER_BLOCK_CUDA;
       LTIMES_NBLOCKS_CUDA;
       constexpr size_t shmem = 0;

-      ltimes_lam
-        <<>>(num_m, num_g, num_z,
-        [=] __device__ (Index_type z, Index_type g, Index_type m) {
-          for (Index_type d = 0; d < num_d; ++d ) {
-            LTIMES_BODY;
-          }
-        }
-      );
+      RPlaunchCudaKernel(
+        (ltimes_lam),
+        nblocks, nthreads_per_block,
+        shmem, res.get_stream(),
+        num_m, num_g, num_z,
+
ltimes_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 035bbc12d..07ce0c7bc 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -90,11 +90,12 @@ void LTIMES::runHipVariantImpl(VariantID vid) LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); + RPlaunchHipKernel( + (ltimes), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); hipErrchk( hipGetLastError() ); } @@ -105,20 +106,24 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto ltimes_lambda = [=] __device__ (Index_type z, Index_type g, + Index_type m) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + }; + LTIMES_THREADS_PER_BLOCK_HIP; LTIMES_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto ltimes_lambda = - [=] __device__ (Index_type z, Index_type g, Index_type m) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - }; - - hipLaunchKernelGGL((ltimes_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, ltimes_lambda); + RPlaunchHipKernel( + (ltimes_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index f12e5d131..9b7bd4ac4 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -90,10 +90,12 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_CUDA; constexpr size_t shmem = 0; - ltimes_noview - <<>>(phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); + RPlaunchCudaKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); cudaErrchk( cudaGetLastError() ); } @@ -104,18 +106,24 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; - LTIMES_NOVIEW_NBLOCKS_CUDA; - constexpr size_t shmem = 0; - - ltimes_noview_lam - <<>>(num_m, num_g, num_z, + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - } - ); + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA; + LTIMES_NOVIEW_NBLOCKS_CUDA; + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 7252a5402..4ecf9c344 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -90,11 +90,12 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) LTIMES_NOVIEW_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - phidat, elldat, psidat, - num_d, - num_m, num_g, num_z); + RPlaunchHipKernel( + (ltimes_noview), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + phidat, elldat, psidat, + num_d, num_m, num_g, num_z ); hipErrchk( hipGetLastError() ); } @@ -105,21 +106,24 @@ void 
LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; - LTIMES_NOVIEW_NBLOCKS_HIP; - constexpr size_t shmem = 0; - - auto ltimes_noview_lambda = + auto ltimes_noview_lambda = [=] __device__ (Index_type z, Index_type g, Index_type m) { for (Index_type d = 0; d < num_d; ++d ) { LTIMES_NOVIEW_BODY; } - }; + }; + + LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP; + LTIMES_NOVIEW_NBLOCKS_HIP; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((ltimes_noview_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - num_m, num_g, num_z, - ltimes_noview_lambda); + RPlaunchHipKernel( + (ltimes_noview_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + num_m, num_g, num_z, + ltimes_noview_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 0b5d3b078..670a5e03a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -61,10 +61,13 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - nodal_accumulation_3d<<>>(vol, - x0, x1, x2, x3, x4, x5, x6, x7, - real_zones, - ibegin, iend); + RPlaunchCudaKernel( (nodal_accumulation_3d), + grid_size, block_size, + shmem, res.get_stream(), + vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 476ab5da8..66297261f 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -61,10 +61,13 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((nodal_accumulation_3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, - x0, x1, x2, x3, x4, x5, x6, x7, - real_zones, - ibegin, iend); + RPlaunchHipKernel( (nodal_accumulation_3d), + grid_size, block_size, + shmem, res.get_stream(), + vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 16b395259..151b83890 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -64,19 +64,24 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - pressurecalc1<<>>( bvc, compression, - cls, - iend ); - cudaErrchk( cudaGetLastError() ); - - pressurecalc2<<>>( p_new, bvc, e_old, - vnewc, - p_cut, eosvmax, pmin, - iend ); - cudaErrchk( cudaGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (pressurecalc1), + grid_size, block_size, + shmem, res.get_stream(), + bvc, compression, cls, + iend ); + cudaErrchk( cudaGetLastError() ); + + RPlaunchCudaKernel( (pressurecalc2), + grid_size, block_size, + shmem, res.get_stream(), + p_new, bvc, e_old, + vnewc, + p_cut, eosvmax, pmin, + iend ); + cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index 1d590e6f0..bf69c3815 100644 --- 
a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -64,19 +64,24 @@ void PRESSURE::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - hipLaunchKernelGGL((pressurecalc1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), bvc, compression, - cls, - iend ); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((pressurecalc2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), p_new, bvc, e_old, - vnewc, - p_cut, eosvmax, pmin, - iend ); - hipErrchk( hipGetLastError() ); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (pressurecalc1), + grid_size, block_size, + shmem, res.get_stream(), + bvc, compression, cls, + iend ); + hipErrchk( hipGetLastError() ); + + RPlaunchHipKernel( (pressurecalc2), + grid_size, block_size, + shmem, res.get_stream(), + p_new, bvc, e_old, + vnewc, + p_cut, eosvmax, pmin, + iend ); + hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 18bb89c4c..2e34858a7 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -68,12 +68,15 @@ void VOL3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - vol3d<<>>(vol, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - vnormq, - ibegin, iend); + RPlaunchCudaKernel( (vol3d), + grid_size, block_size, + shmem, res.get_stream(), + vol, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + vnormq, + ibegin, iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 9a0a2323b..207718691 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -68,12 +68,15 @@ void VOL3D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((vol3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - vnormq, - ibegin, iend); + RPlaunchHipKernel( (vol3d), + grid_size, block_size, + shmem, res.get_stream(), + vol, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + vnormq, + ibegin, iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp index 1dc6216d7..cc7180d80 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp @@ -61,10 +61,13 @@ void ZONAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - zonal_accumulation_3d<<>>(vol, - x0, x1, x2, x3, x4, x5, x6, x7, - real_zones, - ibegin, iend); + RPlaunchCudaKernel( (zonal_accumulation_3d), + grid_size, block_size, + shmem, res.get_stream(), + vol, + x0, x1, x2, x3, x4, x5, x6, x7, + real_zones, + ibegin, iend ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp index d861128b9..acf838d64 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp +++ 
b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp
@@ -61,10 +61,13 @@ void ZONAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-      hipLaunchKernelGGL((zonal_accumulation_3d), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), vol,
-                          x0, x1, x2, x3, x4, x5, x6, x7,
-                          real_zones,
-                          ibegin, iend);
+      RPlaunchHipKernel( (zonal_accumulation_3d),
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         vol,
+                         x0, x1, x2, x3, x4, x5, x6, x7,
+                         real_zones,
+                         ibegin, iend );
       hipErrchk( hipGetLastError() );

     }

From ec57a5f2f19b9018b5233b8bcbfed51b71661e81 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 15 Dec 2023 10:50:51 -0800
Subject: [PATCH 205/454] Convert more kernels and fix compilation errors
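
The RPlaunch* helpers give every launch a common calling convention:
kernel, launch configuration (blocks, threads-per-block, shared memory,
stream), then the kernel arguments. The helpers live in the suite's
common headers and are not shown in this patch; as a rough sketch of
the idea only (an assumption, not the actual implementation), the CUDA
flavor could be a variadic wrapper over cudaLaunchKernel:

    // Sketch only: pack arguments for cudaLaunchKernel. Assumes the
    // kernel is a plain __global__ function pointer with at least one
    // parameter; cudaErrchk is the suite's usual error-check helper.
    template < typename... Args >
    void RPlaunchCudaKernel(void (*kernel)(Args...),
                            const dim3& numBlocks, const dim3& dimBlocks,
                            size_t sharedMemBytes, cudaStream_t stream,
                            Args const&... args)
    {
      // cudaLaunchKernel takes the kernel arguments as an array of
      // pointers to the argument values.
      void* arg_arr[]{ (void*)&args... };
      cudaErrchk( cudaLaunchKernel(reinterpret_cast<const void*>(kernel),
                                   numBlocks, dimBlocks,
                                   arg_arr, sharedMemBytes, stream) );
    }

Wrapping the kernel name in parentheses at call sites keeps template
argument lists, which contain commas, intact if the helper is ever
routed through a macro; for a function template the parentheses are
harmless.
---
 src/apps/EDGE3D-Hip.cpp              |  4 ++--
 src/apps/HALOEXCHANGE-Cuda.cpp       | 10 ++++++--
 src/apps/HALOEXCHANGE-Hip.cpp        | 12 ++++++----
 src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 36 ++++++++++++++++++++--------
 src/apps/HALOEXCHANGE_FUSED-Hip.cpp  | 36 ++++++++++++++++++++--------
 src/lcals/FIRST_SUM-Hip.cpp          | 10 ++++----
 src/lcals/GEN_LIN_RECUR-Hip.cpp      |  2 +-
 7 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp
index b8dace845..0baf48312 100644
--- a/src/apps/EDGE3D-Hip.cpp
+++ b/src/apps/EDGE3D-Hip.cpp
@@ -89,8 +89,8 @@ void EDGE3D::runHipVariantImpl(VariantID vid)
       const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
       constexpr size_t shmem = 0;

-      RPlaunchHipKernel( (lambda_cuda_forall),
+      RPlaunchHipKernel( (lambda_hip_forall),
                          grid_size, block_size,
                          shmem, res.get_stream(),
                          ibegin, iend,
diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp
index 3a8ae049b..b106fce82 100644
--- a/src/apps/HALOEXCHANGE-Cuda.cpp
+++ b/src/apps/HALOEXCHANGE-Cuda.cpp
@@ -69,7 +69,10 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          haloexchange_pack<<>>(buffer, list, var, len);
+          RPlaunchCudaKernel( (haloexchange_pack),
+                              nblocks, nthreads_per_block,
+                              shmem, res.get_stream(),
+                              buffer, list, var, len );
           cudaErrchk( cudaGetLastError() );
           buffer += len;
         }
@@ -85,7 +88,10 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          haloexchange_unpack<<>>(buffer, list, var, len);
+          RPlaunchCudaKernel( (haloexchange_unpack),
+                              nblocks, nthreads_per_block,
+                              shmem, res.get_stream(),
+                              buffer, list, var, len );
           cudaErrchk( cudaGetLastError() );
           buffer += len;
         }
diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp
index 9831b6a69..f13c48bfb 100644
--- a/src/apps/HALOEXCHANGE-Hip.cpp
+++ b/src/apps/HALOEXCHANGE-Hip.cpp
@@ -69,8 +69,10 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          hipLaunchKernelGGL((haloexchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(),
-                             buffer, list, var, len);
+          RPlaunchHipKernel( (haloexchange_pack),
+                             nblocks, nthreads_per_block,
+                             shmem, res.get_stream(),
+                             buffer, list, var, len );
           hipErrchk( hipGetLastError() );
           buffer += len;
         }
@@ -86,7 +88,10 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-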
hipLaunchKernelGGL((haloexchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), - buffer, list, var, len); + RPlaunchHipKernel( (haloexchange_unpack), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + buffer, list, var, len ); hipErrchk( hipGetLastError() ); buffer += len; } diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index e23abf424..aef5adfa1 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -51,8 +51,10 @@ namespace apps template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,8 +72,10 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -127,8 +131,13 @@ void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - haloexchange_fused_pack<<>>( - pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); + RPlaunchCudaKernel( (haloexchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); @@ -150,11 +159,18 @@ void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) buffer += len; } } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - haloexchange_fused_unpack<<>>( - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchCudaKernel( (haloexchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 38ac329b4..3fd78f925 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -51,8 +51,10 @@ namespace apps template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, - Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) +__global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, + Int_ptr* pack_list_ptrs, + Real_ptr* pack_var_ptrs, + 
Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,8 +72,10 @@ __global__ void haloexchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pac template < size_t block_size > __launch_bounds__(block_size) -__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, - Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) +__global__ void haloexchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, + Int_ptr* unpack_list_ptrs, + Real_ptr* unpack_var_ptrs, + Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -127,8 +131,13 @@ void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - hipLaunchKernelGGL((haloexchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), - pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs); + RPlaunchHipKernel( (haloexchange_fused_pack), + pack_nblocks, pack_nthreads_per_block, + shmem, res.get_stream(), + pack_buffer_ptrs, + pack_list_ptrs, + pack_var_ptrs, + pack_len_ptrs ); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); @@ -150,11 +159,18 @@ void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) buffer += len; } } - Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; + Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / + unpack_index; dim3 unpack_nthreads_per_block(block_size); - dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - hipLaunchKernelGGL((haloexchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), - unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs); + dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, + unpack_index); + RPlaunchHipKernel( (haloexchange_fused_unpack), + unpack_nblocks, unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs ); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp index af7e35534..7e35bfefd 100644 --- a/src/lcals/FIRST_SUM-Hip.cpp +++ b/src/lcals/FIRST_SUM-Hip.cpp @@ -52,11 +52,11 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (first_sum), - grid_size, block_size, - shmem, res.get_stream(), - x, y, - iend ); + RPlaunchHipKernel( (first_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + iend ); hipErrchk( hipGetLastError() ); } diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index b583c795b..35cb3fd2a 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -73,7 +73,7 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) sa, sb, kb5i, N ); - cudaErrchk( hipGetLastError() ); + hipErrchk( hipGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); From 6ce7fefe4cd1cdcef25275a6dd383c0478e1ccec Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Dec 2023 11:11:17 -0800 Subject: [PATCH 206/454] Convert remaining apps kernels to new GPU launch methods + code cleanup --- src/apps/CONVECTION3DPA-Cuda.cpp | 9 +++++---- src/apps/CONVECTION3DPA-Hip.cpp 
| 13 ++++++------- src/apps/DIFFUSION3DPA-Cuda.cpp | 9 +++++---- src/apps/DIFFUSION3DPA-Hip.cpp | 11 +++++------ src/apps/MASS3DEA-Cuda.cpp | 10 ++++++---- src/apps/MASS3DEA-Hip.cpp | 12 ++++++------ src/apps/MASS3DPA-Cuda.cpp | 10 ++++++---- src/apps/MASS3DPA-Hip.cpp | 12 ++++++------ 8 files changed, 45 insertions(+), 41 deletions(-) diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index 5b5e5f1f4..83a0317de 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -138,15 +138,16 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); constexpr size_t shmem = 0; - Convection3DPA<<>> - (Basis, tBasis, dBasis, D, X, Y); + RPlaunchCudaKernel( (Convection3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, tBasis, dBasis, D, X, Y ); cudaErrchk(cudaGetLastError()); } stopTimer(); diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp index ed0eef3e4..bf783bfdd 100644 --- a/src/apps/CONVECTION3DPA-Hip.cpp +++ b/src/apps/CONVECTION3DPA-Hip.cpp @@ -138,17 +138,16 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(CPA_Q1D, CPA_Q1D, CPA_Q1D); constexpr size_t shmem = 0; - hipLaunchKernelGGL((Convection3DPA), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - Basis, tBasis, dBasis, D, X, Y); - + + RPlaunchHipKernel( (Convection3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, tBasis, dBasis, D, X, Y ); hipErrchk(hipGetLastError()); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 863f83854..7270dfb97 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -117,15 +117,16 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - Diffusion3DPA<<>>( - Basis, dBasis, D, X, Y, symmetric); + RPlaunchCudaKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); cudaErrchk(cudaGetLastError()); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 7f0dd77b1..249fd9721 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -117,17 +117,16 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + dim3 nthreads_per_block(DPA_Q1D, DPA_Q1D, DPA_Q1D); constexpr size_t shmem = 0; - hipLaunchKernelGGL((Diffusion3DPA), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - Basis, dBasis, D, X, Y, symmetric); + RPlaunchHipKernel( (Diffusion3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + Basis, dBasis, D, X, Y, symmetric ); hipErrchk(hipGetLastError()); } stopTimer(); diff --git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index 87d918b11..c1eb62e84 
100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -69,14 +69,16 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DEA<<>>(B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 7184694e3..831a0434d 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -69,16 +69,16 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DEA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, D, M); + dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (Mass3DEA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, D, M ); hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index a8f769d6a..0b7d5ec56 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -99,14 +99,16 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { case Base_CUDA: { - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Mass3DPA<<>>(B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; + RPlaunchCudaKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index c9d600136..9dd759423 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -99,16 +99,16 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { case Base_HIP: { - dim3 nblocks(NE); - dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); - constexpr size_t shmem = 0; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipLaunchKernelGGL((Mass3DPA), dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - B, Bt, D, X, Y); + dim3 nthreads_per_block(MPA_Q1D, MPA_Q1D, 1); + constexpr size_t shmem = 0; + RPlaunchHipKernel( (Mass3DPA), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, Y ); hipErrchk( hipGetLastError() ); } From 85e717889ced58f296408b2fcbc381b8ef939718 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Dec 2023 11:57:10 -0800 Subject: [PATCH 207/454] Convert first batch of polybench kernels to new GPU launch methods --- src/polybench/POLYBENCH_2MM-Cuda.cpp | 68 +++++++++++------- src/polybench/POLYBENCH_2MM-Hip.cpp | 48 ++++++++----- src/polybench/POLYBENCH_3MM-Cuda.cpp | 101 +++++++++++++++++---------- src/polybench/POLYBENCH_3MM-Hip.cpp | 69 +++++++++++------- 4 files changed, 185 insertions(+), 101 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 7a8f43e58..99fa4b1ff 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -125,15 +125,25 @@ void 
POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1 - <<>>(tmp, A, B, alpha, - ni, nj, nk); + + RPlaunchCudaKernel( + (poly_2mm_1), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + tmp, A, B, + alpha, + ni, nj, nk ); cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2 - <<>>(tmp, C, D, beta, - ni, nl, nj); + + RPlaunchCudaKernel( + (poly_2mm_2), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + tmp, C, D, + beta, + ni, nl, nj ); cudaErrchk( cudaGetLastError() ); } @@ -148,29 +158,39 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; POLY_2MM_1_NBLOCKS_CUDA; - poly_2mm_1_lam - <<>>(ni, nj, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_2MM_BODY1; - for (Index_type k=0; k < nk; ++k) { - POLYBENCH_2MM_BODY2; - } - POLYBENCH_2MM_BODY3; + + auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_2MM_BODY1; + for (Index_type k=0; k < nk; ++k) { + POLYBENCH_2MM_BODY2; } - ); + POLYBENCH_2MM_BODY3; + }; + + RPlaunchCudaKernel( + (poly_2mm_1_lam), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_2mm_1_lambda ); cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; - poly_2mm_2_lam - <<>>(ni, nl, - [=] __device__ (Index_type i, Index_type l) { - POLYBENCH_2MM_BODY4; - for (Index_type j=0; j < nj; ++j) { - POLYBENCH_2MM_BODY5; - } - POLYBENCH_2MM_BODY6; + + auto poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) { + POLYBENCH_2MM_BODY4; + for (Index_type j=0; j < nj; ++j) { + POLYBENCH_2MM_BODY5; } - ); + POLYBENCH_2MM_BODY6; + }; + + RPlaunchCudaKernel( + (poly_2mm_2_lam), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + ni, nl, poly_2mm_2_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 1a0f26ecd..5cae041a7 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -125,17 +125,25 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1), - dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(), - tmp, A, B, alpha, - ni, nj, nk); + + RPlaunchHipKernel( + (poly_2mm_1), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + tmp, A, B, + alpha, + ni, nj, nk ); hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2), - dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(), - tmp, C, D, beta, - ni, nl, nj); + + RPlaunchHipKernel( + (poly_2mm_2), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + tmp, C, D, + beta, + ni, nl, nj ); hipErrchk( hipGetLastError() ); } @@ -149,6 +157,8 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) POLY_2MM_THREADS_PER_BLOCK_HIP; constexpr size_t shmem = 0; + POLY_2MM_1_NBLOCKS_HIP; + auto poly_2mm_1_lambda = [=] __device__ (Index_type i, Index_type j) { POLYBENCH_2MM_BODY1; for (Index_type k=0; k < nk; ++k) { @@ -157,12 +167,16 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) POLYBENCH_2MM_BODY3; }; - POLY_2MM_1_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_1_lam), - dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, poly_2mm_1_lambda); + RPlaunchHipKernel( + (poly_2mm_1_lam), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_2mm_1_lambda ); hipErrchk( hipGetLastError() ); + POLY_2MM_2_NBLOCKS_HIP; + auto 
poly_2mm_2_lambda = [=] __device__ (Index_type i, Index_type l) { POLYBENCH_2MM_BODY4; for (Index_type j=0; j < nj; ++j) { @@ -171,10 +185,12 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) POLYBENCH_2MM_BODY6; }; - POLY_2MM_2_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_2mm_2_lam), - dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nl, poly_2mm_2_lambda); + RPlaunchHipKernel( + (poly_2mm_2_lam), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + ni, nl, poly_2mm_2_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 9131a629a..05f81f7ca 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -159,21 +159,33 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1 - <<>>(E, A, B, - ni, nj, nk); + + RPlaunchCudaKernel( + (poly_3mm_1), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + E, A, B, + ni, nj, nk ); cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2 - <<>>(F, C, D, - nj, nl, nm); + + RPlaunchCudaKernel( + (poly_3mm_2), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + F, C, D, + nj, nl, nm ); cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3 - <<>>(G, E, F, - ni, nl, nj); + + RPlaunchCudaKernel( + (poly_3mm_3), + nblocks3, nthreads_per_block, + shmem, res.get_stream(), + G, E, F, + ni, nl, nj ); cudaErrchk( cudaGetLastError() ); } @@ -188,42 +200,57 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; POLY_3MM_1_NBLOCKS_CUDA; - poly_3mm_1_lam - <<>>(ni, nj, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_3MM_BODY1; - for (Index_type k=0; k < nk; ++k) { - POLYBENCH_3MM_BODY2; - } - POLYBENCH_3MM_BODY3; + + auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_3MM_BODY1; + for (Index_type k=0; k < nk; ++k) { + POLYBENCH_3MM_BODY2; } - ); + POLYBENCH_3MM_BODY3; + }; + + RPlaunchCudaKernel( + (poly_3mm_1_lam), + nblocks1, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_3mm_1_lambda ); cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; - poly_3mm_2_lam - <<>>(nj, nl, - [=] __device__ (Index_type j, Index_type l) { - POLYBENCH_3MM_BODY4; - for (Index_type m=0; m < nm; ++m) { - POLYBENCH_3MM_BODY5; - } - POLYBENCH_3MM_BODY6; - } - ); + + auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) { + POLYBENCH_3MM_BODY4; + for (Index_type m=0; m < nm; ++m) { + POLYBENCH_3MM_BODY5; + } + POLYBENCH_3MM_BODY6; + }; + + RPlaunchCudaKernel( + (poly_3mm_2_lam), + nblocks2, nthreads_per_block, + shmem, res.get_stream(), + nj, nl, poly_3mm_2_lambda ); cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; - poly_3mm_3_lam - <<>>(ni, nl, - [=] __device__ (Index_type i, Index_type l) { - POLYBENCH_3MM_BODY7; - for (Index_type j=0; j < nj; ++j) { - POLYBENCH_3MM_BODY8; - } - POLYBENCH_3MM_BODY9; + + auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) { + POLYBENCH_3MM_BODY7; + for (Index_type j=0; j < nj; ++j) { + POLYBENCH_3MM_BODY8; } - ); + POLYBENCH_3MM_BODY9; + }; + + RPlaunchCudaKernel( + (poly_3mm_3_lam), + nblocks3, nthreads_per_block, + shmem, res.get_stream(), + ni, nl, poly_3mm_3_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 2cdf3df13..295ad3293 100644 --- 
a/src/polybench/POLYBENCH_3MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_3MM-Hip.cpp
@@ -158,24 +158,33 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;

       POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1),
-                         dim3(nblocks1) , dim3(nthreads_per_block), shmem, res.get_stream(),
-                         E, A, B,
-                         ni, nj, nk);
+
+      RPlaunchHipKernel(
+        (poly_3mm_1),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        E, A, B,
+        ni, nj, nk );
       hipErrchk( hipGetLastError() );

       POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         F, C, D,
-                         nj, nl, nm);
+
+      RPlaunchHipKernel(
+        (poly_3mm_2),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        F, C, D,
+        nj, nl, nm );
       hipErrchk( hipGetLastError() );

       POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         G, E, F,
-                         ni, nl, nj);
+
+      RPlaunchHipKernel(
+        (poly_3mm_3),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        G, E, F,
+        ni, nl, nj );
       hipErrchk( hipGetLastError() );

     }
@@ -189,6 +198,8 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
       POLY_3MM_THREADS_PER_BLOCK_HIP;
       constexpr size_t shmem = 0;

+      POLY_3MM_1_NBLOCKS_HIP;
+
       auto poly_3mm_1_lambda = [=] __device__ (Index_type i, Index_type j) {
         POLYBENCH_3MM_BODY1;
         for (Index_type k=0; k < nk; ++k) {
@@ -197,12 +208,16 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY3;
       };

-      POLY_3MM_1_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_1_lam),
-                         dim3(nblocks1), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nj, poly_3mm_1_lambda);
+      RPlaunchHipKernel(
+        (poly_3mm_1_lam),
+        nblocks1, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nj, poly_3mm_1_lambda );
       hipErrchk( hipGetLastError() );

+      POLY_3MM_2_NBLOCKS_HIP;
+
       auto poly_3mm_2_lambda = [=] __device__ (Index_type j, Index_type l) {
         POLYBENCH_3MM_BODY4;
         for (Index_type m=0; m < nm; ++m) {
@@ -211,12 +226,16 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY6;
       };

-      POLY_3MM_2_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_2_lam),
-                         dim3(nblocks2), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         nj, nl, poly_3mm_2_lambda);
+      RPlaunchHipKernel(
+        (poly_3mm_2_lam),
+        nblocks2, nthreads_per_block,
+        shmem, res.get_stream(),
+        nj, nl, poly_3mm_2_lambda );
       hipErrchk( hipGetLastError() );

+      POLY_3MM_3_NBLOCKS_HIP;
+
       auto poly_3mm_3_lambda = [=] __device__ (Index_type i, Index_type l) {
         POLYBENCH_3MM_BODY7;
         for (Index_type j=0; j < nj; ++j) {
@@ -225,10 +244,12 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid)
         POLYBENCH_3MM_BODY9;
       };

-      POLY_3MM_3_NBLOCKS_HIP;
-      hipLaunchKernelGGL((poly_3mm_3_lam),
-                         dim3(nblocks3), dim3(nthreads_per_block), shmem, res.get_stream(),
-                         ni, nl, poly_3mm_3_lambda);
+      RPlaunchHipKernel(
+        (poly_3mm_3_lam),
+        nblocks3, nthreads_per_block,
+        shmem, res.get_stream(),
+        ni, nl, poly_3mm_3_lambda );
       hipErrchk( hipGetLastError() );

     }

From 13d7674d16bd8ceada09f6dbfd0d160d1c4a1ad0 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 15 Dec 2023 15:26:45 -0800
Subject: [PATCH 208/454] Convert more GPU kernels, code cleanup, fix compilation errors
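
Lambda variants go through the same launch helpers as the base
variants. One wrinkle the compilation fixes touch on: the generic
lambda kernels (e.g. poly_2mm_1_lam) are templated on both the block
size and the lambda type, so a call site has to name the lambda type
explicitly, typically via decltype. A sketch of the pattern follows;
the kernel name lambda_forall and the LOOP_BODY macro are illustrative
placeholders, not the exact identifiers in src/common, and grid_size,
shmem, and res are assumed to be in scope as in the surrounding diffs:

    // Sketch only: a generic "apply lambda to each index" kernel.
    template < size_t block_size, typename Lambda >
    __launch_bounds__(block_size)
    __global__ void lambda_forall(Index_type ibegin, Index_type iend,
                                  Lambda body)
    {
      Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x;
      if (i < iend) {
        body(i);
      }
    }

    // At the call site, decltype names the lambda type so the fully
    // instantiated kernel can be passed as a function pointer:
    auto loop_body = [=] __device__ (Index_type i) { LOOP_BODY; };

    RPlaunchHipKernel(
      (lambda_forall<block_size, decltype(loop_body)>),
      grid_size, block_size,
      shmem, res.get_stream(),
      ibegin, iend, loop_body );
---
 src/polybench/POLYBENCH_2MM-Hip.cpp      |  2 +-
 src/polybench/POLYBENCH_ADI-Cuda.cpp     | 78 +++++++++------
 src/polybench/POLYBENCH_ADI-Hip.cpp      | 46 +++++----
 src/polybench/POLYBENCH_ATAX-Cuda.cpp    | 52 ++++++----
 src/polybench/POLYBENCH_ATAX-Hip.cpp     | 36 ++++---
 src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 94 +++++++++++++------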
src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 94 +++++++++++-------- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 26 +++-- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 29 +++--- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 35 ++++--- src/polybench/POLYBENCH_GEMM-Hip.cpp | 20 ++-- 11 files changed, 319 insertions(+), 193 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 5cae041a7..08b906cf3 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -168,7 +168,7 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) }; RPlaunchHipKernel( - (poly_2mm_1_lam), nblocks1, nthreads_per_block, shmem, res.get_stream(), diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index fc3348fff..cc175291f 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,14 +92,22 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi1<<>>(n, - a, b, c, d, f, - P, Q, U, V); + RPlaunchCudaKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, b, c, + d, f, + P, Q, U, V ); cudaErrchk( cudaGetLastError() ); - adi2<<>>(n, - a, c, d, e, f, - P, Q, U, V); + RPlaunchCudaKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), + n, + a, c, d, + e, f, + P, Q, U, V ); cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -117,32 +125,40 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY2; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY3; - } - POLYBENCH_ADI_BODY4; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY5; - } + auto poly_adi1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; } - ); + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; + } + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); cudaErrchk( cudaGetLastError() ); - adi_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_ADI_BODY6; - for (Index_type j = 1; j < n-1; ++j) { - POLYBENCH_ADI_BODY7; - } - POLYBENCH_ADI_BODY8; - for (Index_type k = n-2; k >= 1; --k) { - POLYBENCH_ADI_BODY9; - } + auto poly_adi2_lambda = [=] 
__device__ (Index_type i) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; } - ); + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + }; + + RPlaunchCudaKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); cudaErrchk( cudaGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index f5791ce88..39e10dc62 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -23,7 +23,7 @@ namespace polybench template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi1(const Index_type n, +__global__ void poly_adi1(const Index_type n, const Real_type a, const Real_type b, const Real_type c, const Real_type d, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -43,7 +43,7 @@ __global__ void adi1(const Index_type n, template < size_t block_size > __launch_bounds__(block_size) -__global__ void adi2(const Index_type n, +__global__ void poly_adi2(const Index_type n, const Real_type a, const Real_type c, const Real_type d, const Real_type e, const Real_type f, Real_ptr P, Real_ptr Q, Real_ptr U, Real_ptr V) @@ -63,7 +63,7 @@ __global__ void adi2(const Index_type n, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void adi_lam(const Index_type n, +__global__ void poly_adi_lam(const Index_type n, Lambda body) { Index_type i = 1 + blockIdx.x * block_size + threadIdx.x; @@ -92,18 +92,22 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((adi1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi1), + grid_size, block_size, + shmem, res.get_stream(), n, - a, b, c, d, f, - P, Q, U, V); + a, b, c, + d, f, + P, Q, U, V ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((adi2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + RPlaunchHipKernel( (poly_adi2), + grid_size, block_size, + shmem, res.get_stream(), n, - a, c, d, e, f, - P, Q, U, V); + a, c, d, + e, f, + P, Q, U, V ); hipErrchk( hipGetLastError() ); } // tstep loop @@ -121,7 +125,7 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(n-2, block_size); constexpr size_t shmem = 0; - auto adi1_lamda = [=] __device__ (Index_type i) { + auto poly_adi1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY2; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY3; @@ -132,12 +136,14 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi1_lamda); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi1_lambda ); hipErrchk( hipGetLastError() ); - auto adi2_lamda = [=] __device__ (Index_type i) { + auto poly_adi2_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; for (Index_type j = 1; j < n-1; ++j) { POLYBENCH_ADI_BODY7; @@ -148,9 +154,11 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) } }; - hipLaunchKernelGGL((adi_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, adi2_lamda); + RPlaunchHipKernel( (poly_adi_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_adi2_lambda ); hipErrchk( hipGetLastError() ); } // 
tstep loop diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index a787276ec..6309b1590 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -83,10 +83,18 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_1<<>>(A, x, y, tmp, N); + RPlaunchCudaKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); cudaErrchk( cudaGetLastError() ); - poly_atax_2<<>>(A, tmp, y, N); + RPlaunchCudaKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); cudaErrchk( cudaGetLastError() ); } @@ -100,26 +108,34 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_atax_lam<<>>(N, - [=] __device__ (Index_type i) { - POLYBENCH_ATAX_BODY1; - for (Index_type j = 0; j < N; ++j ) { - POLYBENCH_ATAX_BODY2; - } - POLYBENCH_ATAX_BODY3; + auto poly_atax1_lambda = [=] __device__ (Index_type i) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; } - ); + POLYBENCH_ATAX_BODY3; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); cudaErrchk( cudaGetLastError() ); - poly_atax_lam<<>>(N, - [=] __device__ (Index_type j) { - POLYBENCH_ATAX_BODY4; - for (Index_type i = 0; i < N; ++i ) { - POLYBENCH_ATAX_BODY5; - } - POLYBENCH_ATAX_BODY6; + auto poly_atax2_lambda = [=] __device__ (Index_type j) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; } - ); + POLYBENCH_ATAX_BODY6; + }; + + RPlaunchCudaKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index 3ac954ab8..f28cb4dec 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -83,14 +83,18 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_atax_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, tmp, N); + RPlaunchHipKernel( (poly_atax_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, tmp, + N ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_atax_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, tmp, y, N); + RPlaunchHipKernel( (poly_atax_2), + grid_size, block_size, + shmem, res.get_stream(), + A, tmp, y, + N ); hipErrchk( hipGetLastError() ); } @@ -104,7 +108,7 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - auto poly_atax_1_lambda = [=] __device__ (Index_type i) { + auto poly_atax1_lambda = [=] __device__ (Index_type i) { POLYBENCH_ATAX_BODY1; for (Index_type j = 0; j < N; ++j ) { POLYBENCH_ATAX_BODY2; @@ -112,12 +116,14 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY3; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_1_lambda); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax1_lambda ); 
hipErrchk( hipGetLastError() ); - auto poly_atax_2_lambda = [=] __device__ (Index_type j) { + auto poly_atax2_lambda = [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; for (Index_type i = 0; i < N; ++i ) { POLYBENCH_ATAX_BODY5; @@ -125,9 +131,11 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) POLYBENCH_ATAX_BODY6; }; - hipLaunchKernelGGL((poly_atax_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - N, poly_atax_2_lambda); + RPlaunchHipKernel( (poly_atax_lam), + grid_size, block_size, + shmem, res.get_stream(), + N, poly_atax2_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index 415e2fc94..f7237cc08 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -160,22 +160,35 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1<<>>(ey, fict, ny, t); + + RPlaunchCudaKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2 - <<>>(ey, hz, nx, ny); + RPlaunchCudaKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3 - <<>>(ex, hz, nx, ny); + RPlaunchCudaKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4 - <<>>(hz, ex, ey, nx, ny); + RPlaunchCudaKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -193,37 +206,58 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - poly_fdtd2d_1_lam<<>>(ny, - [=] __device__ (Index_type j) { - POLYBENCH_FDTD_2D_BODY1; - } - ); + + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { + POLYBENCH_FDTD_2D_BODY1; + }; + + RPlaunchCudaKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); + cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; - poly_fdtd2d_2_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - } - ); + auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_3_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - } - ); + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); cudaErrchk( cudaGetLastError() ); - poly_fdtd2d_4_lam - <<>>(nx, ny, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - } - ); + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; + + RPlaunchCudaKernel( + (poly_fdtd2d_4_lam), + 
nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); cudaErrchk( cudaGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index ad8bd66d1..b6a2e21d8 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -159,27 +159,35 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ey, fict, ny, t); + + RPlaunchHipKernel( (poly_fdtd2d_1), + grid_size1, block_size, + shmem, res.get_stream(), + ey, fict, ny, t ); hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - hipLaunchKernelGGL((poly_fdtd2d_2), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ey, hz, nx, ny); + RPlaunchHipKernel( + (poly_fdtd2d_2), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ey, hz, nx, ny ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_3), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - ex, hz, nx, ny); + RPlaunchHipKernel( + (poly_fdtd2d_3), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + ex, hz, nx, ny ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_fdtd2d_4), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - hz, ex, ey, nx, ny); + RPlaunchHipKernel( + (poly_fdtd2d_4), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + hz, ex, ey, nx, ny ); hipErrchk( hipGetLastError() ); } // tstep loop @@ -196,47 +204,59 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; + const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); + auto poly_fdtd2d_1_lambda = [=] __device__ (Index_type j) { POLYBENCH_FDTD_2D_BODY1; }; - const size_t grid_size1 = RAJA_DIVIDE_CEILING_INT(ny, block_size); - hipLaunchKernelGGL((poly_fdtd2d_1_lam), - dim3(grid_size1), dim3(block_size), shmem, res.get_stream(), - ny, poly_fdtd2d_1_lambda); + RPlaunchHipKernel( (poly_fdtd2d_1_lam), + grid_size1, block_size, + shmem, res.get_stream(), + ny, poly_fdtd2d_1_lambda ); hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; - auto poly_fdtd2d_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY2; - }; + auto poly_fdtd2d_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY2; + }; - hipLaunchKernelGGL((poly_fdtd2d_2_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_2_lambda); + RPlaunchHipKernel( + (poly_fdtd2d_2_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_2_lambda ); hipErrchk( hipGetLastError() ); - auto poly_fdtd2d_3_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY3; - }; + auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY3; + }; - hipLaunchKernelGGL((poly_fdtd2d_3_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_3_lambda); + RPlaunchHipKernel( + (poly_fdtd2d_3_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_3_lambda ); hipErrchk( hipGetLastError() ); - auto poly_fdtd2d_4_lambda = - [=] 
__device__ (Index_type i, Index_type j) { - POLYBENCH_FDTD_2D_BODY4; - }; + auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FDTD_2D_BODY4; + }; - hipLaunchKernelGGL((poly_fdtd2d_4_lam), - dim3(nblocks234), dim3(nthreads_per_block234), shmem, res.get_stream(), - nx, ny, poly_fdtd2d_4_lambda); + RPlaunchHipKernel( + (poly_fdtd2d_4_lam), + nblocks234, nthreads_per_block234, + shmem, res.get_stream(), + nx, ny, poly_fdtd2d_4_lambda ); hipErrchk( hipGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index 7aff525c2..c7692ad0f 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -85,9 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall - <<>>(pout, pin, - k, N); + RPlaunchCudaKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); cudaErrchk( cudaGetLastError() ); } @@ -106,12 +108,18 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_floyd_warshall_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - } - ); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchCudaKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); + cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index c3581748c..8065c5f2e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -85,10 +85,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_floyd_warshall), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - pout, pin, - k, N); + RPlaunchHipKernel( + (poly_floyd_warshall), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + pout, pin, k, N ); hipErrchk( hipGetLastError() ); } @@ -103,19 +104,21 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) for (Index_type k = 0; k < N; ++k) { - auto poly_floyd_warshall_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_FLOYD_WARSHALL_BODY; - }; - POLY_FLOYD_WARSHALL_THREADS_PER_BLOCK_HIP; POLY_FLOYD_WARSHALL_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL( - (poly_floyd_warshall_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_floyd_warshall_lambda); + auto poly_floyd_warshall_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY; + }; + + RPlaunchHipKernel( + (poly_floyd_warshall_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_floyd_warshall_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index afaa17185..b64c5a221 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -90,10 +90,13 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - 
poly_gemm - <<>>(C, A, B, - alpha, beta, - ni, nj, nk); + RPlaunchCudaKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); cudaErrchk( cudaGetLastError() ); } @@ -108,17 +111,21 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemm_lam - <<>>(ni, nj, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMM_BODY1; - POLYBENCH_GEMM_BODY2; - for (Index_type k = 0; k < nk; ++k ) { - POLYBENCH_GEMM_BODY3; + auto poly_gemm_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k ) { + POLYBENCH_GEMM_BODY3; } - ); + POLYBENCH_GEMM_BODY4; + }; + + RPlaunchCudaKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index 4ee83f375..fcbd22e88 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -90,10 +90,13 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLY_GEMM_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemm), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - C, A, B, alpha, beta, - ni, nj, nk); + RPlaunchHipKernel( + (poly_gemm), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + C, A, B, + alpha, beta, + ni, nj, nk ); hipErrchk( hipGetLastError() ); } @@ -117,9 +120,12 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) POLYBENCH_GEMM_BODY4; }; - hipLaunchKernelGGL((poly_gemm_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - ni, nj, poly_gemm_lambda); + RPlaunchHipKernel( + (poly_gemm_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + ni, nj, poly_gemm_lambda ); hipErrchk( hipGetLastError() ); } From 811ba28a19859c3ae266bfb0104ba189bea99d06 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 15 Dec 2023 16:06:05 -0800 Subject: [PATCH 209/454] Convert more kernels to new GPU launch methods --- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 130 +++++++++++++---- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 126 ++++++++++++---- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 12 ++- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 10 +- src/polybench/POLYBENCH_MVT-Cuda.cpp | 12 ++- src/polybench/POLYBENCH_MVT-Hip.cpp | 16 +-- 6 files changed, 178 insertions(+), 128 deletions(-) diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 1360e93b2..d7ae8f280 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void
poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,25 +140,31 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1 - <<>>(A, u1, v1, u2, v2, - n); + RPlaunchCudaKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, u2, v2, n ); cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_2<<>>(A, x, y, - beta, - n); + RPlaunchCudaKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_3<<>>(x, z, - n); + RPlaunchCudaKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_4<<>>(A, x, w, - alpha, - n); + RPlaunchCudaKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); cudaErrchk( cudaGetLastError() ); } @@ -173,43 +179,59 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) GEMVER_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_gemmver_1_lam - <<>>(n, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; - } - ); + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; + }; + + RPlaunchCudaKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; } - ); + 
POLYBENCH_GEMVER_BODY4; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; - } - ); + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); cudaErrchk( cudaGetLastError() ); - poly_gemmver_234_lam<<>>(n, - [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; } - ); + POLYBENCH_GEMVER_BODY8; + }; + + RPlaunchCudaKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index ab1416bf0..7edf133a8 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -41,10 +41,10 @@ namespace polybench template < size_t j_block_size, size_t i_block_size > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, - Real_ptr u2, Real_ptr v2, - Index_type n) +__global__ void poly_gemver_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, + Real_ptr u2, Real_ptr v2, + Index_type n) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -56,7 +56,7 @@ __global__ void poly_gemmver_1(Real_ptr A, template < size_t j_block_size, size_t i_block_size, typename Lambda > __launch_bounds__(j_block_size*i_block_size) -__global__ void poly_gemmver_1_lam(Index_type n, Lambda body) +__global__ void poly_gemver_1_lam(Index_type n, Lambda body) { Index_type i = blockIdx.y * i_block_size + threadIdx.y; Index_type j = blockIdx.x * j_block_size + threadIdx.x; @@ -68,10 +68,10 @@ __global__ void poly_gemmver_1_lam(Index_type n, Lambda body) template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_2(Real_ptr A, - Real_ptr x, Real_ptr y, - Real_type beta, - Index_type n) +__global__ void poly_gemver_2(Real_ptr A, + Real_ptr x, Real_ptr y, + Real_type beta, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -85,8 +85,8 @@ __global__ void poly_gemmver_2(Real_ptr A, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, - Index_type n) +__global__ void poly_gemver_3(Real_ptr x, Real_ptr z, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -96,10 +96,10 @@ __global__ void poly_gemmver_3(Real_ptr x, Real_ptr z, template < size_t block_size > __launch_bounds__(block_size) -__global__ void poly_gemmver_4(Real_ptr A, - Real_ptr x, Real_ptr w, - Real_type alpha, - Index_type n) +__global__ void poly_gemver_4(Real_ptr A, + Real_ptr x, Real_ptr w, + Real_type alpha, + Index_type n) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -113,7 +113,7 @@ __global__ void poly_gemmver_4(Real_ptr A, template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void poly_gemmver_234_lam(Index_type 
n, Lambda body) +__global__ void poly_gemver_234_lam(Index_type n, Lambda body) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < n) { @@ -140,26 +140,31 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gemmver_1), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - A, u1, v1, u2, v2, n); + RPlaunchHipKernel( + (poly_gemver_1), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + A, u1, v1, u2, v2, n ); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); - hipLaunchKernelGGL((poly_gemmver_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, y, beta, n); + RPlaunchHipKernel( (poly_gemver_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x, y, beta, n ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_3), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, z, n); + RPlaunchHipKernel( (poly_gemver_3), + grid_size, block_size, + shmem, res.get_stream(), + x, z, n ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_gemmver_4), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x, w, alpha, n); + RPlaunchHipKernel( (poly_gemver_4), + grid_size, block_size, + shmem, res.get_stream(), + A, x, w, alpha, n ); hipErrchk( hipGetLastError() ); } @@ -174,50 +179,59 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) GEMVER_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_gemmver_1_lambda = [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_GEMVER_BODY1; + auto poly_gemver1_lambda = [=] __device__ (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1; }; - hipLaunchKernelGGL((poly_gemmver_1_lam), - dim3(nblocks1), dim3(nthreads_per_block1), shmem, res.get_stream(), - n, poly_gemmver_1_lambda); + RPlaunchHipKernel( + (poly_gemver_1_lam), + nblocks1, nthreads_per_block1, + shmem, res.get_stream(), + n, poly_gemver1_lambda ); hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); - auto poly_gemmver_2_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY2; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY3; - } - POLYBENCH_GEMVER_BODY4; + auto poly_gemver2_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_2_lambda); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver2_lambda ); hipErrchk( hipGetLastError() ); - auto poly_gemmver_3_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY5; + auto poly_gemver3_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY5; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_3_lambda); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver3_lambda ); hipErrchk( hipGetLastError() ); - auto poly_gemmver_4_lambda = [=] __device__ (Index_type i) { - POLYBENCH_GEMVER_BODY6; - for (Index_type j = 0; j < n; ++j) { - POLYBENCH_GEMVER_BODY7; - } - POLYBENCH_GEMVER_BODY8; + auto poly_gemver4_lambda = [=] __device__ (Index_type i) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + 
POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; }; - hipLaunchKernelGGL((poly_gemmver_234_lam), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - n, poly_gemmver_4_lambda); + RPlaunchHipKernel( (poly_gemver_234_lam), + grid_size, block_size, + shmem, res.get_stream(), + n, poly_gemver4_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 3e921c2d2..9d0157653 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -56,10 +56,14 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_gesummv<<>>(x, y, - A, B, - alpha, beta, - N); + + RPlaunchCudaKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), + x, y, + A, B, + alpha, beta, + N ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index 7f4468849..fbf8f7381 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -56,12 +56,14 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_gesummv), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), + + RPlaunchHipKernel( (poly_gesummv), + grid_size, block_size, + shmem, res.get_stream(), x, y, - A, B, + A, B, alpha, beta, - N); + N ); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index 871fef013..ffa0debe8 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -69,12 +69,18 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - poly_mvt_1<<>>(A, x1, y1, N); + RPlaunchCudaKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); cudaErrchk( cudaGetLastError() ); - poly_mvt_2<<>>(A, x2, y2, N); + RPlaunchCudaKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 32b1b5161..fb6dd4d4d 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -69,16 +69,18 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); - constexpr size_t shmem = 0; + constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_mvt_1), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x1, y1, N); + RPlaunchHipKernel( (poly_mvt_1), + grid_size, block_size, + shmem, res.get_stream(), + A, x1, y1, N ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_mvt_2), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, x2, y2, N); + RPlaunchHipKernel( (poly_mvt_2), + grid_size, block_size, + shmem, res.get_stream(), + A, x2, y2, N ); hipErrchk( hipGetLastError() ); } From f3a975bc0d18c03e0c0ee4157e579d3de243e26a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Sat, 16 Dec 2023 09:46:28 -0800 Subject: [PATCH 
210/454] Convert remaining polybench kernels to new GPU launch methods plus formatting cleanup --- .../POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 9 +-- .../POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 9 +-- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 7 +- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 7 +- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 68 ++++++++++++------- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 65 +++++++++++------- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 10 ++- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 12 ++-- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 62 +++++++++++------ src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 68 +++++++++++-------- 10 files changed, 197 insertions(+), 120 deletions(-) diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index c7692ad0f..d45726e20 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -146,10 +146,11 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index 8065c5f2e..cb67abc1e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -146,10 +146,11 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}, - RAJA::RangeSegment{0, N}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, [=] __device__ (Index_type k, Index_type i, Index_type j) { POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index d7ae8f280..4f31f14c8 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -270,9 +270,10 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 7edf133a8..008149c29 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -270,9 +270,10 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_GEMVER_BODY1_RAJA; } 
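The launch-method conversions in this series all follow one shape: a raw CUDA chevron launch, or a hipLaunchKernelGGL call on the HIP side, becomes a call to RPlaunchCudaKernel or RPlaunchHipKernel taking the kernel, the grid and block dimensions, the shared-memory size, the stream, and then the kernel arguments. The wrapper's definition is not part of these patches; the sketch below is only an illustration of what such a forwarding helper could look like. LaunchCudaKernelSketch is a hypothetical stand-in name, not the suite's actual RPlaunchCudaKernel.

#include <cuda_runtime.h>
#include <utility>  // std::forward

// Minimal sketch, assuming the wrapper simply forwards its arguments to a
// chevron launch on the given stream; the real RAJAPerf helper may add
// error checking or occupancy logic not shown here.
template < typename Kernel, typename... Args >
void LaunchCudaKernelSketch(Kernel kernel,
                            const dim3& grid_size, const dim3& block_size,
                            size_t shmem, cudaStream_t stream,
                            Args&&... args)
{
  kernel<<< grid_size, block_size, shmem, stream >>>(
    std::forward<Args>(args)... );
}

A call site would then mirror the converted launches above, e.g. LaunchCudaKernelSketch( poly_gemver_2<block_size>, grid_size, block_size, shmem, res.get_stream(), A, x, y, beta, n );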
diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 1b63ee758..88c6e725f 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -100,12 +100,18 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_1 - <<>>(A, B, N); + RPlaunchCudaKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_2 - <<>>(A, B, N); + RPlaunchCudaKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); } @@ -124,20 +130,32 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY1; - } - ); + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); cudaErrchk( cudaGetLastError() ); - poly_heat_3D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j, Index_type k) { - POLYBENCH_HEAT_3D_BODY2; - } - ); + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { + POLYBENCH_HEAT_3D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); cudaErrchk( cudaGetLastError() ); } @@ -168,19 +186,21 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index 3a7d7f28e..c244f9470 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -100,14 +100,18 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_heat_3D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( + (poly_heat_3D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_heat_3D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( + (poly_heat_3D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); hipErrchk( hipGetLastError() ); } @@ -126,25 +130,32 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) HEAT_3D_NBLOCKS_HIP; constexpr size_t 
shmem = 0; - auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, Index_type j, + auto poly_heat_3D_1_lambda = [=] __device__ (Index_type i, + Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1; }; - auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, Index_type j, Index_type k) { + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_1_lambda ); + hipErrchk( hipGetLastError() ); + + auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, + Index_type j, + Index_type k) { POLYBENCH_HEAT_3D_BODY2; }; - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_1_lambda); - hipErrchk( hipGetLastError() ); - - hipLaunchKernelGGL((poly_heat_3D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_heat_3D_2_lambda); + RPlaunchHipKernel( + (poly_heat_3D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_heat_3D_2_lambda ); hipErrchk( hipGetLastError() ); } @@ -174,19 +185,21 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index e2c728090..e46c5972d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -63,10 +63,16 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - poly_jacobi_1D_1<<>>(A, B, N); + RPlaunchCudaKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); - poly_jacobi_1D_2<<>>(A, B, N); + RPlaunchCudaKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index b0f60255d..d4c0b9ba4 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -63,12 +63,16 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(N, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_1D_1), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( (poly_jacobi_1D_1), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_1D_2), dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( (poly_jacobi_1D_2), + grid_size, block_size, + shmem, res.get_stream(), + A, B, N 
); hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 1e8a824bb..2287a8b9f 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -96,12 +96,18 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_1 - <<>>(A, B, N); + RPlaunchCudaKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_2 - <<>>(A, B, N); + RPlaunchCudaKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); cudaErrchk( cudaGetLastError() ); } @@ -120,20 +126,30 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_CUDA; constexpr size_t shmem = 0; - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY1; - } - ); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); cudaErrchk( cudaGetLastError() ); - poly_jacobi_2D_lam - <<>>(N, - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - } - ); + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchCudaKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); cudaErrchk( cudaGetLastError() ); } @@ -161,17 +177,19 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 6590a8173..dd83bb5c2 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -96,14 +96,18 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - hipLaunchKernelGGL((poly_jacobi_2D_1), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( + (poly_jacobi_2D_1), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); hipErrchk( hipGetLastError() ); - hipLaunchKernelGGL((poly_jacobi_2D_2), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - A, B, N); + RPlaunchHipKernel( + (poly_jacobi_2D_2), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + A, B, N ); hipErrchk( hipGetLastError() ); } @@ -122,24 +126,30 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) JACOBI_2D_NBLOCKS_HIP; constexpr size_t shmem = 0; - auto poly_jacobi_2D_1_lambda = - [=] __device__ (Index_type i, Index_type j) { - 
POLYBENCH_JACOBI_2D_BODY1; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_1_lambda); + auto poly_jacobi_2D_1_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_1_lambda ); hipErrchk( hipGetLastError() ); - auto poly_jacobi_2D_2_lambda = - [=] __device__ (Index_type i, Index_type j) { - POLYBENCH_JACOBI_2D_BODY2; - }; - - hipLaunchKernelGGL((poly_jacobi_2D_lam), - dim3(nblocks), dim3(nthreads_per_block), shmem, res.get_stream(), - N, poly_jacobi_2D_2_lambda); + auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, + Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + RPlaunchHipKernel( + (poly_jacobi_2D_lam), + nblocks, nthreads_per_block, + shmem, res.get_stream(), + N, poly_jacobi_2D_2_lambda ); hipErrchk( hipGetLastError() ); } @@ -167,17 +177,19 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY1_RAJA; } ); - RAJA::kernel_resource(RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, - RAJA::RangeSegment{1, N-1}), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, [=] __device__ (Index_type i, Index_type j) { POLYBENCH_JACOBI_2D_BODY2_RAJA; } From 847b04045378e5f2f43eff07ebaf4f0a5f7574a6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Sat, 16 Dec 2023 09:47:18 -0800 Subject: [PATCH 211/454] Formatting cleanup --- src/apps/HALOEXCHANGE-Cuda.cpp | 16 ++++++++-------- src/apps/HALOEXCHANGE-Hip.cpp | 16 ++++++++-------- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 3 ++- src/apps/LTIMES-Cuda.cpp | 16 +++++++++------- src/apps/LTIMES-Hip.cpp | 16 +++++++++------- src/apps/LTIMES_NOVIEW-Cuda.cpp | 19 +++++++++++-------- src/apps/LTIMES_NOVIEW-Hip.cpp | 17 ++++++++++------- src/basic/INDEXLIST_3LOOP-Cuda.cpp | 6 ++++-- src/basic/INDEXLIST_3LOOP-Hip.cpp | 6 ++++-- src/basic/NESTED_INIT-Cuda.cpp | 9 +++++---- src/basic/NESTED_INIT-Hip.cpp | 9 +++++---- 11 files changed, 75 insertions(+), 58 deletions(-) diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index b106fce82..0919f98d2 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -115,11 +115,11 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; - }; + HALOEXCHANGE_PACK_BODY; + }; RAJA::forall( res, - RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); buffer += len; } } @@ -132,11 +132,11 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; - }; + HALOEXCHANGE_UNPACK_BODY; + }; RAJA::forall( res, - RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); 
buffer += len; } } diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index f13c48bfb..4a5ecfc42 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -115,11 +115,11 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_pack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_PACK_BODY; - }; + HALOEXCHANGE_PACK_BODY; + }; RAJA::forall( res, - RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); buffer += len; } } @@ -132,11 +132,11 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto haloexchange_unpack_base_lam = [=] __device__ (Index_type i) { - HALOEXCHANGE_UNPACK_BODY; - }; + HALOEXCHANGE_UNPACK_BODY; + }; RAJA::forall( res, - RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); buffer += len; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index aef5adfa1..4535d5bd9 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -307,7 +307,8 @@ void HALOEXCHANGE_FUSED::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { - runCudaVariantWorkGroup(vid); + runCudaVariantWorkGroup(vid); } diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 1f40e840b..7173a0dcf 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -152,14 +152,16 @@ void LTIMES::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 07ce0c7bc..12e619259 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -151,14 +151,16 @@ void LTIMES::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] __device__ (ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }); + LTIMES_BODY_RAJA; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 9b7bd4ac4..8a34f32e4 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -149,14 +149,17 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { - LTIMES_NOVIEW_BODY; - }); + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), 
+ RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + } + ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 4ecf9c344..2549192a8 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -149,14 +149,17 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), - res, - [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, + [=] __device__ (Index_type d, Index_type z, + Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; - }); + } + ); } stopTimer(); diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 8aff37e96..2195eb380 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -150,8 +150,10 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::cuda_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::cuda_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index 2e4d8aa66..d49cd98e4 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -172,8 +172,10 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 
1 : 0; }); - RAJA::exclusive_scan_inplace< RAJA::hip_exec >( res, - RAJA::make_span(counts+ibegin, iend+1-ibegin)); + RAJA::exclusive_scan_inplace< + RAJA::hip_exec >( + res, + RAJA::make_span(counts+ibegin, iend+1-ibegin) ); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 3b5053dcf..4c4323477 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -145,10 +145,11 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 7a58485ee..796665bdd 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -145,10 +145,11 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - res, + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] __device__ (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); From 86dac5084807cd3b5c04dd59cf470afd5880c2c1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 18 Dec 2023 09:34:15 -0800 Subject: [PATCH 212/454] Convert remaining algorithm kernels to new GPU launch method --- src/algorithm/MEMCPY-Cuda.cpp | 19 ++++++++++++++----- src/algorithm/MEMCPY-Hip.cpp | 21 ++++++++++++++------- src/algorithm/MEMSET-Cuda.cpp | 17 +++++++++++------ src/algorithm/MEMSET-Hip.cpp | 17 +++++++++++------ src/algorithm/REDUCE_SUM-Cuda.cpp | 18 ++++++++++-------- src/algorithm/REDUCE_SUM-Hip.cpp | 16 ++++++++++------ 6 files changed, 70 insertions(+), 38 deletions(-) diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index fca6848f8..df8d24ca7 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -48,7 +48,9 @@ void MEMCPY::runCudaVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, cudaMemcpyDefault, res.get_stream()) ); + cudaErrchk( cudaMemcpyAsync(MEMCPY_STD_ARGS, + cudaMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,8 +91,11 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memcpy<<>>( - x, y, iend ); + + RPlaunchCudaKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); cudaErrchk( cudaGetLastError() ); } @@ -107,8 +112,12 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memcpy_lambda ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/algorithm/MEMCPY-Hip.cpp 
b/src/algorithm/MEMCPY-Hip.cpp index d0c239a67..a8b0f6326 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -48,7 +48,9 @@ void MEMCPY::runHipVariantLibrary(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, hipMemcpyDefault, res.get_stream()) ); + hipErrchk( hipMemcpyAsync(MEMCPY_STD_ARGS, + hipMemcpyDefault, + res.get_stream()) ); } stopTimer(); @@ -89,9 +91,11 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memcpy), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, y, iend ); + + RPlaunchHipKernel( (memcpy), + grid_size, block_size, + shmem, res.get_stream(), + x, y, iend ); hipErrchk( hipGetLastError() ); } @@ -108,9 +112,12 @@ void MEMCPY::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memcpy_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memcpy_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index bca349509..e216d0de7 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -89,10 +89,11 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - memset<<>>( x, - val, - iend ); + + RPlaunchCudaKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); cudaErrchk( cudaGetLastError() ); } @@ -109,8 +110,12 @@ void MEMSET::runCudaVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - lambda_cuda_forall<<>>( - ibegin, iend, memset_lambda ); + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); cudaErrchk( cudaGetLastError() ); } diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index d0dacd545..b36c9db86 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -89,9 +89,11 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL( (memset), - dim3(grid_size), dim3(block_size), shmem, res.get_stream(), - x, val, iend ); + + RPlaunchHipKernel( (memset), + grid_size, block_size, + shmem, res.get_stream(), + x, val, iend ); hipErrchk( hipGetLastError() ); } @@ -108,9 +110,12 @@ void MEMSET::runHipVariantBlock(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, memset_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, memset_lambda ); hipErrchk( hipGetLastError() ); } diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 2a8bf57ee..3e5ed478e 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -148,10 +148,11 @@ void REDUCE_SUM::runCudaVariantBlockAtomic(VariantID vid) const 
size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - reduce_sum<<>>( x, - sum, m_sum_init, - iend ); + + RPlaunchCudaKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); @@ -212,10 +213,11 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - reduce_sum<<>>( x, - sum, m_sum_init, - iend ); + + RPlaunchCudaKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index a3520ddd9..1d354f1bd 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -175,9 +175,11 @@ void REDUCE_SUM::runHipVariantBlockAtomic(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = sizeof(Real_type)*block_size; - hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - x, sum, m_sum_init, iend ); + + RPlaunchHipKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); hipErrchk( hipGetLastError() ); RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); @@ -238,9 +240,11 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - hipLaunchKernelGGL( (reduce_sum), dim3(grid_size), dim3(block_size), - shmem, res.get_stream(), - x, sum, m_sum_init, iend ); + + RPlaunchHipKernel( (reduce_sum), + grid_size, block_size, + shmem, res.get_stream(), + x, sum, m_sum_init, iend ); hipErrchk( hipGetLastError() ); RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); From f005f4393c8ca8d9da9b2c094466e9fa9bc52ab6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 18 Dec 2023 09:46:23 -0800 Subject: [PATCH 213/454] Minor change to retrigger CI checks --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812d339b0..111b86d15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ else() endif() option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable -this, and all other variants, to run _only_ raw C loops." On) +this, and all other variants, to run _only_ base variants." 
From f005f4393c8ca8d9da9b2c094466e9fa9bc52ab6 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 18 Dec 2023 09:46:23 -0800
Subject: [PATCH 213/454] Minor change to retrigger CI checks

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 812d339b0..111b86d15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ else()
 endif()
 
 option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable
-this, and all other variants, to run _only_ raw C loops." On)
+this, and all other variants, to run _only_ base variants." On)
 option(ENABLE_KOKKOS "Include Kokkos implementations of the kernels in the RAJA
 Perfsuite" Off)
 #

From 002e2b6258d37f485219c1e430f534f091c41751 Mon Sep 17 00:00:00 2001
From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com>
Date: Tue, 19 Dec 2023 19:07:43 +0100
Subject: [PATCH 214/454] [Woptim] fix mpi exec flag (#403)

* Allow controlled overlapping in flux allocations
* Update to RAJA@develop (grab improved MPI test handling in CI)
* From RAJA: From RSC: fix MPIEXEC flag with spectrum-mpi
* Fix typo
* From RAJA: From RSC: Import the full logic instead of overriding
* From RAJA: From RSC: Fix syntax
* From RAJA: From RSC: Fix missing import
* Remove extra layer of allocation nesting on lassen
* Passing flag to sub allocation
* Attempt to fix quotes usage
* Move to RADIUSS Shared CI 2023.12.0 release branch
* Simplify build_and_test script further, remove alloc option
* From RAJA: From RSC: Do not change flag for spectrum-mpi (fixed error
  elsewhere) but change it for cray-mpich
* From RAJA: From RSC: Fix typo
* From RAJA: From RSC: Fix syntax
---
 .gitlab-ci.yml                        |  4 ++--
 .gitlab/custom-jobs-and-variables.yml |  4 ++--
 scripts/gitlab/build_and_test.sh      | 33 +++++----------------------
 tpl/RAJA                              |  2 +-
 4 files changed, 11 insertions(+), 32 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d02cf3fe6..9a9b83686 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -70,7 +70,7 @@ stages:
 include:
   - local: '.gitlab/custom-jobs-and-variables.yml'
   - project: 'radiuss/radiuss-shared-ci'
-    ref: 'v2023.10.0'
+    ref: 'v2023.12.0'
    file: 'pipelines/${CI_MACHINE}.yml'
   - artifact: '${CI_MACHINE}-jobs.yml'
     job: 'generate-job-lists'
@@ -81,7 +81,7 @@ stages:
 include:
   # [Optional] checks preliminary to running the actual CI test
   #- project: 'radiuss/radiuss-shared-ci'
-  #  ref: 'v2023.10.0'
+  #  ref: 'v2023.12.0'
   #  file: 'utilities/preliminary-ignore-draft-pr.yml'
   # pipelines subscribed by the project
   - local: '.gitlab/subscribed-pipelines.yml'
diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
index 22d7623f3..4c974c1cf 100644
--- a/.gitlab/custom-jobs-and-variables.yml
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -39,7 +39,7 @@ variables:
 # Corona
 # Arguments for top level allocation
-  CORONA_SHARED_ALLOC: "--exclusive --time-limit=10m --nodes=1"
+  CORONA_SHARED_ALLOC: "--exclusive --time-limit=10m --nodes=1 -o per-resource.count=2"
 # Arguments for job level allocation
   CORONA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s"
 # Project specific variants for corona
@@ -49,7 +49,7 @@ variables:
 # Tioga
 # Arguments for top level allocation
-  TIOGA_SHARED_ALLOC: "--exclusive --time-limit=26m --nodes=1"
+  TIOGA_SHARED_ALLOC: "--exclusive --time-limit=26m --nodes=1 -o per-resource.count=2"
 # Arguments for job level allocation
   TIOGA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s"
 # Project specific variants for tioga
diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index c6eec55ac..890ef9d2e 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -227,33 +227,12 @@ then
     # in case we want to make them distinct in the future.
     #
 
-    if echo ${sys_type} | grep -q "blueos" && echo ${spec} | grep -q "cuda" ; then
-        if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path}
-        then
-            echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~"
-            echo "lrun -n1 ... ctest --output-on-failure -T test"
-            echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-            lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test
-        else
-            echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~"
-            echo "lrun -n1 ... ctest --output-on-failure -T test"
-            echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-            lrun -n1 --smpiargs="-disable_gpu_hooks" ctest --output-on-failure -T test
-        fi
-    else
-        if grep -q -i "CMAKE_BUILD_TYPE.*Release" ${hostconfig_path}
-        then
-            echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~"
-            echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt"
-            echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-            ctest --output-on-failure -T test 2>&1 | tee tests_output.txt
-        else
-            echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~"
-            echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt"
-            echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-            ctest --output-on-failure -T test 2>&1 | tee tests_output.txt
-        fi
-    fi
+    echo "~~~~~~~~~ Run Command: ~~~~~~~~~~~~~~~~~~~~~"
+    echo "ctest --output-on-failure -T test 2>&1 | tee tests_output.txt"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    date
+    ctest --output-on-failure -T test 2>&1 | tee tests_output.txt
+    date
 
     no_test_str="No tests were found!!!"
     if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]]
diff --git a/tpl/RAJA b/tpl/RAJA
index 668476510..8ccbfe780 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 668476510d61b0f58ac71ed0c8c54de601c8355c
+Subproject commit 8ccbfe7807fbd128261e28e3bdc504c6cb6dbfe0

From 384ea2d24740a758a4cdd41335576482c173b57f Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com>
Date: Tue, 19 Dec 2023 20:19:19 +0100
Subject: [PATCH 215/454] Small allocation increase

---
 .gitlab/custom-jobs-and-variables.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
index 4c974c1cf..d564a8ae9 100644
--- a/.gitlab/custom-jobs-and-variables.yml
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -60,7 +60,7 @@ variables:
 # Lassen and Butte use a different job scheduler (spectrum lsf) that does not
 # allow pre-allocation the same way slurm does.
 # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 15"
+  LASSEN_JOB_ALLOC: "1 -W 16"
 # Project specific variants for lassen
   PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70"
 # Project specific deps for lassen

From d000cb4f7056ecae5c89a0b009f0999a8d89e07d Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com>
Date: Tue, 19 Dec 2023 21:35:30 +0100
Subject: [PATCH 216/454] No need to specify cray-mpich dependency

---
 .gitlab/jobs/corona.yml | 2 +-
 .gitlab/jobs/tioga.yml  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml
index 519d70f47..dd5df056f 100644
--- a/.gitlab/jobs/corona.yml
+++ b/.gitlab/jobs/corona.yml
@@ -27,7 +27,7 @@ rocmcc_5_6_0_hip:
 
 rocmcc_5_6_0_hip_mpi:
   variables:
-    SPEC: "~shared ~openmp +rocm +mpi amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^cray-mpich ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
+    SPEC: "~shared ~openmp +rocm +mpi amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop"
   extends: .job_on_corona
 
 # With GitLab CI, included files cannot be empty.
diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index f8ce39da4..504d983bc 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -29,5 +29,5 @@ rocmcc_5_6_0_hip_openmp: rocmcc_5_6_0_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^cray-mpich ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" extends: .job_on_tioga From b8fa83a1bda58b22c04b027aa9644b4ce953f4ba Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Dec 2023 12:48:18 -0800 Subject: [PATCH 217/454] Update RAJA to latest develop. RAJA version on RAJA Perf develop has namespace issues in omp target exec pols --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index e00f05675..8f7b40a0b 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit e00f05675b7e633c8bfdde583e25efd3a50bf267 +Subproject commit 8f7b40a0b41d37324d7c8224df059ccecadea3ab From 27ea12b53eacd5c6634da8a4376119ddbb8b5a4e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Dec 2023 12:49:56 -0800 Subject: [PATCH 218/454] Fix compilation issue --- src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp index 8f06ebd2a..bbac67809 100644 --- a/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-OMPTarget.cpp @@ -237,8 +237,6 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) } stopTimer(); - HALOEXCHANGE_FUSED_DATA_TEARDOWN_OMP_TARGET; - } else { getCout() << "\n HALOEXCHANGE_FUSED : Unknown OMP Target variant id = " << vid << std::endl; } @@ -248,7 +246,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - if (vid == Base_OpenMPTarget || vid == Lambda_OpenMPTarget) { + if (vid == Base_OpenMPTarget) { if (tune_idx == t) { @@ -279,7 +277,7 @@ void HALOEXCHANGE_FUSED::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) void HALOEXCHANGE_FUSED::setOpenMPTargetTuningDefinitions(VariantID vid) { - if (vid == Base_OpenMPTarget || vid == Lambda_OpenMPTarget) { + if (vid == Base_OpenMPTarget) { addVariantTuningName(vid, "direct"); From 4b347332aadc1c535da57adaac1d04aae3ff222d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Dec 2023 12:50:29 -0800 Subject: [PATCH 219/454] Fix compilation issue --- src/apps/EDGE3D-OMPTarget.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index bf86d856c..64c5f7dd4 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -61,8 +61,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu } else if ( vid == RAJA_OpenMPTarget ) { - EDGE3D_DATA_SETUP_OMP_TARGET; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From 259e0f8acb48746e87e3d32c072bab0666e53124 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 19 Dec 2023 12:51:04 -0800 Subject: [PATCH 220/454] Add missing macro guard --- src/apps/EDGE3D-Seq.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index 658064427..cebd426b7 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -28,9 +28,11 @@ void EDGE3D::runSeqVariant(VariantID 
vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
   EDGE3D_DATA_SETUP;
 
+#if defined(RUN_RAJA_SEQ)
   auto edge3d_lam = [=](Index_type i) {
                       EDGE3D_BODY;
                     };
+#endif
 
   switch ( vid ) {

From 11e449f1c7524f01c42ce0fa41dfa580a77c36bf Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com>
Date: Wed, 20 Dec 2023 15:39:35 +0100
Subject: [PATCH 221/454] Remove lower level allocation time limit, increase
 allocation time on corona, remove qos on ruby

---
 .gitlab/custom-jobs-and-variables.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
index d564a8ae9..9210db922 100644
--- a/.gitlab/custom-jobs-and-variables.yml
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -15,10 +15,10 @@ variables:
 # Ruby
 # Arguments for top level allocation
-  RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --qos=ci_ruby --time=14 --nodes=2"
+  RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=14 --nodes=2"
 # Arguments for job level allocation
 # Note: We repeat the reservation, necessary when jobs are manually re-triggered.
-  RUBY_JOB_ALLOC: "--overlap --reservation=ci --qos=ci_ruby --time=12 --nodes=1"
+  RUBY_JOB_ALLOC: "--overlap --reservation=ci --nodes=1"
 # Project specific variants for ruby
   PROJECT_RUBY_VARIANTS: "~shared +openmp"
 # Project specific deps for ruby
@@ -31,7 +31,7 @@ variables:
 # We allow allocation overlapping.
   POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=14 --nodes=1"
 # Arguments for job level allocation
-  POODLE_JOB_ALLOC: "--overlap --time=12 --nodes=1"
+  POODLE_JOB_ALLOC: "--overlap --nodes=1"
 # Project specific variants for poodle
   PROJECT_POODLE_VARIANTS: "~shared +openmp"
 # Project specific deps for poodle
@@ -39,9 +39,9 @@ variables:
 # Corona
 # Arguments for top level allocation
-  CORONA_SHARED_ALLOC: "--exclusive --time-limit=10m --nodes=1 -o per-resource.count=2"
+  CORONA_SHARED_ALLOC: "--exclusive --time-limit=12m --nodes=1 -o per-resource.count=2"
 # Arguments for job level allocation
-  CORONA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s"
+  CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
 # Project specific variants for corona
   PROJECT_CORONA_VARIANTS: "~shared ~openmp"
 # Project specific deps for corona
@@ -51,7 +51,7 @@ variables:
 # Arguments for top level allocation
   TIOGA_SHARED_ALLOC: "--exclusive --time-limit=26m --nodes=1 -o per-resource.count=2"
 # Arguments for job level allocation
-  TIOGA_JOB_ALLOC: "--time-limit=8m --nodes=1 --begin-time=+5s"
+  TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
 # Project specific variants for tioga
   PROJECT_TIOGA_VARIANTS: "~shared ~openmp"
 # Project specific deps for tioga

From b1d6b8635c359617f29bf211d5e4db8838c97c9a Mon Sep 17 00:00:00 2001
From: Brian Homerding
Date: Wed, 20 Dec 2023 14:47:24 +0000
Subject: [PATCH 222/454] Update more kernels to memory space, rely on default
 camp context

---
 src/apps/CMakeLists.txt          |  1 +
 src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 61 ++++++++++----------------------
 src/apps/DEL_DOT_VEC_2D.cpp      |  4 +++
 src/apps/DEL_DOT_VEC_2D.hpp      |  4 +++
 src/common/Executor.cpp          |  2 +-
 src/common/KernelBase.cpp        |  4 ---
 src/common/RAJAPerfSuite.cpp     |  2 --
 src/stream/ADD-Sycl.cpp          | 38 +++++---------------
 src/stream/ADD.cpp               |  4 +++
 src/stream/ADD.hpp               |  4 +++
 src/stream/CMakeLists.txt        |  5 +++
 src/stream/COPY-Sycl.cpp         | 34 ++++--------------
 src/stream/COPY.cpp              |  3 ++
 src/stream/COPY.hpp              |  4 +++
 src/stream/DOT-Sycl.cpp          | 36 +++++--------------
 src/stream/DOT.cpp               |  3 ++
 src/stream/DOT.hpp               |  4 +++
 src/stream/MUL-Sycl.cpp          | 34 ++++--------------
 src/stream/MUL.cpp               |  3 ++
 src/stream/MUL.hpp               |  4 +++
 src/stream/TRIAD-Sycl.cpp        | 35 ++++--------------
 src/stream/TRIAD.cpp             |  3 ++
 src/stream/TRIAD.hpp             |  4 +++
 23 files changed, 109 insertions(+), 187 deletions(-)
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index dc3485354..923e8b47f 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -21,6 +21,7 @@ blt_add_library(
   DEL_DOT_VEC_2D-Cuda.cpp
   DEL_DOT_VEC_2D-OMP.cpp
   DEL_DOT_VEC_2D-OMPTarget.cpp
+  DEL_DOT_VEC_2D-Sycl.cpp
   DIFFUSION3DPA.cpp
   DIFFUSION3DPA-Cuda.cpp
   DIFFUSION3DPA-Hip.cpp
diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp
index 27a6a3cfd..23d70ba78 100644
--- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp
+++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp
@@ -23,7 +23,6 @@
 #include <iostream>
 
-#include <CL/sycl.hpp>
 #include "common/SyclDataUtils.hpp"
 
 namespace rajaperf
 {
 namespace apps
 {
 
-  //
-  // Define thread block size for SYCL execution
-  //
-  const size_t block_size = 256;
-
-
-#define DEL_DOT_VEC_2D_DATA_SETUP_SYCL \
-  allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \
-  allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \
-  allocAndInitSyclDeviceData(xdot, m_xdot, m_array_length, qu); \
-  allocAndInitSyclDeviceData(ydot, m_ydot, m_array_length, qu); \
-  allocAndInitSyclDeviceData(div, m_div, m_array_length, qu); \
-  allocAndInitSyclDeviceData(real_zones, m_domain->real_zones, iend, qu);
-
-#define DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL \
-  getSyclDeviceData(m_div, div, m_array_length, qu); \
-  deallocSyclDeviceData(x, qu); \
-  deallocSyclDeviceData(y, qu); \
-  deallocSyclDeviceData(xdot, qu); \
-  deallocSyclDeviceData(ydot, qu); \
-  deallocSyclDeviceData(div, qu); \
-  deallocSyclDeviceData(real_zones, qu);
-
-void DEL_DOT_VEC_2D::runSyclVariant(VariantID vid)
+template <size_t work_group_size>
+void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type iend = m_domain->n_real_zones;
 
+  auto res{getSyclResource()};
+
   DEL_DOT_VEC_2D_DATA_SETUP;
 
   if ( vid == Base_SYCL ) {
+    if (work_group_size != 0) {
 
-    DEL_DOT_VEC_2D_DATA_SETUP_SYCL;
-
-    NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ;
+/*  NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ;
     NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ;
     NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ;
-    NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;
+    NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;*/
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
 
       qu->submit([&] (sycl::handler& h) {
-        h.parallel_for(sycl::nd_range<1> (grid_size, block_size),
-                       [=] (sycl::nd_item<1> item) {
+        h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
 
           Index_type ii = item.get_global_id(0);
           if (ii < iend) {
@@ -92,23 +70,23 @@ void DEL_DOT_VEC_2D::runSyclVariant(VariantID vid)
     qu->wait(); // Wait for computation to finish before stopping timer
     stopTimer();
 
-    DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL;
-
+    }
   } else if ( vid == RAJA_SYCL ) {
 
-    DEL_DOT_VEC_2D_DATA_SETUP_SYCL;
-
-    NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ;
+/*  NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ;
     NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ;
     NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ;
-    NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;
+    NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;*/
 
-    RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones, sycl_res);
+    //RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones, sycl_res);
+    RAJA::TypedListSegment<Index_type> zones(real_zones, iend,
+                                             res, RAJA::Unowned);
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall< RAJA::sycl_exec<block_size> >(
+      RAJA::forall< RAJA::sycl_exec<work_group_size> >(
         zones, [=] (Index_type i) {
           DEL_DOT_VEC_2D_BODY;
         });
 
     }
     qu->wait();
     stopTimer();
 
-    DEL_DOT_VEC_2D_DATA_TEARDOWN_SYCL;
-
-
   } else {
      std::cout << "\n  DEL_DOT_VEC_2D : Unknown Sycl variant id = " << vid << std::endl;
   }
 }
 
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DEL_DOT_VEC_2D, Sycl)
+
 } // end namespace apps
 } // end namespace rajaperf
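[Editor's note: the Base_SYCL variants introduced in this commit all follow
the same launch pattern: round the iteration count up to a multiple of the
work-group size and guard the excess work-items. A self-contained
hypothetical illustration -- the kernel, names, and data below are
assumptions for this note, not RAJAPerf code:

  #include <sycl/sycl.hpp>

  template < size_t work_group_size >
  void triad_like(sycl::queue& q, double* a, const double* b,
                  const double* c, double alpha, size_t len)
  {
    // Global size must be a multiple of the work-group size for nd_range.
    const size_t global_size =
        work_group_size * ((len + work_group_size - 1) / work_group_size);

    q.submit([&](sycl::handler& h) {
      h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                     [=](sycl::nd_item<1> item) {
        const size_t i = item.get_global_id(0);
        if (i < len) {  // guard work-items past the end of the range
          a[i] = b[i] + alpha * c[i];
        }
      });
    }).wait();  // the timed loops in the kernels wait on the queue instead
  }
]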
diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp
index ffe5edeb2..f64eacbc5 100644
--- a/src/apps/DEL_DOT_VEC_2D.cpp
+++ b/src/apps/DEL_DOT_VEC_2D.cpp
@@ -62,6 +62,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
 }
 
 DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D()
diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp
index d82efc12f..8ae5309df 100644
--- a/src/apps/DEL_DOT_VEC_2D.hpp
+++ b/src/apps/DEL_DOT_VEC_2D.hpp
@@ -118,13 +118,17 @@ class DEL_DOT_VEC_2D : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
 
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index ba61c7b08..0e95a02b6 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -233,7 +233,7 @@ void Executor::setupSuite()
 
   getCout() << "\nSetting up suite based on input..."
<< endl; #if defined(RAJA_ENABLE_SYCL) - KernelBase::qu = KernelBase::sycl_res.get().get_queue(); + KernelBase::qu = camp::resources::Sycl().get_queue(); #endif using Svector = vector; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 8f3654c13..d55214f76 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -258,10 +258,6 @@ void KernelBase::execute(VariantID vid, size_t tune_idx) running_variant = vid; running_tuning = tune_idx; -#if defined(RAJA_ENABLE_SYCL) - ::RAJA::sycl::detail::setQueue(&sycl_res); -#endif - resetTimer(); detail::resetDataInitCount(); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 48d796013..4afa1811c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -986,8 +986,6 @@ KernelBase* getKernelObject(KernelID kid, } #if defined(RAJA_ENABLE_SYCL) -sycl::context ctx; -camp::resources::Resource KernelBase::sycl_res{camp::resources::Sycl(ctx)}; sycl::queue* KernelBase::qu; #endif diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp index 071342647..884860b01 100644 --- a/src/stream/ADD-Sycl.cpp +++ b/src/stream/ADD-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,41 +28,26 @@ namespace rajaperf namespace stream { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - -#define ADD_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); \ - allocAndInitSyclDeviceData(b, m_b, iend, qu); \ - allocAndInitSyclDeviceData(c, m_c, iend, qu); - -#define ADD_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_c, c, iend, qu); \ - deallocSyclDeviceData(a, qu); \ - deallocSyclDeviceData(b, qu); \ - deallocSyclDeviceData(c, qu); - -void ADD::runSyclVariant(VariantID vid) +template +void ADD::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + ADD_DATA_SETUP; if ( vid == Base_SYCL ) { - ADD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -78,16 +62,12 @@ void ADD::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - ADD_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - ADD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ADD_BODY; }); @@ -96,13 +76,13 @@ void ADD::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - ADD_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n ADD : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ADD, Sycl) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 02cf25107..7ad0e0eed 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -54,6 +54,10 @@ ADD::ADD(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL 
); + setVariantDefined( RAJA_SYCL ); + } ADD::~ADD() diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 49e09a602..8894773fd 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -53,13 +53,17 @@ class ADD : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index 03351ff5d..6e8b52453 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -14,29 +14,34 @@ blt_add_library( ADD-Cuda.cpp ADD-OMP.cpp ADD-OMPTarget.cpp + ADD-Sycl.cpp COPY.cpp COPY-Seq.cpp COPY-Hip.cpp COPY-Cuda.cpp COPY-OMP.cpp COPY-OMPTarget.cpp + COPY-Sycl.cpp DOT.cpp DOT-Seq.cpp DOT-Hip.cpp DOT-Cuda.cpp DOT-OMP.cpp DOT-OMPTarget.cpp + DOT-Sycl.cpp MUL.cpp MUL-Seq.cpp MUL-Hip.cpp MUL-Cuda.cpp MUL-OMP.cpp MUL-OMPTarget.cpp + MUL-Sycl.cpp TRIAD.cpp TRIAD-Seq.cpp TRIAD-Hip.cpp TRIAD-Cuda.cpp TRIAD-OMPTarget.cpp TRIAD-OMP.cpp + TRIAD-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp index 1408bb1c0..c03ea9d72 100644 --- a/src/stream/COPY-Sycl.cpp +++ b/src/stream/COPY-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,21 +28,8 @@ namespace rajaperf namespace stream { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - -#define COPY_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); \ - allocAndInitSyclDeviceData(c, m_c, iend, qu); - -#define COPY_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_c, c, iend, qu); \ - deallocSyclDeviceData(a, qu); \ - deallocSyclDeviceData(c, qu); - -void COPY::runSyclVariant(VariantID vid) +template +void COPY::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -53,15 +39,13 @@ void COPY::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - COPY_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -76,16 +60,12 @@ void COPY::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - COPY_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - COPY_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { COPY_BODY; }); @@ -94,14 +74,14 @@ void COPY::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - COPY_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n COPY : Unknown Sycl 
variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY, Sycl) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index c92018c63..068017c26 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -54,6 +54,9 @@ COPY::COPY(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } COPY::~COPY() diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 0544e0d2f..7545ba8a6 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -52,13 +52,17 @@ class COPY : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp index cdc8605d4..f7098098c 100644 --- a/src/stream/DOT-Sycl.cpp +++ b/src/stream/DOT-Sycl.cpp @@ -22,21 +22,8 @@ namespace rajaperf namespace stream { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define DOT_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); \ - allocAndInitSyclDeviceData(b, m_b, iend, qu); - -#define DOT_DATA_TEARDOWN_SYCL \ - deallocSyclDeviceData(a, qu); \ - deallocSyclDeviceData(b, qu); - -void DOT::runSyclVariant(VariantID vid) +template +void DOT::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -45,9 +32,7 @@ void DOT::runSyclVariant(VariantID vid) DOT_DATA_SETUP; if ( vid == Base_SYCL ) { - - DOT_DATA_SETUP_SYCL; - + if (work_group_size != 0) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -56,13 +41,13 @@ void DOT::runSyclVariant(VariantID vid) { sycl::buffer buf_dot(&dot, 1); - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { auto sumReduction = reduction(buf_dot, h, sycl::plus()); - h.parallel_for(sycl::nd_range<1>{grid_size, block_size}, + h.parallel_for(sycl::nd_range<1>{global_size, work_group_size}, sumReduction, [=] (sycl::nd_item<1> item, auto& dot) { @@ -79,19 +64,16 @@ void DOT::runSyclVariant(VariantID vid) } stopTimer(); - - DOT_DATA_TEARDOWN_SYCL; + } } else if ( vid == RAJA_SYCL ) { - DOT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::ReduceSum dot(m_dot_init); - RAJA::forall< RAJA::sycl_exec_nontrivial >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DOT_BODY; }); @@ -101,13 +83,13 @@ void DOT::runSyclVariant(VariantID vid) } stopTimer(); - DOT_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n DOT : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DOT, Sycl) + } // end namespace stream } // end namespace rajaperf diff --git 
a/src/stream/DOT.cpp b/src/stream/DOT.cpp index cc32be5f2..0d64f271d 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -54,6 +54,9 @@ DOT::DOT(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } DOT::~DOT() diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 856caef14..1e142743a 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -52,9 +52,11 @@ class DOT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantBlock(VariantID vid); template < size_t block_size > @@ -63,6 +65,8 @@ class DOT : public KernelBase void runHipVariantBlock(VariantID vid); template < size_t block_size > void runHipVariantOccGS(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index e2c6aa0f7..a9c946d63 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,21 +28,8 @@ namespace rajaperf namespace stream { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - -#define MUL_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(b, m_b, iend, qu); \ - allocAndInitSyclDeviceData(c, m_c, iend, qu); - -#define MUL_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_b, b, iend, qu); \ - deallocSyclDeviceData(b, qu); \ - deallocSyclDeviceData(c, qu) - -void MUL::runSyclVariant(VariantID vid) +template +void MUL::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -53,15 +39,13 @@ void MUL::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - MUL_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -75,16 +59,12 @@ void MUL::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - MUL_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - MUL_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { MUL_BODY; }); @@ -93,13 +73,13 @@ void MUL::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - MUL_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n MUL : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MUL, Sycl) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index fba825bf6..3ddec6e07 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -54,6 +54,9 
@@ MUL::MUL(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } MUL::~MUL() diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 3db59092a..2765ccc92 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -53,13 +53,17 @@ class MUL : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index 45083881c..eeb04d94f 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -29,23 +29,8 @@ namespace rajaperf namespace stream { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - -#define TRIAD_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(a, m_a, iend, qu); \ - allocAndInitSyclDeviceData(b, m_b, iend, qu); \ - allocAndInitSyclDeviceData(c, m_c, iend, qu); - -#define TRIAD_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_a, a, iend, qu); \ - deallocSyclDeviceData(a, qu); \ - deallocSyclDeviceData(b, qu); \ - deallocSyclDeviceData(c, qu); - -void TRIAD::runSyclVariant(VariantID vid) +template +void TRIAD::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -55,14 +40,12 @@ void TRIAD::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - TRIAD_DATA_SETUP_SYCL; - - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -77,16 +60,12 @@ void TRIAD::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - TRIAD_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - TRIAD_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { TRIAD_BODY; }); @@ -95,14 +74,14 @@ void TRIAD::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - TRIAD_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n TRIAD : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIAD, Sycl) + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index d9897618c..3dab7fd95 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -58,6 +58,9 @@ TRIAD::TRIAD(const RunParams& params) setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } TRIAD::~TRIAD() diff --git 
a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp
index 3f65bf804..19cb33be5 100644
--- a/src/stream/TRIAD.hpp
+++ b/src/stream/TRIAD.hpp
@@ -54,13 +54,17 @@ class TRIAD : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
   void runKokkosVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
 
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;

From a222aaf8bafb6f60f6760262956bf0cab2a6d9da Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com>
Date: Wed, 20 Dec 2023 18:15:54 +0100
Subject: [PATCH 223/454] Point at RAJA@develop

---
 tpl/RAJA | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpl/RAJA b/tpl/RAJA
index 8ccbfe780..19116582a 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 8ccbfe7807fbd128261e28e3bdc504c6cb6dbfe0
+Subproject commit 19116582ad9ddd87b724656be36bb73e624b13f2

From aa11d0f6faa1b5577b789bb436ee448fa5630fd0 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 20 Dec 2023 10:10:00 -0800
Subject: [PATCH 224/454] fix launch syntax in HALO_EXCHANGE

---
 src/comm/HALO_EXCHANGE-Cuda.cpp       | 10 ++++++++--
 src/comm/HALO_EXCHANGE-Hip.cpp        | 12 ++++++++----
 src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp | 18 ++++++++++++++----
 src/comm/HALO_EXCHANGE_FUSED-Hip.cpp  | 18 ++++++++++++++----
 4 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp
index 0a2078f46..52962d618 100644
--- a/src/comm/HALO_EXCHANGE-Cuda.cpp
+++ b/src/comm/HALO_EXCHANGE-Cuda.cpp
@@ -75,7 +75,10 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          HALO_exchange_pack<block_size><<<nblocks, nthreads_per_block, shmem, res.get_stream()>>>(buffer, list, var, len);
+          RPlaunchCudaKernel( (HALO_exchange_pack<block_size>),
+                              nblocks, nthreads_per_block,
+                              shmem, res.get_stream(),
+                              buffer, list, var, len);
           cudaErrchk( cudaGetLastError() );
           buffer += len;
         }
@@ -109,7 +112,10 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          HALO_exchange_unpack<block_size><<<nblocks, nthreads_per_block, shmem, res.get_stream()>>>(buffer, list, var, len);
+          RPlaunchCudaKernel( (HALO_exchange_unpack<block_size>),
+                              nblocks, nthreads_per_block,
+                              shmem, res.get_stream(),
+                              buffer, list, var, len);
           cudaErrchk( cudaGetLastError() );
           buffer += len;
         }
diff --git a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp
index 3a78dc389..97224e29a 100644
--- a/src/comm/HALO_EXCHANGE-Hip.cpp
+++ b/src/comm/HALO_EXCHANGE-Hip.cpp
@@ -75,8 +75,10 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          hipLaunchKernelGGL((HALO_exchange_pack<block_size>), nblocks, nthreads_per_block, shmem, res.get_stream(),
-              buffer, list, var, len);
+          RPlaunchHipKernel( (HALO_exchange_pack<block_size>),
+                             nblocks, nthreads_per_block,
+                             shmem, res.get_stream(),
+                             buffer, list, var, len);
           hipErrchk( hipGetLastError() );
           buffer += len;
         }
@@ -110,8 +112,10 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid)
           dim3 nthreads_per_block(block_size);
           dim3 nblocks((len + block_size-1) / block_size);
           constexpr size_t shmem = 0;
-          hipLaunchKernelGGL((HALO_exchange_unpack<block_size>), nblocks, nthreads_per_block, shmem, res.get_stream(),
-              buffer, list, var, len);
+          RPlaunchHipKernel( (HALO_exchange_unpack<block_size>),
+                             nblocks, nthreads_per_block,
+                             shmem, res.get_stream(),
+                             buffer, list, var, len);
           hipErrchk( hipGetLastError() );
           buffer += len;
         }
diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
index 68f33ab57..13208b446 100644
--- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
+++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
@@ -133,8 +133,13 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)
       Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index;
       dim3 pack_nthreads_per_block(block_size);
       dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index);
-      HALO_exchange_fused_pack<block_size><<<pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream()>>>(
-          pack_buffer_ptrs, pack_list_ptrs, pack_var_ptrs, pack_len_ptrs);
+      RPlaunchCudaKernel( (HALO_exchange_fused_pack<block_size>),
+                          pack_nblocks, pack_nthreads_per_block,
+                          shmem, res.get_stream(),
+                          pack_buffer_ptrs,
+                          pack_list_ptrs,
+                          pack_var_ptrs,
+                          pack_len_ptrs);
       cudaErrchk( cudaGetLastError() );
       if (separate_buffers) {
         for (Index_type l = 0; l < num_neighbors; ++l) {
@@ -180,8 +185,13 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)
       Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index;
       dim3 unpack_nthreads_per_block(block_size);
       dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index);
-      HALO_exchange_fused_unpack<block_size><<<unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream()>>>(
-          unpack_buffer_ptrs, unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs);
+      RPlaunchCudaKernel( (HALO_exchange_fused_unpack<block_size>),
+                          unpack_nblocks,
unpack_nthreads_per_block, + shmem, res.get_stream(), + unpack_buffer_ptrs, + unpack_list_ptrs, + unpack_var_ptrs, + unpack_len_ptrs); hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); From d156f5400228971db057dcc686969a37c8272500 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 20 Dec 2023 10:29:27 -0800 Subject: [PATCH 225/454] fix capitalization in HALO_EXCHANGE --- src/comm/HALO_EXCHANGE-Cuda.cpp | 16 ++++++++-------- src/comm/HALO_EXCHANGE-Hip.cpp | 16 ++++++++-------- src/comm/HALO_EXCHANGE-OMP.cpp | 16 ++++++++-------- src/comm/HALO_EXCHANGE-OMPTarget.cpp | 8 ++++---- src/comm/HALO_EXCHANGE-Seq.cpp | 16 ++++++++-------- src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp | 8 ++++---- src/comm/HALO_EXCHANGE_FUSED-Hip.cpp | 8 ++++---- 7 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp index 52962d618..96128e962 100644 --- a/src/comm/HALO_EXCHANGE-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -75,7 +75,7 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (HALO_exchange_pack), + RPlaunchCudaKernel( (halo_exchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); @@ -112,7 +112,7 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (HALO_exchange_unpack), + RPlaunchCudaKernel( (halo_exchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); @@ -146,12 +146,12 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_exchange_pack_base_lam ); buffer += len; } @@ -181,12 +181,12 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_exchange_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp index 97224e29a..25f6a338a 100644 --- a/src/comm/HALO_EXCHANGE-Hip.cpp +++ 
b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -75,7 +75,7 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchHipKernel( (HALO_exchange_pack), + RPlaunchHipKernel( (halo_exchange_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); @@ -112,7 +112,7 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchHipKernel( (HALO_exchange_unpack), + RPlaunchHipKernel( (halo_exchange_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len); @@ -146,12 +146,12 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { + auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_exchange_pack_base_lam ); buffer += len; } @@ -181,12 +181,12 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto halo_exchange_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_exchange_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_EXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp index 44a55db92..8d48f03da 100644 --- a/src/comm/HALO_EXCHANGE-OMP.cpp +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -112,12 +112,12 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALO_exchange_pack_base_lam(i); + halo_exchange_pack_base_lam(i); } buffer += len; } @@ -147,12 +147,12 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALO_exchange_unpack_base_lam(i); + halo_exchange_unpack_base_lam(i); } buffer += len; } @@ -185,12 
+185,12 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_exchange_pack_base_lam ); buffer += len; } @@ -219,12 +219,12 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_exchange_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_EXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp index 0a0340810..32d10d5a6 100644 --- a/src/comm/HALO_EXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -116,12 +116,12 @@ void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_exchange_pack_base_lam ); buffer += len; } @@ -150,12 +150,12 @@ void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_exchange_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_EXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp index 62337fa1a..2d7ee28dc 100644 --- a/src/comm/HALO_EXCHANGE-Seq.cpp +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -109,11 +109,11 @@ void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - HALO_exchange_pack_base_lam(i); + halo_exchange_pack_base_lam(i); } buffer += len; } @@ -143,11 +143,11 @@ void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - HALO_exchange_unpack_base_lam(i); + halo_exchange_unpack_base_lam(i); } buffer += len; } @@ -180,12 +180,12 @@ void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_exchange_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - 
HALO_exchange_pack_base_lam ); + halo_exchange_pack_base_lam ); buffer += len; } @@ -214,12 +214,12 @@ void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_exchange_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_exchange_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index 13208b446..7a7beb168 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, +__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pa template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -133,7 +133,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - RPlaunchCudaKernel( (HALO_exchange_fused_pack), + RPlaunchCudaKernel( (halo_exchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, @@ -185,7 +185,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - RPlaunchCudaKernel( (HALO_exchange_fused_unpack), + RPlaunchCudaKernel( (halo_exchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index 78c08e49d..5b483f9b9 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, +__global__ void halo_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) { Index_type j = blockIdx.y; @@ -70,7 +70,7 @@ __global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pa template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, +__global__ void halo_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) { Index_type j = blockIdx.y; @@ -133,7 +133,7 @@ void 
HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - RPlaunchHipKernel( (HALO_exchange_fused_pack), + RPlaunchHipKernel( (halo_exchange_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, @@ -185,7 +185,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) Index_type unpack_len_ave = (unpack_len_sum + unpack_index-1) / unpack_index; dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - RPlaunchHipKernel( (HALO_exchange_fused_unpack), + RPlaunchHipKernel( (halo_exchange_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, From 3c87f6a5b300423c8486d4be180f2c70a8b7b036 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 20 Dec 2023 10:29:41 -0800 Subject: [PATCH 226/454] Fix capitalization and naming in HALO_PACKING --- src/comm/HALO_PACKING-Cuda.cpp | 16 ++++++++-------- src/comm/HALO_PACKING-Hip.cpp | 16 ++++++++-------- src/comm/HALO_PACKING-OMP.cpp | 16 ++++++++-------- src/comm/HALO_PACKING-OMPTarget.cpp | 8 ++++---- src/comm/HALO_PACKING-Seq.cpp | 16 ++++++++-------- src/comm/HALO_PACKING_FUSED-Cuda.cpp | 8 ++++---- src/comm/HALO_PACKING_FUSED-Hip.cpp | 8 ++++---- 7 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/comm/HALO_PACKING-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp index b87478419..709b8f8f8 100644 --- a/src/comm/HALO_PACKING-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -69,7 +69,7 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (HALO_exchange_pack), + RPlaunchCudaKernel( (halo_packing_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); @@ -101,7 +101,7 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (HALO_exchange_unpack), + RPlaunchCudaKernel( (halo_packing_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); @@ -127,12 +12,12 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam );
+ halo_packing_pack_base_lam ); buffer += len; } @@ -157,12 +157,12 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_PACKING-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp index 49d0e91ee..b3773c5f8 100644 --- a/src/comm/HALO_PACKING-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -23,7 +23,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -35,7 +35,7 @@ __global__ void HALO_exchange_pack(Real_ptr buffer, Int_ptr list, Real_ptr var, template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, +__global__ void halo_packing_unpack(Real_ptr buffer, Int_ptr list, Real_ptr var, Index_type len) { Index_type i = threadIdx.x + blockIdx.x * block_size; @@ -69,7 +69,7 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchHipKernel( (HALO_exchange_pack), + RPlaunchHipKernel( (halo_packing_pack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); @@ -101,7 +101,7 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) dim3 nthreads_per_block(block_size); dim3 nblocks((len + block_size-1) / block_size); constexpr size_t shmem = 0; - RPlaunchHipKernel( (HALO_exchange_unpack), + RPlaunchHipKernel( (halo_packing_unpack), nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); @@ -127,12 +127,12 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=] __device__ (Index_type i) { + auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { HALO_PACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } @@ -157,12 +157,12 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=] __device__ (Index_type i) { + auto halo_packing_unpack_base_lam = [=] __device__ (Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( res, RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_PACKING-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp index 165688a3e..5a6ecf807 100644 --- a/src/comm/HALO_PACKING-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -90,12 +90,12 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_packing_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; 
#pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALO_exchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } @@ -119,12 +119,12 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_packing_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; #pragma omp parallel for for (Index_type i = 0; i < len; i++) { - HALO_exchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -149,12 +149,12 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_packing_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } @@ -177,12 +177,12 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_packing_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_PACKING-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp index cb0c818aa..5a8e68f6d 100644 --- a/src/comm/HALO_PACKING-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -96,12 +96,12 @@ void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_packing_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } @@ -124,12 +124,12 @@ void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_packing_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_PACKING-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp index 1b8194390..75bc30bca 100644 --- a/src/comm/HALO_PACKING-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -88,11 +88,11 @@ void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_packing_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; for (Index_type i = 0; i < len; i++) { - HALO_exchange_pack_base_lam(i); + halo_packing_pack_base_lam(i); } buffer += len; } @@ -116,11 +116,11 @@ void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto 
halo_packing_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; for (Index_type i = 0; i < len; i++) { - HALO_exchange_unpack_base_lam(i); + halo_packing_unpack_base_lam(i); } buffer += len; } @@ -145,12 +145,12 @@ void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_pack_base_lam = [=](Index_type i) { + auto halo_packing_pack_base_lam = [=](Index_type i) { HALO_PACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_pack_base_lam ); + halo_packing_pack_base_lam ); buffer += len; } @@ -173,12 +173,12 @@ void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; - auto HALO_exchange_unpack_base_lam = [=](Index_type i) { + auto halo_packing_unpack_base_lam = [=](Index_type i) { HALO_UNPACK_BODY; }; RAJA::forall( RAJA::TypedRangeSegment(0, len), - HALO_exchange_unpack_base_lam ); + halo_packing_unpack_base_lam ); buffer += len; } } diff --git a/src/comm/HALO_PACKING_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp index 3adb24a84..d14bc0ebe 100644 --- a/src/comm/HALO_PACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) @@ -72,7 +72,7 @@ __global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) @@ -131,7 +131,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - RPlaunchCudaKernel( (HALO_exchange_fused_pack), + RPlaunchCudaKernel( (halo_packing_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, @@ -178,7 +178,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - RPlaunchCudaKernel( (HALO_exchange_fused_unpack), + RPlaunchCudaKernel( (halo_packing_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp index 045081295..5132fafb6 100644 --- a/src/comm/HALO_PACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -51,7 +51,7 @@ namespace comm template < size_t block_size > __launch_bounds__(block_size) -__global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, +__global__ void halo_packing_fused_pack(Real_ptr* pack_buffer_ptrs, Int_ptr* pack_list_ptrs, Real_ptr* pack_var_ptrs, Index_type* pack_len_ptrs) @@ -72,7 +72,7 @@ __global__ void HALO_exchange_fused_pack(Real_ptr* pack_buffer_ptrs, template < size_t block_size > __launch_bounds__(block_size) -__global__ void 
HALO_exchange_fused_unpack(Real_ptr* unpack_buffer_ptrs, +__global__ void halo_packing_fused_unpack(Real_ptr* unpack_buffer_ptrs, Int_ptr* unpack_list_ptrs, Real_ptr* unpack_var_ptrs, Index_type* unpack_len_ptrs) @@ -131,7 +131,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) Index_type pack_len_ave = (pack_len_sum + pack_index-1) / pack_index; dim3 pack_nthreads_per_block(block_size); dim3 pack_nblocks((pack_len_ave + block_size-1) / block_size, pack_index); - RPlaunchHipKernel( (HALO_exchange_fused_pack), + RPlaunchHipKernel( (halo_packing_fused_pack), pack_nblocks, pack_nthreads_per_block, shmem, res.get_stream(), pack_buffer_ptrs, @@ -178,7 +178,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) dim3 unpack_nthreads_per_block(block_size); dim3 unpack_nblocks((unpack_len_ave + block_size-1) / block_size, unpack_index); - RPlaunchHipKernel( (HALO_exchange_fused_unpack), + RPlaunchHipKernel( (halo_packing_fused_unpack), unpack_nblocks, unpack_nthreads_per_block, shmem, res.get_stream(), unpack_buffer_ptrs, From eb6770ea9ca85fa8b5babed6d536aa5335681aa5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 20 Dec 2023 10:34:09 -0800 Subject: [PATCH 227/454] Fix spacing in comm --- src/comm/HALO_EXCHANGE-Cuda.cpp | 4 ++-- src/comm/HALO_EXCHANGE-Hip.cpp | 4 ++-- src/comm/HALO_EXCHANGE-OMP.cpp | 6 +++--- src/comm/HALO_EXCHANGE-OMPTarget.cpp | 4 ++-- src/comm/HALO_EXCHANGE-Seq.cpp | 2 +- src/comm/HALO_EXCHANGE.hpp | 4 ++-- src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp | 4 ++-- src/comm/HALO_EXCHANGE_FUSED-Hip.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-OMP.cpp | 10 +++++----- src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp | 4 ++-- src/comm/HALO_EXCHANGE_FUSED-Seq.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED.hpp | 4 ++-- src/comm/HALO_PACKING-Cuda.cpp | 4 ++-- src/comm/HALO_PACKING-Hip.cpp | 4 ++-- src/comm/HALO_PACKING-OMP.cpp | 6 +++--- src/comm/HALO_PACKING-OMPTarget.cpp | 4 ++-- src/comm/HALO_PACKING-Seq.cpp | 2 +- src/comm/HALO_PACKING.hpp | 4 ++-- src/comm/HALO_PACKING_FUSED-Cuda.cpp | 4 ++-- src/comm/HALO_PACKING_FUSED-Hip.cpp | 2 +- src/comm/HALO_PACKING_FUSED-OMP.cpp | 10 +++++----- src/comm/HALO_PACKING_FUSED-OMPTarget.cpp | 4 ++-- src/comm/HALO_PACKING_FUSED-Seq.cpp | 2 +- src/comm/HALO_PACKING_FUSED.hpp | 4 ++-- src/comm/HALO_SENDRECV-Cuda.cpp | 2 +- src/comm/HALO_SENDRECV-Hip.cpp | 2 +- src/comm/HALO_SENDRECV-OMP.cpp | 2 +- src/comm/HALO_SENDRECV-OMPTarget.cpp | 2 +- src/comm/HALO_SENDRECV.hpp | 4 ++-- 29 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp index 96128e962..26be980c1 100644 --- a/src/comm/HALO_EXCHANGE-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -69,7 +69,7 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -143,7 +143,7 @@ void HALO_EXCHANGE::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { diff --git 
a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp index 25f6a338a..1e9efb823 100644 --- a/src/comm/HALO_EXCHANGE-Hip.cpp +++ b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -69,7 +69,7 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -143,7 +143,7 @@ void HALO_EXCHANGE::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALO_EXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp index 8d48f03da..1ce83af12 100644 --- a/src/comm/HALO_EXCHANGE-OMP.cpp +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -44,7 +44,7 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp parallel for @@ -109,7 +109,7 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=](Index_type i) { @@ -182,7 +182,7 @@ void HALO_EXCHANGE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALO_EXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp index 32d10d5a6..c3100e996 100644 --- a/src/comm/HALO_EXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -47,7 +47,7 @@ void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) @@ -113,7 +113,7 @@ void HALO_EXCHANGE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=](Index_type i) { diff 
--git a/src/comm/HALO_EXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp index 2d7ee28dc..e494e374a 100644 --- a/src/comm/HALO_EXCHANGE-Seq.cpp +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -177,7 +177,7 @@ void HALO_EXCHANGE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_exchange_pack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALO_EXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp index 5474a6837..7d46cedd7 100644 --- a/src/comm/HALO_EXCHANGE.hpp +++ b/src/comm/HALO_EXCHANGE.hpp @@ -20,7 +20,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; @@ -40,7 +40,7 @@ /// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index 7a7beb168..0492c520d 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -118,7 +118,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; @@ -164,7 +164,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index 5b483f9b9..1a51fd7ed 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -118,7 +118,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp index 4cf972111..05c314e20 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -106,7 +106,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = 
unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], @@ -178,7 +178,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -232,7 +232,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], @@ -356,7 +356,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); @@ -384,7 +384,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp index eb04ae398..04efe9b4d 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -246,7 +246,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); @@ -274,7 +274,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp index d439230ae..f7c5169f7 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp @@ -308,7 +308,7 @@ void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { 
copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_EXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp index 3a6236a3b..2373935c3 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.hpp +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -20,7 +20,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; @@ -44,7 +44,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; diff --git a/src/comm/HALO_PACKING-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp index 709b8f8f8..a48c14f6e 100644 --- a/src/comm/HALO_PACKING-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -63,7 +63,7 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -124,7 +124,7 @@ void HALO_PACKING::runCudaVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALO_PACKING-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp index b3773c5f8..ec0118316 100644 --- a/src/comm/HALO_PACKING-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -63,7 +63,7 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; dim3 nthreads_per_block(block_size); @@ -124,7 +124,7 @@ void HALO_PACKING::runHipVariantImpl(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=] __device__ (Index_type i) { diff --git a/src/comm/HALO_PACKING-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp index 5a6ecf807..d88433ba0 100644 --- a/src/comm/HALO_PACKING-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -36,7 +36,7 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma 
omp parallel for @@ -87,7 +87,7 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=](Index_type i) { @@ -146,7 +146,7 @@ void HALO_PACKING::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALO_PACKING-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp index 5a8e68f6d..a96c240ae 100644 --- a/src/comm/HALO_PACKING-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -41,7 +41,7 @@ void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; #pragma omp target is_device_ptr(buffer, list, var) device( did ) @@ -93,7 +93,7 @@ void HALO_PACKING::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALO_PACKING-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp index 75bc30bca..da77fb0a3 100644 --- a/src/comm/HALO_PACKING-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -142,7 +142,7 @@ void HALO_PACKING::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; auto halo_packing_pack_base_lam = [=](Index_type i) { diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp index e14199a4d..2ae609e72 100644 --- a/src/comm/HALO_PACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -13,7 +13,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; @@ -28,7 +28,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; diff --git a/src/comm/HALO_PACKING_FUSED-Cuda.cpp 
b/src/comm/HALO_PACKING_FUSED-Cuda.cpp index d14bc0ebe..9608f00b7 100644 --- a/src/comm/HALO_PACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -116,7 +116,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; @@ -155,7 +155,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp index 5132fafb6..6f2a7123a 100644 --- a/src/comm/HALO_PACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -116,7 +116,7 @@ void HALO_PACKING_FUSED::runHipVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pack_buffer_ptrs[pack_index] = buffer; diff --git a/src/comm/HALO_PACKING_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp index b7d48e69d..a8700bc03 100644 --- a/src/comm/HALO_PACKING_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -91,7 +91,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], @@ -155,7 +155,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; new(&pack_lambdas[pack_index]) pack_lambda_type(make_pack_lambda(buffer, list, var)); @@ -202,7 +202,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantDirect(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], @@ -318,7 +318,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); @@ -339,7 +339,7 @@ void HALO_PACKING_FUSED::runOpenMPVariantWorkGroup(VariantID vid) for 
(Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index cddfa703f..950c94bfd 100644 --- a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -225,7 +225,7 @@ void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = pack_buffers[l]; Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { Real_ptr var = vars[v]; pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); @@ -246,7 +246,7 @@ void HALO_PACKING_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_PACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp index 3bd4ee571..ba3482528 100644 --- a/src/comm/HALO_PACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -263,7 +263,7 @@ void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) for (Index_type l = 0; l < num_neighbors; ++l) { Real_ptr buffer = unpack_buffers[l]; Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + Index_type len = unpack_index_list_lengths[l]; if (separate_buffers) { copyData(dataSpace, unpack_buffers[l], DataSpace::Host, recv_buffers[l], diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp index a80311de3..559632142 100644 --- a/src/comm/HALO_PACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -13,7 +13,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; @@ -28,7 +28,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; diff --git a/src/comm/HALO_SENDRECV-Cuda.cpp b/src/comm/HALO_SENDRECV-Cuda.cpp index f6f182e60..e0e360675 100644 --- a/src/comm/HALO_SENDRECV-Cuda.cpp +++ b/src/comm/HALO_SENDRECV-Cuda.cpp @@ -40,7 +40,7 @@ void HALO_SENDRECV::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } for (Index_type l = 0; l < num_neighbors; ++l) { - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } diff --git a/src/comm/HALO_SENDRECV-Hip.cpp 
b/src/comm/HALO_SENDRECV-Hip.cpp index fc03ab487..8cf25b4e3 100644 --- a/src/comm/HALO_SENDRECV-Hip.cpp +++ b/src/comm/HALO_SENDRECV-Hip.cpp @@ -40,7 +40,7 @@ void HALO_SENDRECV::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } for (Index_type l = 0; l < num_neighbors; ++l) { - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } diff --git a/src/comm/HALO_SENDRECV-OMP.cpp b/src/comm/HALO_SENDRECV-OMP.cpp index 23efb5143..663a1dd0b 100644 --- a/src/comm/HALO_SENDRECV-OMP.cpp +++ b/src/comm/HALO_SENDRECV-OMP.cpp @@ -42,7 +42,7 @@ void HALO_SENDRECV::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t } for (Index_type l = 0; l < num_neighbors; ++l) { - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } diff --git a/src/comm/HALO_SENDRECV-OMPTarget.cpp b/src/comm/HALO_SENDRECV-OMPTarget.cpp index e5a3f7723..d04814125 100644 --- a/src/comm/HALO_SENDRECV-OMPTarget.cpp +++ b/src/comm/HALO_SENDRECV-OMPTarget.cpp @@ -45,7 +45,7 @@ void HALO_SENDRECV::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED } for (Index_type l = 0; l < num_neighbors; ++l) { - Index_type len = pack_index_list_lengths[l]; + Index_type len = pack_index_list_lengths[l]; MPI_Isend(send_buffers[l], len*num_vars, Real_MPI_type, mpi_ranks[l], send_tags[l], MPI_COMM_WORLD, &pack_mpi_requests[l]); } diff --git a/src/comm/HALO_SENDRECV.hpp b/src/comm/HALO_SENDRECV.hpp index a64ca4d12..fbefaadec 100644 --- a/src/comm/HALO_SENDRECV.hpp +++ b/src/comm/HALO_SENDRECV.hpp @@ -20,7 +20,7 @@ /// for (Index_type l = 0; l < num_neighbors; ++l) { /// Real_ptr buffer = pack_buffers[l]; /// Int_ptr list = pack_index_lists[l]; -/// Index_type len = pack_index_list_lengths[l]; +/// Index_type len = pack_index_list_lengths[l]; /// // pack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; @@ -40,7 +40,7 @@ /// MPI_Wait(&unpack_mpi_requests[l], MPI_STATUS_IGNORE); /// Real_ptr buffer = unpack_buffers[l]; /// Int_ptr list = unpack_index_lists[l]; -/// Index_type len = unpack_index_list_lengths[l]; +/// Index_type len = unpack_index_list_lengths[l]; /// // unpack part of each variable /// for (Index_type v = 0; v < num_vars; ++v) { /// Real_ptr var = vars[v]; From f23c703d1b66f9c3fdb8f1fb437e737b2e3dc1fc Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Dec 2023 10:41:57 -0800 Subject: [PATCH 228/454] Convert HIP variants to use new launch methods --- src/basic/DAXPY_ATOMIC-Hip.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 0688950b0..4dac76c2b 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -52,8 +52,11 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((daxpy_atomic),dim3(grid_size), dim3(block_size), shmem, res.get_stream(), y, x, a, - iend ); + + RPlaunchHipKernel( (daxpy_atomic), + grid_size, block_size, + shmem, res.get_stream(), + y, x, a, iend ); hipErrchk( hipGetLastError() ); } @@ -70,8 +73,12 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) const size_t 
grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - hipLaunchKernelGGL((lambda_hip_forall), - grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_atomic_lambda); + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, daxpy_atomic_lambda ); hipErrchk( hipGetLastError() ); } From 44d5b52175022d285e53bb94df8276ea00d11035 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Dec 2023 13:20:45 -0800 Subject: [PATCH 229/454] Change to GPU kernel launch error checking per review comment. --- src/basic/DAXPY-Cuda.cpp | 2 -- src/basic/DAXPY-Hip.cpp | 2 -- src/common/CudaDataUtils.hpp | 6 +++--- src/common/HipDataUtils.hpp | 6 +++--- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp index 707facc84..2461f8eaf 100644 --- a/src/basic/DAXPY-Cuda.cpp +++ b/src/basic/DAXPY-Cuda.cpp @@ -57,7 +57,6 @@ void DAXPY::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x, a, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -79,7 +78,6 @@ void DAXPY::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 5e717d72d..c437eea15 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -58,7 +58,6 @@ void DAXPY::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x, a, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -80,7 +79,6 @@ void DAXPY::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index de6ea1e4e..69f8aa825 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -64,9 +64,9 @@ void RPlaunchCudaKernel(void (*kernel)(KernArgs...), void* arg_arr[count]{(void*)&args...}; auto k = reinterpret_cast(kernel); - cudaLaunchKernel(k, numBlocks, dimBlocks, - arg_arr, - sharedMemBytes, stream); + cudaErrchk( cudaLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream) ); } /*! diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 20dde8f4e..a668bb807 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -51,9 +51,9 @@ void RPlaunchHipKernel(void (*kernel)(KernArgs...), void* arg_arr[count]{(void*)&args...}; auto k = reinterpret_cast(kernel); - hipLaunchKernel(k, numBlocks, dimBlocks, - arg_arr, - sharedMemBytes, stream); + hipErrchk( hipLaunchKernel(k, numBlocks, dimBlocks, + arg_arr, + sharedMemBytes, stream) ); } /*!
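Taken together, patches 228 through 230 settle on a single launch-and-check pattern: call sites go through the RPlaunchCudaKernel/RPlaunchHipKernel helpers, the helpers wrap the underlying cudaLaunchKernel/hipLaunchKernel call in cudaErrchk/hipErrchk, and the per-call-site cudaGetLastError()/hipGetLastError() checks become redundant and are deleted wholesale in patch 230 below. A minimal standalone sketch of the HIP side of that pattern follows. It is a sketch only: the hipErrchk macro body, the gpuAssert helper, and the launchChecked/daxpy/runDaxpy names are illustrative assumptions (only uses of hipErrchk and the argument-packing lines of RPlaunchHipKernel appear in the patches above), not RAJAPerf code.

  #include <hip/hip_runtime.h>
  #include <cstdio>
  #include <cstdlib>

  // Assumed error-check macro in the conventional form; RAJAPerf defines its
  // own hipErrchk, whose definition is not shown in these patches.
  inline void gpuAssert(hipError_t code, const char* file, int line)
  {
    if (code != hipSuccess) {
      std::fprintf(stderr, "HIP error: %s at %s:%d\n",
                   hipGetErrorString(code), file, line);
      std::exit(EXIT_FAILURE);
    }
  }
  #define hipErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

  // Illustrative kernel standing in for the suite's __global__ kernels.
  __global__ void daxpy(double* y, const double* x, double a, int iend)
  {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < iend) { y[i] += a * x[i]; }
  }

  // Launch helper in the shape of RPlaunchHipKernel (HipDataUtils.hpp hunk
  // above): it owns the error check, so call sites carry no explicit check.
  template < typename... KernArgs >
  void launchChecked(void (*kernel)(KernArgs...),
                     dim3 numBlocks, dim3 dimBlocks,
                     size_t sharedMemBytes, hipStream_t stream,
                     KernArgs... args)
  {
    constexpr size_t count = sizeof...(KernArgs);
    void* arg_arr[count]{(void*)&args...};
    auto k = reinterpret_cast<const void*>(kernel);
    hipErrchk( hipLaunchKernel(k, numBlocks, dimBlocks,
                               arg_arr, sharedMemBytes, stream) );
  }

  void runDaxpy(double* y, const double* x, double a, int iend,
                hipStream_t stream)
  {
    const int block_size = 256;
    const int grid_size = (iend + block_size - 1) / block_size;
    // One call; the trailing hipErrchk( hipGetLastError() ) that patch 230
    // deletes from every call site is no longer needed here.
    launchChecked( daxpy, dim3(grid_size), dim3(block_size),
                   0 /*shmem*/, stream, y, x, a, iend );
  }

One caveat: wrapping the launch call reports launch errors (bad configuration, invalid stream) immediately, but asynchronous execution errors still surface only at the next synchronizing call.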
From 9ac870583288cac9ddb162f100907d88ca5da6c0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Dec 2023 14:29:56 -0800 Subject: [PATCH 230/454] Move GPU kernel error checks into launch methods and remove from calling code --- src/algorithm/MEMCPY-Cuda.cpp | 2 -- src/algorithm/MEMCPY-Hip.cpp | 2 -- src/algorithm/MEMSET-Cuda.cpp | 2 -- src/algorithm/MEMSET-Hip.cpp | 2 -- src/algorithm/REDUCE_SUM-Cuda.cpp | 2 -- src/algorithm/REDUCE_SUM-Hip.cpp | 2 -- src/apps/CONVECTION3DPA-Cuda.cpp | 1 - src/apps/CONVECTION3DPA-Hip.cpp | 1 - src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 -- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 -- src/apps/DIFFUSION3DPA-Cuda.cpp | 1 - src/apps/DIFFUSION3DPA-Hip.cpp | 1 - src/apps/EDGE3D-Cuda.cpp | 2 -- src/apps/EDGE3D-Hip.cpp | 2 -- src/apps/ENERGY-Cuda.cpp | 6 ------ src/apps/ENERGY-Hip.cpp | 6 ------ src/apps/FIR-Cuda.cpp | 2 -- src/apps/FIR-Hip.cpp | 2 -- src/apps/HALOEXCHANGE-Cuda.cpp | 2 -- src/apps/HALOEXCHANGE-Hip.cpp | 2 -- src/apps/HALOEXCHANGE_FUSED-Cuda.cpp | 2 -- src/apps/HALOEXCHANGE_FUSED-Hip.cpp | 2 -- src/apps/LTIMES-Cuda.cpp | 2 -- src/apps/LTIMES-Hip.cpp | 2 -- src/apps/LTIMES_NOVIEW-Cuda.cpp | 2 -- src/apps/LTIMES_NOVIEW-Hip.cpp | 2 -- src/apps/MASS3DEA-Cuda.cpp | 1 - src/apps/MASS3DEA-Hip.cpp | 1 - src/apps/MASS3DPA-Cuda.cpp | 1 - src/apps/MASS3DPA-Hip.cpp | 1 - src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 1 - src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 1 - src/apps/PRESSURE-Cuda.cpp | 2 -- src/apps/PRESSURE-Hip.cpp | 2 -- src/apps/VOL3D-Cuda.cpp | 1 - src/apps/VOL3D-Hip.cpp | 1 - src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp | 1 - src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp | 1 - src/basic/ARRAY_OF_PTRS-Cuda.cpp | 2 -- src/basic/ARRAY_OF_PTRS-Hip.cpp | 2 -- src/basic/COPY8-Cuda.cpp | 2 -- src/basic/COPY8-Hip.cpp | 2 -- src/basic/DAXPY_ATOMIC-Cuda.cpp | 2 -- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 -- src/basic/IF_QUAD-Cuda.cpp | 2 -- src/basic/IF_QUAD-Hip.cpp | 2 -- src/basic/INDEXLIST-Cuda.cpp | 1 - src/basic/INDEXLIST-Hip.cpp | 1 - src/basic/INDEXLIST_3LOOP-Cuda.cpp | 2 -- src/basic/INDEXLIST_3LOOP-Hip.cpp | 2 -- src/basic/INIT3-Cuda.cpp | 2 -- src/basic/INIT3-Hip.cpp | 2 -- src/basic/INIT_VIEW1D-Cuda.cpp | 2 -- src/basic/INIT_VIEW1D-Hip.cpp | 2 -- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 2 -- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 -- src/basic/MAT_MAT_SHARED-Cuda.cpp | 2 -- src/basic/MAT_MAT_SHARED-Hip.cpp | 2 -- src/basic/MULADDSUB-Cuda.cpp | 2 -- src/basic/MULADDSUB-Hip.cpp | 2 -- src/basic/NESTED_INIT-Cuda.cpp | 2 -- src/basic/NESTED_INIT-Hip.cpp | 2 -- src/basic/PI_ATOMIC-Cuda.cpp | 2 -- src/basic/PI_ATOMIC-Hip.cpp | 1 - src/basic/PI_REDUCE-Cuda.cpp | 2 -- src/basic/PI_REDUCE-Hip.cpp | 2 -- src/basic/REDUCE3_INT-Cuda.cpp | 2 -- src/basic/REDUCE3_INT-Hip.cpp | 2 -- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 -- src/basic/REDUCE_STRUCT-Hip.cpp | 2 -- src/basic/TRAP_INT-Cuda.cpp | 2 -- src/basic/TRAP_INT-Hip.cpp | 2 -- src/lcals/DIFF_PREDICT-Cuda.cpp | 1 - src/lcals/DIFF_PREDICT-Hip.cpp | 1 - src/lcals/EOS-Cuda.cpp | 1 - src/lcals/EOS-Hip.cpp | 1 - src/lcals/FIRST_DIFF-Cuda.cpp | 1 - src/lcals/FIRST_DIFF-Hip.cpp | 1 - src/lcals/FIRST_MIN-Cuda.cpp | 2 -- src/lcals/FIRST_MIN-Hip.cpp | 2 -- src/lcals/FIRST_SUM-Cuda.cpp | 1 - src/lcals/FIRST_SUM-Hip.cpp | 1 - src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 -- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 -- src/lcals/HYDRO_1D-Cuda.cpp | 1 - src/lcals/HYDRO_1D-Hip.cpp | 1 - src/lcals/HYDRO_2D-Cuda.cpp | 3 --- src/lcals/HYDRO_2D-Hip.cpp | 3 --- src/lcals/INT_PREDICT-Cuda.cpp | 1 - src/lcals/INT_PREDICT-Hip.cpp | 1 - src/lcals/PLANCKIAN-Cuda.cpp | 1 -
src/lcals/PLANCKIAN-Hip.cpp | 1 - src/lcals/TRIDIAG_ELIM-Cuda.cpp | 1 - src/lcals/TRIDIAG_ELIM-Hip.cpp | 1 - src/polybench/POLYBENCH_2MM-Cuda.cpp | 4 ---- src/polybench/POLYBENCH_2MM-Hip.cpp | 4 ---- src/polybench/POLYBENCH_3MM-Cuda.cpp | 6 ------ src/polybench/POLYBENCH_3MM-Hip.cpp | 6 ------ src/polybench/POLYBENCH_ADI-Cuda.cpp | 4 ---- src/polybench/POLYBENCH_ADI-Hip.cpp | 4 ---- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 4 ---- src/polybench/POLYBENCH_ATAX-Hip.cpp | 4 ---- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 8 -------- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 8 -------- src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 2 -- src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 2 -- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 2 -- src/polybench/POLYBENCH_GEMM-Hip.cpp | 2 -- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 8 -------- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 8 -------- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 1 - src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 1 - src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 4 ---- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 4 ---- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 2 -- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 2 -- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 4 ---- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 4 ---- src/polybench/POLYBENCH_MVT-Cuda.cpp | 2 -- src/polybench/POLYBENCH_MVT-Hip.cpp | 2 -- src/stream/ADD-Cuda.cpp | 2 -- src/stream/ADD-Hip.cpp | 2 -- src/stream/COPY-Cuda.cpp | 2 -- src/stream/COPY-Hip.cpp | 2 -- src/stream/DOT-Cuda.cpp | 2 -- src/stream/DOT-Hip.cpp | 2 -- src/stream/MUL-Cuda.cpp | 2 -- src/stream/MUL-Hip.cpp | 2 -- src/stream/TRIAD-Cuda.cpp | 2 -- src/stream/TRIAD-Hip.cpp | 4 +--- 130 files changed, 1 insertion(+), 288 deletions(-) diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp index df8d24ca7..e8d09119e 100644 --- a/src/algorithm/MEMCPY-Cuda.cpp +++ b/src/algorithm/MEMCPY-Cuda.cpp @@ -96,7 +96,6 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, y, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -118,7 +117,6 @@ void MEMCPY::runCudaVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, memcpy_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp index a8b0f6326..129c3fb18 100644 --- a/src/algorithm/MEMCPY-Hip.cpp +++ b/src/algorithm/MEMCPY-Hip.cpp @@ -96,7 +96,6 @@ void MEMCPY::runHipVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, y, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -118,7 +117,6 @@ void MEMCPY::runHipVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, memcpy_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index e216d0de7..a6dd198c0 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -94,7 +94,6 @@ void MEMSET::runCudaVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, val, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -116,7 +115,6 @@ void MEMSET::runCudaVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, memset_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index b36c9db86..5a78edfeb 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ 
b/src/algorithm/MEMSET-Hip.cpp @@ -94,7 +94,6 @@ void MEMSET::runHipVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, val, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -116,7 +115,6 @@ void MEMSET::runHipVariantBlock(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, memset_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 3e5ed478e..d43fb6d54 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -153,7 +153,6 @@ void REDUCE_SUM::runCudaVariantBlockAtomic(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, sum, m_sum_init, iend ); - cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); @@ -218,7 +217,6 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, sum, m_sum_init, iend ); - cudaErrchk( cudaGetLastError() ); RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 1d354f1bd..b780e9be9 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -180,7 +180,6 @@ void REDUCE_SUM::runHipVariantBlockAtomic(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, sum, m_sum_init, iend ); - hipErrchk( hipGetLastError() ); RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); @@ -245,7 +244,6 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) grid_size, block_size, shmem, res.get_stream(), x, sum, m_sum_init, iend ); - hipErrchk( hipGetLastError() ); RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index 83a0317de..a553624e8 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -148,7 +148,6 @@ void CONVECTION3DPA::runCudaVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), Basis, tBasis, dBasis, D, X, Y ); - cudaErrchk(cudaGetLastError()); } stopTimer(); diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp index bf783bfdd..a3c3c0472 100644 --- a/src/apps/CONVECTION3DPA-Hip.cpp +++ b/src/apps/CONVECTION3DPA-Hip.cpp @@ -148,7 +148,6 @@ void CONVECTION3DPA::runHipVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), Basis, tBasis, dBasis, D, X, Y ); - hipErrchk(hipGetLastError()); } stopTimer(); diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 41a9db07a..43c9aa0a9 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -78,7 +78,6 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) real_zones, half, ptiny, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -102,7 +101,6 @@ void DEL_DOT_VEC_2D::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), ibegin, iend, deldotvec2d_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 0ded3b56e..88e59dac3 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -78,7 +78,6 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) real_zones, half, ptiny, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -102,7 +101,6 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), 
ibegin, iend, deldotvec2d_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 7270dfb97..6b205b70e 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -127,7 +127,6 @@ void DIFFUSION3DPA::runCudaVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), Basis, dBasis, D, X, Y, symmetric ); - cudaErrchk(cudaGetLastError()); } stopTimer(); diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 249fd9721..4a03eec93 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -127,7 +127,6 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), Basis, dBasis, D, X, Y, symmetric ); - hipErrchk(hipGetLastError()); } stopTimer(); diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index ae7c9e394..2a6a64a78 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -74,7 +74,6 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, ibegin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -95,7 +94,6 @@ void EDGE3D::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), ibegin, iend, edge3d_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp index 0baf48312..e8a4f2be3 100644 --- a/src/apps/EDGE3D-Hip.cpp +++ b/src/apps/EDGE3D-Hip.cpp @@ -74,7 +74,6 @@ void EDGE3D::runHipVariantImpl(VariantID vid) y0, y1, y2, y3, y4, y5, y6, y7, z0, z1, z2, z3, z4, z5, z6, z7, ibegin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -95,7 +94,6 @@ void EDGE3D::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), ibegin, iend, edge3d_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index d937826ae..e657a94a3 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -132,7 +132,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) e_new, e_old, delvc, p_old, q_old, work, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (energycalc2), grid_size, block_size, @@ -143,7 +142,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) ql_old, qq_old, rho0, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (energycalc3), grid_size, block_size, @@ -152,7 +150,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) p_old, q_old, pHalfStep, q_new, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (energycalc4), grid_size, block_size, @@ -160,7 +157,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) e_new, work, e_cut, emin, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (energycalc5), grid_size, block_size, @@ -173,7 +169,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) pHalfStep, q_new, rho0, e_cut, emin, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (energycalc6), grid_size, block_size, @@ -185,7 +180,6 @@ void ENERGY::runCudaVariantImpl(VariantID vid) ql_old, qq_old, rho0, q_cut, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 2e88a1921..56a8126ee 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -132,7 +132,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) e_new, e_old, delvc, p_old, q_old, work, iend ); - hipErrchk( hipGetLastError() ); 
RPlaunchHipKernel( (energycalc2), grid_size, block_size, @@ -143,7 +142,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) ql_old, qq_old, rho0, iend ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (energycalc3), grid_size, block_size, @@ -152,7 +150,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) p_old, q_old, pHalfStep, q_new, iend ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (energycalc4), grid_size, block_size, @@ -160,7 +157,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) e_new, work, e_cut, emin, iend ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (energycalc5), grid_size, block_size, @@ -173,7 +169,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) pHalfStep, q_new, rho0, e_cut, emin, iend ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (energycalc6), grid_size, block_size, @@ -185,7 +180,6 @@ void ENERGY::runHipVariantImpl(VariantID vid) ql_old, qq_old, rho0, q_cut, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 2a1725522..bdfd21aa5 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -108,7 +108,6 @@ void FIR::runCudaVariantImpl(VariantID vid) out, in, coefflen, iend ); - cudaErrchk( cudaGetLastError() ); #else RPlaunchCudaKernel( (fir), grid_size, block_size, @@ -117,7 +116,6 @@ void FIR::runCudaVariantImpl(VariantID vid) coeff, coefflen, iend ); - udaErrchk( cudaGetLastError() ); #endif } diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index f075b3c9c..7d9db27a7 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -106,7 +106,6 @@ void FIR::runHipVariantImpl(VariantID vid) out, in, coefflen, iend ); - hipErrchk( hipGetLastError() ); #else RPlaunchHipKernel( (fir), grid_size, block_size, @@ -115,7 +114,6 @@ void FIR::runHipVariantImpl(VariantID vid) coeff, coefflen, iend ); - hipErrchk( hipGetLastError() ); #endif } diff --git a/src/apps/HALOEXCHANGE-Cuda.cpp b/src/apps/HALOEXCHANGE-Cuda.cpp index 0919f98d2..e3abc406f 100644 --- a/src/apps/HALOEXCHANGE-Cuda.cpp +++ b/src/apps/HALOEXCHANGE-Cuda.cpp @@ -73,7 +73,6 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); - cudaErrchk( cudaGetLastError() ); buffer += len; } } @@ -92,7 +91,6 @@ void HALOEXCHANGE::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); - cudaErrchk( cudaGetLastError() ); buffer += len; } } diff --git a/src/apps/HALOEXCHANGE-Hip.cpp b/src/apps/HALOEXCHANGE-Hip.cpp index 4a5ecfc42..c58dcdbfb 100644 --- a/src/apps/HALOEXCHANGE-Hip.cpp +++ b/src/apps/HALOEXCHANGE-Hip.cpp @@ -73,7 +73,6 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); - hipErrchk( hipGetLastError() ); buffer += len; } } @@ -92,7 +91,6 @@ void HALOEXCHANGE::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), buffer, list, var, len ); - hipErrchk( hipGetLastError() ); buffer += len; } } diff --git a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp index 4535d5bd9..2b0d675ca 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Cuda.cpp @@ -138,7 +138,6 @@ void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) pack_list_ptrs, pack_var_ptrs, pack_len_ptrs ); - cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; @@ 
-171,7 +170,6 @@ void HALOEXCHANGE_FUSED::runCudaVariantDirect(VariantID vid) unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs ); - cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); } diff --git a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp index 3fd78f925..dd078cb9e 100644 --- a/src/apps/HALOEXCHANGE_FUSED-Hip.cpp +++ b/src/apps/HALOEXCHANGE_FUSED-Hip.cpp @@ -138,7 +138,6 @@ void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) pack_list_ptrs, pack_var_ptrs, pack_len_ptrs ); - hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); Index_type unpack_index = 0; @@ -171,7 +170,6 @@ void HALOEXCHANGE_FUSED::runHipVariantDirect(VariantID vid) unpack_list_ptrs, unpack_var_ptrs, unpack_len_ptrs ); - hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); } diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 7173a0dcf..b059cb1c6 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -97,7 +97,6 @@ void LTIMES::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), phidat, elldat, psidat, num_d, num_m, num_g, num_z ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -125,7 +124,6 @@ void LTIMES::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), num_m, num_g, num_z, ltimes_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 12e619259..de0bb44ac 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -96,7 +96,6 @@ void LTIMES::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), phidat, elldat, psidat, num_d, num_m, num_g, num_z ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -124,7 +123,6 @@ void LTIMES::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), num_m, num_g, num_z, ltimes_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 8a34f32e4..f201a4980 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -96,7 +96,6 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), phidat, elldat, psidat, num_d, num_m, num_g, num_z ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -124,7 +123,6 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), num_m, num_g, num_z, ltimes_noview_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 2549192a8..3c3d39469 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -96,7 +96,6 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), phidat, elldat, psidat, num_d, num_m, num_g, num_z ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -124,7 +123,6 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), num_m, num_g, num_z, ltimes_noview_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index c1eb62e84..acdd029b9 100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -79,7 +79,6 @@ void MASS3DEA::runCudaVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), B, D, M ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 
831a0434d..02809f270 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -79,7 +79,6 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), B, D, M ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 0b7d5ec56..1b51c8e59 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -109,7 +109,6 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, X, Y ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 9dd759423..7b07522b9 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -109,7 +109,6 @@ void MASS3DPA::runHipVariantImpl(VariantID vid) { NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, X, Y ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp index 670a5e03a..819aaf680 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp @@ -68,7 +68,6 @@ void NODAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp index 66297261f..33eee5dc1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp @@ -68,7 +68,6 @@ void NODAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp index 151b83890..bfb5e78df 100644 --- a/src/apps/PRESSURE-Cuda.cpp +++ b/src/apps/PRESSURE-Cuda.cpp @@ -72,7 +72,6 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), bvc, compression, cls, iend ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (pressurecalc2), grid_size, block_size, @@ -81,7 +80,6 @@ void PRESSURE::runCudaVariantImpl(VariantID vid) vnewc, p_cut, eosvmax, pmin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp index bf69c3815..ad7ff6bd9 100644 --- a/src/apps/PRESSURE-Hip.cpp +++ b/src/apps/PRESSURE-Hip.cpp @@ -72,7 +72,6 @@ void PRESSURE::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), bvc, compression, cls, iend ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (pressurecalc2), grid_size, block_size, @@ -81,7 +80,6 @@ void PRESSURE::runHipVariantImpl(VariantID vid) vnewc, p_cut, eosvmax, pmin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp index 2e34858a7..cc9cf6da6 100644 --- a/src/apps/VOL3D-Cuda.cpp +++ b/src/apps/VOL3D-Cuda.cpp @@ -77,7 +77,6 @@ void VOL3D::runCudaVariantImpl(VariantID vid) z0, z1, z2, z3, z4, z5, z6, z7, vnormq, ibegin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp index 207718691..027ea2c22 100644 --- a/src/apps/VOL3D-Hip.cpp +++ b/src/apps/VOL3D-Hip.cpp @@ -77,7 +77,6 @@ void VOL3D::runHipVariantImpl(VariantID vid) z0, z1, z2, z3, z4, z5, z6, z7, vnormq, ibegin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git 
a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp index cc7180d80..e81347af5 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp @@ -68,7 +68,6 @@ void ZONAL_ACCUMULATION_3D::runCudaVariantImpl(VariantID vid) x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp index acf838d64..d182fdfa4 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp @@ -68,7 +68,6 @@ void ZONAL_ACCUMULATION_3D::runHipVariantImpl(VariantID vid) x0, x1, x2, x3, x4, x5, x6, x7, real_zones, ibegin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-Cuda.cpp b/src/basic/ARRAY_OF_PTRS-Cuda.cpp index 1df18e4bb..ca3b8360f 100644 --- a/src/basic/ARRAY_OF_PTRS-Cuda.cpp +++ b/src/basic/ARRAY_OF_PTRS-Cuda.cpp @@ -59,7 +59,6 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x_array, array_size, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -81,7 +80,6 @@ void ARRAY_OF_PTRS::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, array_of_ptrs_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp index 4c3f39d9f..cb1336058 100644 --- a/src/basic/ARRAY_OF_PTRS-Hip.cpp +++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp @@ -59,7 +59,6 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x_array, array_size, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -81,7 +80,6 @@ void ARRAY_OF_PTRS::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, array_of_ptrs_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/COPY8-Cuda.cpp b/src/basic/COPY8-Cuda.cpp index b29360256..202bcbc54 100644 --- a/src/basic/COPY8-Cuda.cpp +++ b/src/basic/COPY8-Cuda.cpp @@ -61,7 +61,6 @@ void COPY8::runCudaVariantImpl(VariantID vid) y0, y1, y2, y3, y4, y5, y6, y7, x0, x1, x2, x3, x4, x5, x6, x7, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -83,7 +82,6 @@ void COPY8::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy8_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/COPY8-Hip.cpp b/src/basic/COPY8-Hip.cpp index 2c6fa5781..9c3054611 100644 --- a/src/basic/COPY8-Hip.cpp +++ b/src/basic/COPY8-Hip.cpp @@ -62,7 +62,6 @@ void COPY8::runHipVariantImpl(VariantID vid) y0, y1, y2, y3, y4, y5, y6, y7, x0, x1, x2, x3, x4, x5, x6, x7, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -84,7 +83,6 @@ void COPY8::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy8_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp index 50b46bf8d..df4776ce7 100644 --- a/src/basic/DAXPY_ATOMIC-Cuda.cpp +++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp @@ -57,7 +57,6 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x, a, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -79,7 +78,6 @@ void DAXPY_ATOMIC::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, 
res.get_stream(), ibegin, iend, daxpy_atomic_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 4dac76c2b..93f6b4d0d 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -57,7 +57,6 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), y, x, a, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -79,7 +78,6 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, daxpy_atomic_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp index 01354199b..d110a036a 100644 --- a/src/basic/IF_QUAD-Cuda.cpp +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -60,7 +60,6 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) x1, x2, a, b, c, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -81,7 +80,6 @@ void IF_QUAD::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, ifquad_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 4a0192094..4cd5b482b 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -60,7 +60,6 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) x1, x2, a, b, c, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, ifquad_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 3a682cfe9..131b74f7a 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -276,7 +276,6 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) x+ibegin, list+ibegin, block_counts, grid_counts, block_readys, len, iend-ibegin ); - cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize( res.get_stream() ) ); m_len = *len; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 84c02c045..9c4bff838 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -277,7 +277,6 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) x+ibegin, list+ibegin, block_counts, grid_counts, block_readys, len, iend-ibegin ); - hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize( res.get_stream() ) ); m_len = *len; diff --git a/src/basic/INDEXLIST_3LOOP-Cuda.cpp b/src/basic/INDEXLIST_3LOOP-Cuda.cpp index 2195eb380..e95524201 100644 --- a/src/basic/INDEXLIST_3LOOP-Cuda.cpp +++ b/src/basic/INDEXLIST_3LOOP-Cuda.cpp @@ -106,7 +106,6 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, stream, x, counts, iend ); - cudaErrchk( cudaGetLastError() ); cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, @@ -121,7 +120,6 @@ void INDEXLIST_3LOOP::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, stream, list, counts, len, iend ); - cudaErrchk( cudaGetLastError() ); cudaErrchk( cudaStreamSynchronize(stream) ); m_len = *len; diff --git a/src/basic/INDEXLIST_3LOOP-Hip.cpp b/src/basic/INDEXLIST_3LOOP-Hip.cpp index d49cd98e4..e1de399a0 100644 --- a/src/basic/INDEXLIST_3LOOP-Hip.cpp +++ b/src/basic/INDEXLIST_3LOOP-Hip.cpp @@ -117,7 +117,6 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, stream, x, 
counts, iend ); - hipErrchk( hipGetLastError() ); #if defined(__HIPCC__) hipErrchk(::rocprim::exclusive_scan(d_temp_storage, @@ -143,7 +142,6 @@ void INDEXLIST_3LOOP::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, stream, list, counts, len, iend ); - hipErrchk( hipGetLastError() ); hipErrchk( hipStreamSynchronize(stream) ); m_len = *len; diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp index f30f74655..898d3b28b 100644 --- a/src/basic/INIT3-Cuda.cpp +++ b/src/basic/INIT3-Cuda.cpp @@ -60,7 +60,6 @@ void INIT3::runCudaVariantImpl(VariantID vid) out1, out2, out3, in1, in2, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void INIT3::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, init3_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index c22d0b419..c3b67130f 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -60,7 +60,6 @@ void INIT3::runHipVariantImpl(VariantID vid) out1, out2, out3, in1, in2, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void INIT3::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, init3_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp index 828c87fb4..747bceb6d 100644 --- a/src/basic/INIT_VIEW1D-Cuda.cpp +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -58,7 +58,6 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, v, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -80,7 +79,6 @@ void INIT_VIEW1D::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 8ef65ddb1..b7db081ec 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -58,7 +58,6 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, v, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -80,7 +79,6 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp index e973d6328..822de3c70 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -60,7 +60,6 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), a, v, ibegin, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void INIT_VIEW1D_OFFSET::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_offset_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2300fec69..83b1ae339 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -60,7 +60,6 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), a, v, ibegin, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) 
grid_size, block_size, shmem, res.get_stream(), ibegin, iend, initview1d_offset_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index ee77d0b22..23e317815 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -77,7 +77,6 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) gridDim, blockDim, shmem, res.get_stream(), N, C, A, B ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -181,7 +180,6 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) gridDim, blockDim, shmem, res.get_stream(), mat_mat_shared_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index 789698670..acf08168b 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -77,7 +77,6 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) gridDim, blockDim, shmem, res.get_stream(), N, C, A, B ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -181,7 +180,6 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) gridDim, blockDim, shmem, res.get_stream(), mat_mat_shared_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp index f999394a8..260189990 100644 --- a/src/basic/MULADDSUB-Cuda.cpp +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -60,7 +60,6 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) out1, out2, out3, in1, in2, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void MULADDSUB::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, muladdsub_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index cc199ef17..9e4d0d741 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -60,7 +60,6 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) out1, out2, out3, in1, in2, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -82,7 +81,6 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, muladdsub_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp index 4c4323477..3cefef7de 100644 --- a/src/basic/NESTED_INIT-Cuda.cpp +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -94,7 +94,6 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), array, ni, nj, nk ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -121,7 +120,6 @@ void NESTED_INIT::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), ni, nj, nk, nested_init_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp index 796665bdd..20ce5c382 100644 --- a/src/basic/NESTED_INIT-Hip.cpp +++ b/src/basic/NESTED_INIT-Hip.cpp @@ -94,7 +94,6 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), array, ni, nj, nk ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -121,7 +120,6 @@ void NESTED_INIT::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), ni, nj, nk, nested_init_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index 0ffebe002..0c57353ed 
100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -65,7 +65,6 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) pi, dx, iend ); - cudaErrchk( cudaGetLastError() ); Real_type rpi; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); @@ -94,7 +93,6 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, pi_atomic_lambda ); - cudaErrchk( cudaGetLastError() ); Real_type rpi; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 27e428c0c..2033e01b8 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -65,7 +65,6 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) pi, dx, iend ); - hipErrchk( hipGetLastError() ); Real_type rpi; RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 4a8ae56d2..0b35b39c2 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -83,7 +83,6 @@ void PI_REDUCE::runCudaVariantBlockAtomic(VariantID vid) dx, pi, m_pi_init, iend ); - cudaErrchk( cudaGetLastError() ); Real_type rpi; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); @@ -149,7 +148,6 @@ void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) dx, pi, m_pi_init, iend ); - cudaErrchk( cudaGetLastError() ); Real_type rpi; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index e4e11c218..a56d2174a 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -83,7 +83,6 @@ void PI_REDUCE::runHipVariantBlockAtomic(VariantID vid) dx, pi, m_pi_init, iend ); - hipErrchk( hipGetLastError() ); Real_type rpi; RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); @@ -149,7 +148,6 @@ void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) dx, pi, m_pi_init, iend ); - hipErrchk( hipGetLastError() ); Real_type rpi; RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 3e4781433..e46f2fd70 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -99,7 +99,6 @@ void REDUCE3_INT::runCudaVariantBlockAtomic(VariantID vid) vmem + 1, m_vmin_init, vmem + 2, m_vmax_init, iend ); - cudaErrchk( cudaGetLastError() ); Int_type rvmem[3]; RAJAPERF_CUDA_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); @@ -174,7 +173,6 @@ void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) vmem + 1, m_vmin_init, vmem + 2, m_vmax_init, iend ); - cudaErrchk( cudaGetLastError() ); Int_type rvmem[3]; RAJAPERF_CUDA_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index b8b59a034..dfeb9d085 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -99,7 +99,6 @@ void REDUCE3_INT::runHipVariantBlockAtomic(VariantID vid) vmem + 1, m_vmin_init, vmem + 2, m_vmax_init, iend ); - hipErrchk( hipGetLastError() ); Int_type rvmem[3]; RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); @@ -174,7 +173,6 @@ void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) vmem + 1, m_vmin_init, vmem + 2, m_vmax_init, iend ); - hipErrchk( hipGetLastError() ); Int_type rvmem[3]; RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 92f7fcb1d..677084d1b 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ 
b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -130,7 +130,6 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomic(VariantID vid) mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, points.N ); - cudaErrchk( cudaGetLastError() ); Real_type rmem[6]; RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); @@ -216,7 +215,6 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, points.N ); - cudaErrchk( cudaGetLastError() ); Real_type rmem[6]; RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 9bede2bc1..451f8b1a4 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -131,7 +131,6 @@ void REDUCE_STRUCT::runHipVariantBlockAtomic(VariantID vid) mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, points.N ); - hipErrchk( hipGetLastError() ); Real_type rmem[6]; RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); @@ -216,7 +215,6 @@ void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) mem+3, mem+4, mem+5, // ycenter,ymin,ymax m_init_sum, m_init_min, m_init_max, points.N ); - hipErrchk( hipGetLastError() ); Real_type rmem[6]; RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 8bf85508d..e4010431b 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -104,7 +104,6 @@ void TRAP_INT::runCudaVariantBlockAtomic(VariantID vid) h, sumx, iend); - cudaErrchk( cudaGetLastError() ); Real_type rsumx; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); @@ -172,7 +171,6 @@ void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) h, sumx, iend); - cudaErrchk( cudaGetLastError() ); Real_type rsumx; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 94b791f04..aff820afc 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -104,7 +104,6 @@ void TRAP_INT::runHipVariantBlockAtomic(VariantID vid) h, sumx, iend); - hipErrchk( hipGetLastError() ); Real_type rsumx; RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); @@ -172,7 +171,6 @@ void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) h, sumx, iend); - hipErrchk( hipGetLastError() ); Real_type rsumx; RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index e51370454..a7c42aa0e 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -57,7 +57,6 @@ void DIFF_PREDICT::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), px, cx, offset, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 9c2967a1a..7a01975cb 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -58,7 +58,6 @@ void DIFF_PREDICT::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), px, cx, offset, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp index 8483d71bc..1a6b3eb43 100644 --- a/src/lcals/EOS-Cuda.cpp +++ b/src/lcals/EOS-Cuda.cpp @@ -59,7 +59,6 @@ void EOS::runCudaVariantImpl(VariantID vid) x, y, z, u, q, r, t, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff 
--git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp
index 22610f813..2f6079fbb 100644
--- a/src/lcals/EOS-Hip.cpp
+++ b/src/lcals/EOS-Hip.cpp
@@ -59,7 +59,6 @@ void EOS::runHipVariantImpl(VariantID vid)
 x, y, z, u, q, r, t, iend );
- hipErrchk( hipGetLastError() );
 } stopTimer();
diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp
index 3731b1afe..8daf8e571 100644
--- a/src/lcals/FIRST_DIFF-Cuda.cpp
+++ b/src/lcals/FIRST_DIFF-Cuda.cpp
@@ -57,7 +57,6 @@ void FIRST_DIFF::runCudaVariantImpl(VariantID vid)
 shmem, res.get_stream(), x, y, iend );
- cudaErrchk( cudaGetLastError() );
 } stopTimer();
diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp
index 770c62081..1768e7851 100644
--- a/src/lcals/FIRST_DIFF-Hip.cpp
+++ b/src/lcals/FIRST_DIFF-Hip.cpp
@@ -57,7 +57,6 @@ void FIRST_DIFF::runHipVariantImpl(VariantID vid)
 shmem, res.get_stream(), x, y, iend );
- hipErrchk( hipGetLastError() );
 } stopTimer();
diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp
index 1a29d3052..282464f88 100644
--- a/src/lcals/FIRST_MIN-Cuda.cpp
+++ b/src/lcals/FIRST_MIN-Cuda.cpp
@@ -87,7 +87,6 @@ void FIRST_MIN::runCudaVariantBlockHost(VariantID vid)
 shmem, res.get_stream(), x, dminloc, mymin, iend );
- cudaErrchk( cudaGetLastError() );
 RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
 for (Index_type i = 0; i < static_cast<Index_type>(grid_size); i++) {
@@ -173,7 +172,6 @@ void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid)
 shmem, res.get_stream(), x, dminloc, mymin, iend );
- cudaErrchk( cudaGetLastError() );
 RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
 for (Index_type i = 0; i < static_cast<Index_type>(grid_size); i++) {
diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp
index f8592fdca..75677e07f 100644
--- a/src/lcals/FIRST_MIN-Hip.cpp
+++ b/src/lcals/FIRST_MIN-Hip.cpp
@@ -87,7 +87,6 @@ void FIRST_MIN::runHipVariantBlockHost(VariantID vid)
 shmem, res.get_stream(), x, dminloc, mymin, iend );
- hipErrchk( hipGetLastError() );
 RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
 for (Index_type i = 0; i < static_cast<Index_type>(grid_size); i++) {
@@ -173,7 +172,6 @@ void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid)
 shmem, res.get_stream(), x, dminloc, mymin, iend );
- hipErrchk( hipGetLastError() );
 RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size);
 for (Index_type i = 0; i < static_cast<Index_type>(grid_size); i++) {
diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp
index 7a29a3154..9869ccf80 100644
--- a/src/lcals/FIRST_SUM-Cuda.cpp
+++ b/src/lcals/FIRST_SUM-Cuda.cpp
@@ -57,7 +57,6 @@ void FIRST_SUM::runCudaVariantImpl(VariantID vid)
 shmem, res.get_stream(), x, y, iend );
- cudaErrchk( cudaGetLastError() );
 } stopTimer();
diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp
index 7e35bfefd..9d8258a28 100644
--- a/src/lcals/FIRST_SUM-Hip.cpp
+++ b/src/lcals/FIRST_SUM-Hip.cpp
@@ -57,7 +57,6 @@ void FIRST_SUM::runHipVariantImpl(VariantID vid)
 shmem, res.get_stream(), x, y, iend );
- hipErrchk( hipGetLastError() );
 } stopTimer();
diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
index a049f6f10..9569f020b 100644
--- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
@@ -73,7 +73,6 @@ void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid)
 sa, sb, kb5i, N );
- cudaErrchk( cudaGetLastError() );
 const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size);
@@ -84,7 +83,6 @@
void GEN_LIN_RECUR::runCudaVariantImpl(VariantID vid) sa, sb, kb5i, N ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp index 35cb3fd2a..86eb64884 100644 --- a/src/lcals/GEN_LIN_RECUR-Hip.cpp +++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp @@ -73,7 +73,6 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) sa, sb, kb5i, N ); - hipErrchk( hipGetLastError() ); const size_t grid_size2 = RAJA_DIVIDE_CEILING_INT(N+1, block_size); @@ -84,7 +83,6 @@ void GEN_LIN_RECUR::runHipVariantImpl(VariantID vid) sa, sb, kb5i, N ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp index 3d5a66f1e..66f038750 100644 --- a/src/lcals/HYDRO_1D-Cuda.cpp +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -59,7 +59,6 @@ void HYDRO_1D::runCudaVariantImpl(VariantID vid) x, y, z, q, r, t, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp index 078f6269f..c79786d21 100644 --- a/src/lcals/HYDRO_1D-Hip.cpp +++ b/src/lcals/HYDRO_1D-Hip.cpp @@ -59,7 +59,6 @@ void HYDRO_1D::runHipVariantImpl(VariantID vid) x, y, z, q, r, t, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp index be8b37afa..d10875074 100644 --- a/src/lcals/HYDRO_2D-Cuda.cpp +++ b/src/lcals/HYDRO_2D-Cuda.cpp @@ -118,7 +118,6 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) zpdat, zqdat, zrdat, zmdat, jn, kn); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (hydro_2d2), nblocks, nthreads_per_block, @@ -128,7 +127,6 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) zzdat, zrdat, s, jn, kn); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (hydro_2d3), nblocks, nthreads_per_block, @@ -138,7 +136,6 @@ void HYDRO_2D::runCudaVariantImpl(VariantID vid) zzdat, zvdat, t, jn, kn); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp index e737ff278..aeb4bdf29 100644 --- a/src/lcals/HYDRO_2D-Hip.cpp +++ b/src/lcals/HYDRO_2D-Hip.cpp @@ -118,7 +118,6 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) zpdat, zqdat, zrdat, zmdat, jn, kn); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (hydro_2d2), nblocks, nthreads_per_block, @@ -128,7 +127,6 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) zzdat, zrdat, s, jn, kn); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (hydro_2d3), nblocks, nthreads_per_block, @@ -138,7 +136,6 @@ void HYDRO_2D::runHipVariantImpl(VariantID vid) zzdat, zvdat, t, jn, kn); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp index ba05ea81f..b3c70cdda 100644 --- a/src/lcals/INT_PREDICT-Cuda.cpp +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -65,7 +65,6 @@ void INT_PREDICT::runCudaVariantImpl(VariantID vid) dm28, c0, offset, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp index 301c54f94..ccf226523 100644 --- a/src/lcals/INT_PREDICT-Hip.cpp +++ b/src/lcals/INT_PREDICT-Hip.cpp @@ -65,7 +65,6 @@ void INT_PREDICT::runHipVariantImpl(VariantID vid) dm28, c0, offset, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp index 56011beb5..68265ab4e 100644 --- a/src/lcals/PLANCKIAN-Cuda.cpp +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -60,7 +60,6 @@ void 
PLANCKIAN::runCudaVariantImpl(VariantID vid) x, y, u, v, w, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp index c73468bc2..b345e0ec3 100644 --- a/src/lcals/PLANCKIAN-Hip.cpp +++ b/src/lcals/PLANCKIAN-Hip.cpp @@ -60,7 +60,6 @@ void PLANCKIAN::runHipVariantImpl(VariantID vid) x, y, u, v, w, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp index 21831fb9e..691817124 100644 --- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp +++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp @@ -59,7 +59,6 @@ void TRIDIAG_ELIM::runCudaVariantImpl(VariantID vid) xout, xin, y, z, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp index fa07a9f34..9afd6bb44 100644 --- a/src/lcals/TRIDIAG_ELIM-Hip.cpp +++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp @@ -59,7 +59,6 @@ void TRIDIAG_ELIM::runHipVariantImpl(VariantID vid) xout, xin, y, z, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp index 99fa4b1ff..5cb0f48c8 100644 --- a/src/polybench/POLYBENCH_2MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -133,7 +133,6 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) tmp, A, B, alpha, ni, nj, nk ); - cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; @@ -144,7 +143,6 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) tmp, C, D, beta, ni, nl, nj ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -173,7 +171,6 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) nblocks1, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_2mm_1_lambda ); - cudaErrchk( cudaGetLastError() ); POLY_2MM_2_NBLOCKS_CUDA; @@ -191,7 +188,6 @@ void POLYBENCH_2MM::runCudaVariantImpl(VariantID vid) nblocks2, nthreads_per_block, shmem, res.get_stream(), ni, nl, poly_2mm_2_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp index 08b906cf3..5dac8fa96 100644 --- a/src/polybench/POLYBENCH_2MM-Hip.cpp +++ b/src/polybench/POLYBENCH_2MM-Hip.cpp @@ -133,7 +133,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) tmp, A, B, alpha, ni, nj, nk ); - hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; @@ -144,7 +143,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) tmp, C, D, beta, ni, nl, nj ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -173,7 +171,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) nblocks1, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_2mm_1_lambda ); - hipErrchk( hipGetLastError() ); POLY_2MM_2_NBLOCKS_HIP; @@ -191,7 +188,6 @@ void POLYBENCH_2MM::runHipVariantImpl(VariantID vid) nblocks2, nthreads_per_block, shmem, res.get_stream(), ni, nl, poly_2mm_2_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp index 05f81f7ca..e20c71aa2 100644 --- a/src/polybench/POLYBENCH_3MM-Cuda.cpp +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -166,7 +166,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), E, A, B, ni, nj, nk ); - cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; @@ -176,7 +175,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), F, C, D, nj, nl, nm ); - cudaErrchk( cudaGetLastError() ); 
POLY_3MM_3_NBLOCKS_CUDA; @@ -186,7 +184,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), G, E, F, ni, nl, nj ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -215,7 +212,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) nblocks1, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_3mm_1_lambda ); - cudaErrchk( cudaGetLastError() ); POLY_3MM_2_NBLOCKS_CUDA; @@ -233,7 +229,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) nblocks2, nthreads_per_block, shmem, res.get_stream(), nj, nl, poly_3mm_2_lambda ); - cudaErrchk( cudaGetLastError() ); POLY_3MM_3_NBLOCKS_CUDA; @@ -251,7 +246,6 @@ void POLYBENCH_3MM::runCudaVariantImpl(VariantID vid) nblocks3, nthreads_per_block, shmem, res.get_stream(), ni, nl, poly_3mm_3_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp index 295ad3293..a6fd1ba59 100644 --- a/src/polybench/POLYBENCH_3MM-Hip.cpp +++ b/src/polybench/POLYBENCH_3MM-Hip.cpp @@ -165,7 +165,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), E, A, B, ni, nj, nk ); - hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; @@ -175,7 +174,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), F, C, D, nj, nl, nm ); - hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; @@ -185,7 +183,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), G, E, F, ni, nl, nj ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -214,7 +211,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) nblocks1, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_3mm_1_lambda ); - hipErrchk( hipGetLastError() ); POLY_3MM_2_NBLOCKS_HIP; @@ -232,7 +228,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) nblocks2, nthreads_per_block, shmem, res.get_stream(), nj, nl, poly_3mm_2_lambda ); - hipErrchk( hipGetLastError() ); POLY_3MM_3_NBLOCKS_HIP; @@ -250,7 +245,6 @@ void POLYBENCH_3MM::runHipVariantImpl(VariantID vid) nblocks3, nthreads_per_block, shmem, res.get_stream(), ni, nl, poly_3mm_3_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp index cc175291f..a43da0efc 100644 --- a/src/polybench/POLYBENCH_ADI-Cuda.cpp +++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp @@ -99,7 +99,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) a, b, c, d, f, P, Q, U, V ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_adi2), grid_size, block_size, @@ -108,7 +107,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) a, c, d, e, f, P, Q, U, V ); - cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -141,7 +139,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_adi1_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_adi2_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; @@ -159,7 +156,6 @@ void POLYBENCH_ADI::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_adi2_lambda ); - cudaErrchk( cudaGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp index 39e10dc62..151d0d81c 100644 --- a/src/polybench/POLYBENCH_ADI-Hip.cpp +++ b/src/polybench/POLYBENCH_ADI-Hip.cpp @@ -99,7 +99,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) a, b, c, d, f, P, Q, U, V ); - hipErrchk( 
hipGetLastError() ); RPlaunchHipKernel( (poly_adi2), grid_size, block_size, @@ -108,7 +107,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) a, c, d, e, f, P, Q, U, V ); - hipErrchk( hipGetLastError() ); } // tstep loop @@ -141,7 +139,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_adi1_lambda ); - hipErrchk( hipGetLastError() ); auto poly_adi2_lambda = [=] __device__ (Index_type i) { POLYBENCH_ADI_BODY6; @@ -159,7 +156,6 @@ void POLYBENCH_ADI::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_adi2_lambda ); - hipErrchk( hipGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp index 6309b1590..e244b08db 100644 --- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp +++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp @@ -88,14 +88,12 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), A, x, y, tmp, N ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_atax_2), grid_size, block_size, shmem, res.get_stream(), A, tmp, y, N ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -121,7 +119,6 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), N, poly_atax1_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_atax2_lambda = [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; @@ -136,7 +133,6 @@ void POLYBENCH_ATAX::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), N, poly_atax2_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp index f28cb4dec..f918d355e 100644 --- a/src/polybench/POLYBENCH_ATAX-Hip.cpp +++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp @@ -88,14 +88,12 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), A, x, y, tmp, N ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_atax_2), grid_size, block_size, shmem, res.get_stream(), A, tmp, y, N ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -121,7 +119,6 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), N, poly_atax1_lambda ); - hipErrchk( hipGetLastError() ); auto poly_atax2_lambda = [=] __device__ (Index_type j) { POLYBENCH_ATAX_BODY4; @@ -136,7 +133,6 @@ void POLYBENCH_ATAX::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), N, poly_atax2_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp index f7237cc08..51c1f1f90 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp @@ -165,7 +165,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) grid_size1, block_size, shmem, res.get_stream(), ey, fict, ny, t ); - cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; @@ -175,21 +174,18 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), ey, hz, nx, ny ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_fdtd2d_3), nblocks234, nthreads_per_block234, shmem, res.get_stream(), ex, hz, nx, ny ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_fdtd2d_4), nblocks234, nthreads_per_block234, shmem, res.get_stream(), hz, ex, ey, nx, ny ); - 
cudaErrchk( cudaGetLastError() ); } // tstep loop @@ -216,7 +212,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) grid_size1, block_size, shmem, res.get_stream(), ny, poly_fdtd2d_1_lambda ); - cudaErrchk( cudaGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_CUDA; FDTD_2D_NBLOCKS_CUDA; @@ -232,7 +227,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_2_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -245,7 +239,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_3_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -258,7 +251,6 @@ void POLYBENCH_FDTD_2D::runCudaVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_4_lambda ); - cudaErrchk( cudaGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp index b6a2e21d8..4b5e4c4ec 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp @@ -164,7 +164,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) grid_size1, block_size, shmem, res.get_stream(), ey, fict, ny, t ); - hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; @@ -174,21 +173,18 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), ey, hz, nx, ny ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_fdtd2d_3), nblocks234, nthreads_per_block234, shmem, res.get_stream(), ex, hz, nx, ny ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_fdtd2d_4), nblocks234, nthreads_per_block234, shmem, res.get_stream(), hz, ex, ey, nx, ny ); - hipErrchk( hipGetLastError() ); } // tstep loop @@ -215,7 +211,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) grid_size1, block_size, shmem, res.get_stream(), ny, poly_fdtd2d_1_lambda ); - hipErrchk( hipGetLastError() ); FDTD_2D_THREADS_PER_BLOCK_HIP; FDTD_2D_NBLOCKS_HIP; @@ -231,7 +226,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_2_lambda ); - hipErrchk( hipGetLastError() ); auto poly_fdtd2d_3_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -244,7 +238,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_3_lambda ); - hipErrchk( hipGetLastError() ); auto poly_fdtd2d_4_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -257,7 +250,6 @@ void POLYBENCH_FDTD_2D::runHipVariantImpl(VariantID vid) nblocks234, nthreads_per_block234, shmem, res.get_stream(), nx, ny, poly_fdtd2d_4_lambda ); - hipErrchk( hipGetLastError() ); } // tstep loop diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp index d45726e20..87ebd0adf 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp @@ -90,7 +90,6 @@ void POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), pout, pin, k, N ); - cudaErrchk( cudaGetLastError() ); } @@ -119,7 +118,6 @@ void 
POLYBENCH_FLOYD_WARSHALL::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_floyd_warshall_lambda ); - cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp index cb67abc1e..144abf695 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -90,7 +90,6 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), pout, pin, k, N ); - hipErrchk( hipGetLastError() ); } @@ -119,7 +118,6 @@ void POLYBENCH_FLOYD_WARSHALL::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_floyd_warshall_lambda ); - hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp index b64c5a221..a1cd2ec2f 100644 --- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp @@ -97,7 +97,6 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) C, A, B, alpha, beta, ni, nj, nk ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -126,7 +125,6 @@ void POLYBENCH_GEMM::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_gemm_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp index fcbd22e88..ce1b73c0d 100644 --- a/src/polybench/POLYBENCH_GEMM-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp @@ -97,7 +97,6 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) C, A, B, alpha, beta, ni, nj, nk ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -126,7 +125,6 @@ void POLYBENCH_GEMM::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), ni, nj, poly_gemm_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp index 4f31f14c8..0b02c7e57 100644 --- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp @@ -145,7 +145,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) nblocks1, nthreads_per_block1, shmem, res.get_stream(), A, u1, v1, u2, v2, n ); - cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -153,19 +152,16 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, x, y, beta, n ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_gemver_3), grid_size, block_size, shmem, res.get_stream(), x, z, n ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_gemver_4), grid_size, block_size, shmem, res.get_stream(), A, x, w, alpha, n ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -189,7 +185,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) nblocks1, nthreads_per_block1, shmem, res.get_stream(), n, poly_gemver1_lambda ); - cudaErrchk( cudaGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -206,7 +201,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_gemver2_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_gemver3_lambda = [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY5; @@ -217,7 +211,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) grid_size, block_size, 
shmem, res.get_stream(), n, poly_gemver3_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_gemver4_lambda = [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY6; @@ -232,7 +225,6 @@ void POLYBENCH_GEMVER::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_gemver4_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp index 008149c29..5d1fa4be3 100644 --- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp @@ -145,7 +145,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) nblocks1, nthreads_per_block1, shmem, res.get_stream(), A, u1, v1, u2, v2, n ); - hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n, block_size); @@ -153,19 +152,16 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, x, y, beta, n ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_gemver_3), grid_size, block_size, shmem, res.get_stream(), x, z, n ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_gemver_4), grid_size, block_size, shmem, res.get_stream(), A, x, w, alpha, n ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -189,7 +185,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) nblocks1, nthreads_per_block1, shmem, res.get_stream(), n, poly_gemver1_lambda ); - hipErrchk( hipGetLastError() ); size_t grid_size = RAJA_DIVIDE_CEILING_INT(n, block_size); @@ -206,7 +201,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_gemver2_lambda ); - hipErrchk( hipGetLastError() ); auto poly_gemver3_lambda = [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY5; @@ -217,7 +211,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_gemver3_lambda ); - hipErrchk( hipGetLastError() ); auto poly_gemver4_lambda = [=] __device__ (Index_type i) { POLYBENCH_GEMVER_BODY6; @@ -232,7 +225,6 @@ void POLYBENCH_GEMVER::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), n, poly_gemver4_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp index 9d0157653..c2772e506 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp @@ -64,7 +64,6 @@ void POLYBENCH_GESUMMV::runCudaVariantImpl(VariantID vid) A, B, alpha, beta, N ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp index fbf8f7381..2105ce55c 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp @@ -64,7 +64,6 @@ void POLYBENCH_GESUMMV::runHipVariantImpl(VariantID vid) A, B, alpha, beta, N ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp index 88c6e725f..9bf147bdd 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp @@ -105,14 +105,12 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_heat_3D_2), nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - cudaErrchk( 
cudaGetLastError() ); } @@ -142,7 +140,6 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_heat_3D_1_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, Index_type j, @@ -156,7 +153,6 @@ void POLYBENCH_HEAT_3D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_heat_3D_2_lambda ); - cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp index c244f9470..224630f63 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp @@ -105,14 +105,12 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_heat_3D_2), nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); } @@ -142,7 +140,6 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_heat_3D_1_lambda ); - hipErrchk( hipGetLastError() ); auto poly_heat_3D_2_lambda = [=] __device__ (Index_type i, Index_type j, @@ -156,7 +153,6 @@ void POLYBENCH_HEAT_3D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_heat_3D_2_lambda ); - hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index e46c5972d..9ff945c92 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -67,13 +67,11 @@ void POLYBENCH_JACOBI_1D::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, B, N ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_jacobi_1D_2), grid_size, block_size, shmem, res.get_stream(), A, B, N ); - cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index d4c0b9ba4..ce9336999 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -67,13 +67,11 @@ void POLYBENCH_JACOBI_1D::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_jacobi_1D_2), grid_size, block_size, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 2287a8b9f..5826a90a4 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -101,14 +101,12 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_jacobi_2D_2), nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - cudaErrchk( cudaGetLastError() ); } @@ -137,7 +135,6 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_jacobi_2D_1_lambda ); - cudaErrchk( cudaGetLastError() ); auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -150,7 +147,6 @@ void POLYBENCH_JACOBI_2D::runCudaVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, 
poly_jacobi_2D_2_lambda ); - cudaErrchk( cudaGetLastError() ); } diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index dd83bb5c2..7c4a3f835 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -101,14 +101,12 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_jacobi_2D_2), nblocks, nthreads_per_block, shmem, res.get_stream(), A, B, N ); - hipErrchk( hipGetLastError() ); } @@ -137,7 +135,6 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_jacobi_2D_1_lambda ); - hipErrchk( hipGetLastError() ); auto poly_jacobi_2D_2_lambda = [=] __device__ (Index_type i, Index_type j) { @@ -150,7 +147,6 @@ void POLYBENCH_JACOBI_2D::runHipVariantImpl(VariantID vid) nblocks, nthreads_per_block, shmem, res.get_stream(), N, poly_jacobi_2D_2_lambda ); - hipErrchk( hipGetLastError() ); } diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index ffa0debe8..d1990d8fe 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -75,13 +75,11 @@ void POLYBENCH_MVT::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, x1, y1, N ); - cudaErrchk( cudaGetLastError() ); RPlaunchCudaKernel( (poly_mvt_2), grid_size, block_size, shmem, res.get_stream(), A, x2, y2, N ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index fb6dd4d4d..7d33627bd 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -75,13 +75,11 @@ void POLYBENCH_MVT::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), A, x1, y1, N ); - hipErrchk( hipGetLastError() ); RPlaunchHipKernel( (poly_mvt_2), grid_size, block_size, shmem, res.get_stream(), A, x2, y2, N ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index 482df0c9d..c70add7db 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -57,7 +57,6 @@ void ADD::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), c, a, b, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -79,7 +78,6 @@ void ADD::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, add_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 6f8dbe965..0ff31e74f 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -56,7 +56,6 @@ void ADD::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), c, a, b, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void ADD::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, add_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 2a5020e1e..5977b8926 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -56,7 +56,6 @@ void COPY::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), c, a, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void 
COPY::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 52f1be4d4..51d66459f 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -56,7 +56,6 @@ void COPY::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), c, a, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void COPY::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, copy_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index f568efb08..0fbd862fb 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -80,7 +80,6 @@ void DOT::runCudaVariantBlockAtomic(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - cudaErrchk( cudaGetLastError() ); Real_type rdprod; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); @@ -144,7 +143,6 @@ void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - cudaErrchk( cudaGetLastError() ); Real_type rdprod; RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 93b6292a7..eee153865 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -80,7 +80,6 @@ void DOT::runHipVariantBlockAtomic(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - hipErrchk( hipGetLastError() ); Real_type rdprod; RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); @@ -144,7 +143,6 @@ void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - hipErrchk( hipGetLastError() ); Real_type rdprod; RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 5c210f344..2fa56fd1a 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -56,7 +56,6 @@ void MUL::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), b, c, alpha, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void MUL::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, mul_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index f231d4bad..d2cd11b7c 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -56,7 +56,6 @@ void MUL::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), b, c, alpha, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void MUL::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), ibegin, iend, mul_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index ac8cfc8f6..b5a3f2e34 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -56,7 +56,6 @@ void TRIAD::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, c, alpha, iend ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); @@ -78,7 +77,6 @@ void TRIAD::runCudaVariantImpl(VariantID vid) grid_size, block_size, shmem, 
res.get_stream(), ibegin, iend, triad_lambda ); - cudaErrchk( cudaGetLastError() ); } stopTimer(); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 667f67278..6b42deabe 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -56,7 +56,6 @@ void TRIAD::runHipVariantImpl(VariantID vid) grid_size, block_size, shmem, res.get_stream(), a, b, c, alpha, iend ); - hipErrchk( hipGetLastError() ); } stopTimer(); @@ -74,11 +73,10 @@ void TRIAD::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchHipKernel( (lambda_hip_forall), + decltype(triad_lambda)>), grid_size, block_size, shmem, res.get_stream(), ibegin, iend, triad_lambda ); - hipErrchk( hipGetLastError() ); } stopTimer(); From 194424233cf3f1aef05cd7e348696d5ae9229583 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 21 Dec 2023 10:24:04 -0800 Subject: [PATCH 231/454] Fix typo in FIRST_MIN --- src/lcals/FIRST_MIN-Cuda.cpp | 12 ++++++------ src/lcals/FIRST_MIN-Hip.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 282464f88..6dff6f4d7 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -236,7 +236,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_HIP ) { + if ( vid == Base_CUDA ) { if (tune_idx == t) { @@ -258,7 +258,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) } - if ( vid == RAJA_HIP ) { + if ( vid == RAJA_CUDA ) { if (tune_idx == t) { @@ -301,17 +301,17 @@ void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_HIP ) { + if ( vid == Base_CUDA ) { - addVariantTuningName(vid, "blkhst"+std::to_string(block_size)); + addVariantTuningName(vid, "blkhst_"+std::to_string(block_size)); addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size)); } - if ( vid == RAJA_HIP ) { + if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "blkdev"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 75677e07f..bbdea1605 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -302,7 +302,7 @@ void FIRST_MIN::setHipTuningDefinitions(VariantID vid) if ( vid == Base_HIP ) { - addVariantTuningName(vid, "blkhst"+std::to_string(block_size)); + addVariantTuningName(vid, "blkhst_"+std::to_string(block_size)); addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size)); @@ -310,7 +310,7 @@ void FIRST_MIN::setHipTuningDefinitions(VariantID vid) if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev"+std::to_string(block_size)); + addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); From 6443b964d4400d5745d28c653134ae3a9330ca7d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 10:40:25 -0800 Subject: [PATCH 232/454] Add helper classes for reducers Add a group for algorithms and a group for mappings --- src/common/GPUUtils.hpp | 46 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 23140a29e..5ee735e4f 100644 --- 
a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -140,6 +140,52 @@ using make_list_type =
 
 } // closing brace for gpu_block_size namespace
 
+namespace gpu_algorithm {
+
+struct block_atomic_helper
+{
+  static constexpr bool atomic = true;
+  static std::string get_name() { return "blkatm"; }
+};
+
+struct block_device_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkdev"; }
+};
+
+struct block_host_helper
+{
+  static constexpr bool atomic = false;
+  static std::string get_name() { return "blkhst"; }
+};
+
+using reducer_helpers = camp::list<
+    block_atomic_helper,
+    block_device_helper >;
+
+} // closing brace for gpu_algorithm namespace
+
+namespace gpu_mapping {
+
+struct global_direct_helper
+{
+  static constexpr bool direct = true;
+  static std::string get_name() { return "direct"; }
+};
+
+struct global_loop_occupancy_grid_stride_helper
+{
+  static constexpr bool direct = false;
+  static std::string get_name() { return "occgs"; }
+};
+
+using reducer_helpers = camp::list<
+    global_direct_helper,
+    global_loop_occupancy_grid_stride_helper >;
+
+} // closing brace for gpu_mapping namespace
+
 } // closing brace for rajaperf namespace
 
 // allocate pointer of pointer_type with length

From a589c0f4caa71f9acca0fa602afa8734153d9e42 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 28 Dec 2023 10:40:48 -0800
Subject: [PATCH 233/454] Deduplicate PI_REDUCE

---
 src/basic/PI_REDUCE-Cuda.cpp | 206 ++++++++-------------------------
 src/basic/PI_REDUCE-Hip.cpp  | 206 ++++++++-------------------------
 src/basic/PI_REDUCE.hpp      |  24 ++--
 3 files changed, 120 insertions(+), 316 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index 0b35b39c2..17d11cf41 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -16,6 +16,8 @@
 #include
 #include
+#include <type_traits>
+#include <limits>
 
 namespace rajaperf
@@ -53,73 +55,10 @@ __global__ void pi_reduce(Real_type dx,
 }
 
-
-template < size_t block_size >
-void PI_REDUCE::runCudaVariantBlockAtomic(VariantID vid)
-{
-  const Index_type run_reps = getRunReps();
-  const Index_type ibegin = 0;
-  const Index_type iend = getActualProblemSize();
-
-  auto res{getCudaResource()};
-
-  PI_REDUCE_DATA_SETUP;
-
-  if ( vid == Base_CUDA ) {
-
-    RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1);
-
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-      RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1);
-
-      const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
-      constexpr size_t shmem = sizeof(Real_type)*block_size;
-
-      RPlaunchCudaKernel( (pi_reduce<block_size>),
-                          grid_size, block_size,
-                          shmem, res.get_stream(),
-                          dx,
-                          pi, m_pi_init,
-                          iend );
-
-      Real_type rpi;
-      RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1);
-      m_pi = rpi * static_cast<Real_type>(4);
-
-    }
-    stopTimer();
-
-    RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi);
-
-  } else if ( vid == RAJA_CUDA ) {
-
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-      RAJA::ReduceSum<RAJA::cuda_reduce, Real_type> pi(m_pi_init);
-
-      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >( res,
-        RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
-          PI_REDUCE_BODY;
-      });
-
-      m_pi = static_cast<Real_type>(4) * static_cast<Real_type>(pi.get());
-
-    }
-    stopTimer();
-
-  } else {
-     getCout() << "\n  PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl;
-  }
-}
-
-template < size_t block_size >
-void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid)
+template < size_t block_size, bool direct >
+void PI_REDUCE::runCudaVariantBase(VariantID vid)
 {
   const
Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -131,8 +70,10 @@ void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (pi_reduce), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -158,65 +99,22 @@ void PI_REDUCE::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(pi, hpi); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - PI_REDUCE_BODY; - }); - - m_pi = 4.0 * static_cast(pi.get()); - - } - stopTimer(); - } else { getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; } } - -template < size_t block_size > -void PI_REDUCE::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void PI_REDUCE::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); + using reduction_policy = std::conditional_t; - auto res{getCudaResource()}; - - PI_REDUCE_DATA_SETUP; + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - PI_REDUCE_BODY; - }); - - m_pi = 4.0 * static_cast(pi.get()); - - } - stopTimer(); - - } else { - getCout() << "\n PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void PI_REDUCE::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -230,9 +128,9 @@ void PI_REDUCE::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); @@ -258,45 +156,40 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - 
setBlockSize(block_size); - runCudaVariantBlockDevice(vid); + } - } + t += 1; - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -319,23 +212,36 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + if ( vid == Base_CUDA ) { - if ( vid == RAJA_CUDA ) { + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + } else if ( vid == RAJA_CUDA ) { - } + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); } }); } + } } // end namespace basic diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index a56d2174a..ba461485e 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -53,73 +55,10 @@ __global__ void pi_reduce(Real_type dx, } - -template < size_t block_size > -void PI_REDUCE::runHipVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - PI_REDUCE_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchHipKernel( (pi_reduce), - grid_size, block_size, - shmem, res.get_stream(), - dx, - pi, m_pi_init, - iend ); - - Real_type rpi; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi = rpi * static_cast(4); - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - PI_REDUCE_BODY; - }); - - m_pi = static_cast(4) * static_cast(pi.get()); - - } - stopTimer(); - - } else { - getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void PI_REDUCE::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -131,8 +70,10 @@ void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = 
detail::getHipOccupancyMaxBlocks( - (pi_reduce), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -158,65 +99,22 @@ void PI_REDUCE::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(pi, hpi); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - PI_REDUCE_BODY; - }); - - m_pi = 4.0 * static_cast(pi.get()); - - } - stopTimer(); - } else { getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; } } - -template < size_t block_size > -void PI_REDUCE::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void PI_REDUCE::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); + using reduction_policy = std::conditional_t; - auto res{getHipResource()}; - - PI_REDUCE_DATA_SETUP; + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum pi(m_pi_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - PI_REDUCE_BODY; - }); - - m_pi = 4.0 * static_cast(pi.get()); - - } - stopTimer(); - - } else { - getCout() << "\n PI_REDUCE : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void PI_REDUCE::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -230,9 +128,9 @@ void PI_REDUCE::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum pi(m_pi_init); + RAJA::ReduceSum pi(m_pi_init); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { PI_REDUCE_BODY; }); @@ -258,45 +156,40 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); + if ( vid == Base_HIP ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runHipVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_HIP ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_HIP ) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantRAJA(vid); - setBlockSize(block_size); - runHipVariantBlockDevice(vid); + } - } + t += 1; - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -319,17 +212,30 @@ void PI_REDUCE::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - 
addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + if ( vid == Base_HIP ) { - if ( vid == RAJA_HIP ) { + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } else if ( vid == RAJA_HIP ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + }); - } } }); diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 2f993f8e7..fa1bf6816 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -59,22 +59,14 @@ class PI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 04fd5e2741ad409d6e40ff0e879b7ffdcc2ce603 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 10:56:15 -0800 Subject: [PATCH 234/454] deduplicate REDUCE3_INT --- src/basic/REDUCE3_INT-Cuda.cpp | 225 +++++++++------------------------ src/basic/REDUCE3_INT-Hip.cpp | 225 +++++++++------------------------ src/basic/REDUCE3_INT.hpp | 24 ++-- 3 files changed, 122 insertions(+), 352 deletions(-) diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index e46f2fd70..5e80bd7cf 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -66,82 +68,10 @@ __global__ void reduce3int(Int_ptr vec, } - -template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlockAtomic(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE3_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - REDUCE3_INT_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); - - startTimer(); - for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { - - Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; - RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - - RPlaunchCudaKernel( (reduce3int), - grid_size, block_size, - shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - - Int_type rvmem[3]; - RAJAPERF_CUDA_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); - m_vsum += rvmem[0]; - m_vmin = RAJA_MIN(m_vmin, rvmem[1]); - m_vmax = RAJA_MAX(m_vmax, rvmem[2]); - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(vmem, hvmem); - - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -153,8 +83,10 @@ void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (reduce3int), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -185,72 +117,22 @@ void REDUCE3_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(vmem, hvmem); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - } else { getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; } } -template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - REDUCE3_INT_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { + using reduction_policy = std::conditional_t; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; - } -} + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; -template < size_t block_size > -void REDUCE3_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -264,11 +146,11 @@ void REDUCE3_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -296,45 +178,40 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if 
(tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); - - } + } - t += 1; + t += 1; - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -357,17 +234,29 @@ void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_CUDA ) { + } else if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index dfeb9d085..a495444ee 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -66,82 +68,10 @@ __global__ void reduce3int(Int_ptr vec, } - -template < size_t block_size > -void REDUCE3_INT::runHipVariantBlockAtomic(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE3_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE3_INT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; - RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - - RPlaunchHipKernel( (reduce3int), - grid_size, block_size, - shmem, res.get_stream(), - vec, - vmem + 0, m_vsum_init, - vmem + 1, m_vmin_init, - vmem + 2, m_vmax_init, - iend ); - - Int_type rvmem[3]; - RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); - m_vsum += rvmem[0]; - m_vmin = RAJA_MIN(m_vmin, rvmem[1]); - m_vmax = RAJA_MAX(m_vmax, rvmem[2]); - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; - } -} 
- -template < size_t block_size > -void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -153,8 +83,10 @@ void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (reduce3int), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -185,72 +117,22 @@ void REDUCE3_INT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(vmem, hvmem); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - } else { getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > -void REDUCE3_INT::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE3_INT::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE3_INT_DATA_SETUP; - - if ( vid == RAJA_HIP ) { + using reduction_policy = std::conditional_t; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; - } -} + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; -template < size_t block_size > -void REDUCE3_INT::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -264,11 +146,11 @@ void REDUCE3_INT::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -296,45 +178,40 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || 
run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); + if ( vid == Base_HIP ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runHipVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_HIP ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_HIP ) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantRAJA(vid); - setBlockSize(block_size); - runHipVariantBlockDevice(vid); - - } + } - t += 1; + t += 1; - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -357,17 +234,29 @@ void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index bced8f059..55821efb7 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -74,22 +74,14 @@ class REDUCE3_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 0d31729d8a6b775b44fd6403b8ac2e120be3c2e6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 11:10:01 -0800 Subject: [PATCH 235/454] deduplicate REDUCE_STRUCT --- 
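Note (kept below the preceding "---" cut line, which git am drops, per patch
convention): patches 233-235 apply one and the same deduplication pattern. The
sketch below is an illustrative, self-contained distillation of that pattern,
not code from the suite: the fake_device_reduce/fake_atomic_reduce policy types
and the run_variant_sketch()/register_and_run() names are invented stand-ins,
while block_atomic_helper and block_device_helper mirror the helpers added to
common/GPUUtils.hpp in patch 232. Each helper pairs a compile-time flag with a
tuning-name fragment; std::conditional_t turns the flag into a policy type, so
a single templated variant body replaces the four hand-written
Block{Atomic,Device}[OccGS] bodies, and walking the helper list registers one
tuning name per combination.

  #include <iostream>
  #include <string>
  #include <type_traits>

  // Invented stand-ins for the RAJA policies the real code selects
  // (RAJA::cuda_reduce vs. RAJA::cuda_reduce_atomic, or the hip_* twins).
  struct fake_device_reduce {};
  struct fake_atomic_reduce {};

  // These mirror gpu_algorithm::block_atomic_helper / block_device_helper.
  struct block_atomic_helper
  {
    static constexpr bool atomic = true;
    static std::string get_name() { return "blkatm"; }
  };

  struct block_device_helper
  {
    static constexpr bool atomic = false;
    static std::string get_name() { return "blkdev"; }
  };

  // One templated variant body; the bool template parameter picks the
  // reduction policy at compile time, as the new runCudaVariantRAJA does.
  template < bool atomic >
  void run_variant_sketch()
  {
    using reduction_policy = std::conditional_t< atomic,
                                                 fake_atomic_reduce,
                                                 fake_device_reduce >;
    std::cout << "  running with "
              << (std::is_same<reduction_policy, fake_atomic_reduce>::value
                    ? "atomic" : "device")
              << " block reductions\n";
  }

  // Mirrors how setCudaTuningDefinitions() walks the helper list so each
  // policy combination registers a distinct tuning name.
  template < typename Helper >
  void register_and_run(size_t block_size)
  {
    std::cout << Helper::get_name() + "_" + std::to_string(block_size) << "\n";
    run_variant_sketch< Helper::atomic >();
  }

  int main()
  {
    register_and_run< block_atomic_helper >(256);  // "blkatm_256" tuning
    register_and_run< block_device_helper >(256);  // "blkdev_256" tuning
    return 0;
  }

The same trick handles the direct versus occupancy-calculated grid-size choice:
the gpu_mapping helpers carry a direct flag that selects between the direct and
occ_calc execution policies, so mappings and algorithms compose instead of
multiplying hand-written variants.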
src/basic/REDUCE_STRUCT-Cuda.cpp | 259 ++++++++----------------------- src/basic/REDUCE_STRUCT-Hip.cpp | 257 +++++++----------------------- src/basic/REDUCE_STRUCT.hpp | 24 +-- 3 files changed, 129 insertions(+), 411 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 677084d1b..53bf4d1b3 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -96,94 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } - - -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - REDUCE_STRUCT_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; - RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - - RPlaunchCudaKernel( (reduce_struct), - grid_size, block_size, - shmem, res.get_stream(), - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N ); - - Real_type rmem[6]; - RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); - points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); - points.SetXMin(rmem[1]); - points.SetXMax(rmem[2]); - points.SetYMin(rmem[4]); - points.SetYMax(rmem[5]); - m_points=points; - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); - - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; - } - -} - -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -195,8 +113,10 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (reduce_struct), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -230,88 +150,23 @@ void REDUCE_STRUCT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - } else { getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; } } -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); + using reduction_policy = std::conditional_t; - auto res{getCudaResource()}; - - REDUCE_STRUCT_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; - } - -} - -template < size_t block_size > -void REDUCE_STRUCT::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -325,14 +180,14 @@ void REDUCE_STRUCT::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); @@ -365,45 +220,40 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) if 
(run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); - - } + } - t += 1; + t += 1; - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -426,23 +276,36 @@ void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_CUDA ) { + } else if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - } + }); + + } + + }); } }); } + } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 451f8b1a4..dbebaeddc 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -96,94 +98,10 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } - - - -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlockAtomic(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE_STRUCT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE_STRUCT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; - RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - - RPlaunchHipKernel( (reduce_struct), - grid_size, block_size, - shmem, res.get_stream(), - points.x, points.y, - mem, mem+1, mem+2, // xcenter,xmin,xmax - mem+3, mem+4, mem+5, // ycenter,ymin,ymax - m_init_sum, m_init_min, m_init_max, - points.N ); - - Real_type rmem[6]; - RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); - 
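Both runCudaVariant and setCudaTuningDefinitions now walk the same nested helper lists, so the tuning index assigned at definition time lines up with the one consumed at run time. A stripped-down sketch of that counting scheme, with a hypothetical stand-in for the camp-style seq_for and invented helper types:

    #include <cstddef>
    #include <tuple>

    // Hypothetical stand-in for seq_for over a type list: call f with a
    // default-constructed instance of each helper type in turn (C++17 fold).
    template < typename... Ts, typename F >
    void seq_for_sketch(std::tuple<Ts...>, F&& f)
    {
      (f(Ts{}), ...);
    }

    struct direct_mapping { static constexpr bool direct = true;  };
    struct occgs_mapping  { static constexpr bool direct = false; };

    // Every (mapping, variant) combination claims the next index t, so a
    // given tune_idx selects exactly one concrete instantiation.
    template < typename F >
    void dispatch_tuning(std::size_t tune_idx, F&& run)
    {
      std::size_t t = 0;
      seq_for_sketch(std::tuple<direct_mapping, occgs_mapping>{},
                     [&](auto mapping_helper) {
        if (tune_idx == t) { run(decltype(mapping_helper)::direct); }
        t += 1;
      });
    }

Because the same traversal order is used in both places, adding a helper to a list grows the tuning set without any hand-maintained index bookkeeping.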
points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); - points.SetXMin(rmem[1]); - points.SetXMax(rmem[2]); - points.SetYMin(rmem[4]); - points.SetYMax(rmem[5]); - m_points=points; - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; - } - -} -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -195,8 +113,10 @@ void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (reduce_struct), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -230,87 +150,23 @@ void REDUCE_STRUCT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(mem, hmem); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - } else { getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE_STRUCT_DATA_SETUP; + using reduction_policy = std::conditional_t; - if ( vid == RAJA_HIP ) { + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - 
RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; - - } - stopTimer(); - - } else { - getCout() << "\n REDUCE_STRUCT : Unknown Hip variant id = " << vid << std::endl; - } - -} -template < size_t block_size > -void REDUCE_STRUCT::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -324,14 +180,14 @@ void REDUCE_STRUCT::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_STRUCT_BODY_RAJA; }); @@ -364,45 +220,40 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - t += 1; + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantBase(vid); - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + } - } + t += 1; - t += 1; + } else if ( vid == RAJA_HIP ) { - if ( vid == RAJA_HIP ) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (tune_idx == t) { + if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantBlockDevice(vid); + setBlockSize(block_size); + runHipVariantRAJA(vid); - } - - t += 1; + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -425,17 +276,29 @@ void REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - if ( vid == RAJA_HIP ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + 
decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 10222f941..beebd8692 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -89,22 +89,14 @@ class REDUCE_STRUCT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); struct PointsType { Index_type N; From 80d46219a441684bac0ca634b79a08f638dd1f6d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 11:16:33 -0800 Subject: [PATCH 236/454] deduplicate TRAP_INT --- src/basic/TRAP_INT-Cuda.cpp | 207 ++++++++++-------------------------- src/basic/TRAP_INT-Hip.cpp | 206 ++++++++++------------------------- src/basic/TRAP_INT.hpp | 24 ++--- 3 files changed, 119 insertions(+), 318 deletions(-) diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index e4010431b..578115678 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -72,75 +74,10 @@ __global__ void trapint(Real_type x0, Real_type xp, } - -template < size_t block_size > -void TRAP_INT::runCudaVariantBlockAtomic(VariantID vid) +template < size_t block_size, bool direct > +void TRAP_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - TRAP_INT_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchCudaKernel( (trapint), - grid_size, block_size, - shmem, res.get_stream(), - x0, xp, - y, yp, - h, - sumx, - iend); - - Real_type rsumx; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); - m_sumx += rsumx * h; - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); - - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - 
m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - } else { - getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -152,8 +89,10 @@ void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (trapint), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -181,64 +120,22 @@ void TRAP_INT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(sumx, hsumx); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - } else { getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; } } -template < size_t block_size > -void TRAP_INT::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void TRAP_INT::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - TRAP_INT_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); + using reduction_policy = std::conditional_t; - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - } else { - getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; - } -} + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; -template < size_t block_size > -void TRAP_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -252,9 +149,9 @@ void TRAP_INT::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -280,45 +177,40 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - 
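TRAP_INT accumulates only the integrand values on the device and applies the slice width once per pass on the host, which is why every path above ends with m_sumx += sum * h. A scalar sketch of the same computation, with the integrand stubbed out since TRAP_INT_BODY is defined elsewhere in the suite:

    #include <cstddef>

    // Sketch: h * sum_i f(x0 + i*h), matching the device-side reduction
    // followed by the single host-side scale. trap_fn stands in for the
    // real integrand used by TRAP_INT_BODY.
    double trap_int_sketch(double x0, double h, std::size_t n,
                           double (*trap_fn)(double))
    {
      double sumx = 0.0;
      for (std::size_t i = 0; i < n; ++i) {
        sumx += trap_fn(x0 + static_cast<double>(i) * h);
      }
      return sumx * h;  // scale once, not per iterate
    }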
runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if (tune_idx == t) { - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); - - } + setBlockSize(block_size); + runCudaVariantRAJA(vid); - t += 1; + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -341,23 +233,36 @@ void TRAP_INT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_CUDA ) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - if ( vid == RAJA_CUDA ) { + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + } else if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - } + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); } }); } + } } // end namespace basic diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index aff820afc..83af19e55 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -72,75 +74,10 @@ __global__ void trapint(Real_type x0, Real_type xp, } - -template < size_t block_size > -void TRAP_INT::runHipVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - TRAP_INT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchHipKernel( (trapint), - grid_size, block_size, - shmem, res.get_stream(), - x0, xp, - y, yp, - h, - sumx, - iend); - - Real_type rsumx; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); - m_sumx += rsumx * h; - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - } else { - getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void TRAP_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const 
Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -152,8 +89,10 @@ void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (trapint), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -181,64 +120,22 @@ void TRAP_INT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(sumx, hsumx); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - } else { getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > -void TRAP_INT::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void TRAP_INT::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - TRAP_INT_DATA_SETUP; - - if ( vid == RAJA_HIP ) { + using reduction_policy = std::conditional_t; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - } else { - getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; - } -} + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; -template < size_t block_size > -void TRAP_INT::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -252,9 +149,9 @@ void TRAP_INT::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); + RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { TRAP_INT_BODY; }); @@ -280,45 +177,40 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); - - } + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - t += 1; + if ( vid == Base_HIP ) { - if (tune_idx == t) { + if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + setBlockSize(block_size); + runHipVariantBase(vid); - } + } - t += 1; + t += 1; - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - if (tune_idx == t) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - setBlockSize(block_size); - runHipVariantBlockDevice(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + 
runHipVariantRAJA(vid); - t += 1; + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -341,17 +233,29 @@ void TRAP_INT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 107ed67d8..3e3eebf9c 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -71,22 +71,14 @@ class TRAP_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 8a6737ffa5ca68716aa02365afde8926d9e5cde1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 11:40:03 -0800 Subject: [PATCH 237/454] deduplicate REDUCE_SUM --- src/algorithm/REDUCE_SUM-Cuda.cpp | 206 +++++++++--------------------- src/algorithm/REDUCE_SUM-Hip.cpp | 204 ++++++++--------------------- src/algorithm/REDUCE_SUM.hpp | 24 ++-- 3 files changed, 120 insertions(+), 314 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index d43fb6d54..f117db199 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -19,6 +19,8 @@ #include #include +#include +#include namespace rajaperf @@ -126,71 +128,10 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const 
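REDUCE_SUM is the one kernel here that also keeps a vendor-library tuning (runCudaVariantCub / runHipVariantRocprim) alongside the deduplicated paths. For reference, a minimal sketch of CUB's two-phase device sum, assuming the documented idiom where the first call only sizes temporary storage:

    #include <cub/device/device_reduce.cuh>

    // First call computes temp_bytes; second call performs the reduction.
    void cub_sum_sketch(const double* d_in, double* d_out, int num_items,
                        cudaStream_t stream)
    {
      void*  d_temp     = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                             num_items, stream);
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                             num_items, stream);
      cudaFree(d_temp);
    }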
Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - REDUCE_SUM_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchCudaKernel( (reduce_sum), - grid_size, block_size, - shmem, res.get_stream(), - x, sum, m_sum_init, iend ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); - - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - } else { - - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; - - } - -} - -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE_SUM::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -202,8 +143,10 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (reduce_sum), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -225,23 +168,6 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(sum, hsum); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - } else { getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; @@ -250,45 +176,17 @@ void REDUCE_SUM::runCudaVariantBlockAtomicOccGS(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE_SUM::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - REDUCE_SUM_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); + using reduction_policy = std::conditional_t; - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - } else { - - getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; - - } + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; -} - -template < size_t block_size > -void REDUCE_SUM::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -302,9 +200,9 @@ void REDUCE_SUM::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -345,45 +243,41 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if (tune_idx == t) { - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); - - } + setBlockSize(block_size); + runCudaVariantRAJA(vid); - t += 1; + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; + }); - } } }); @@ -411,22 +305,36 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - 
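With the direct mapping folded in, both base reducers share one launcher: direct leaves max_grid_size effectively unbounded so the grid is sized one block per chunk of iterates, while the occupancy path caps the grid for grid-stride reuse. A sketch of that computation, assuming the stripped template argument above is size_t:

    #include <algorithm>
    #include <cstddef>
    #include <limits>

    // occupancy_max_blocks stands in for detail::get*OccupancyMaxBlocks().
    std::size_t reducer_grid_size(bool direct, std::size_t iend,
                                  std::size_t block_size,
                                  std::size_t occupancy_max_blocks)
    {
      const std::size_t max_grid_size = direct
          ? std::numeric_limits<std::size_t>::max()  // no cap: single pass
          : occupancy_max_blocks;                    // cap: grid-stride loop
      const std::size_t normal_grid_size =
          (iend + block_size - 1) / block_size;      // RAJA_DIVIDE_CEILING_INT
      return std::min(normal_grid_size, max_grid_size);
    }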
addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + if ( vid == Base_CUDA ) { - if ( vid == RAJA_CUDA ) { + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + } else if ( vid == RAJA_CUDA ) { + + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); - } } }); } + } } // end namespace algorithm diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index b780e9be9..445d29dd4 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -24,6 +24,8 @@ #include #include +#include +#include namespace rajaperf @@ -153,71 +155,10 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE_SUM_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchHipKernel( (reduce_sum), - grid_size, block_size, - shmem, res.get_stream(), - x, sum, m_sum_init, iend ); - - RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - } else { - - getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; - - } - -} - -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void REDUCE_SUM::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -229,8 +170,10 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (reduce_sum), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -252,23 +195,6 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(sum, hsum); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sum(m_sum_init); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - } else { getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; @@ -277,45 +203,17 @@ void REDUCE_SUM::runHipVariantBlockAtomicOccGS(VariantID vid) } -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void REDUCE_SUM::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - REDUCE_SUM_DATA_SETUP; - - if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + using reduction_policy = std::conditional_t; - RAJA::ReduceSum sum(m_sum_init); + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE_SUM_BODY; - }); - - m_sum = sum.get(); - - } - stopTimer(); - - } else { - - getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; - - } - -} - -template < size_t block_size > -void REDUCE_SUM::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -329,9 +227,9 @@ void REDUCE_SUM::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { REDUCE_SUM_BODY; }); @@ -372,45 +270,41 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - } + if ( vid == Base_HIP ) { - t += 1; + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runHipVariantBase(vid); - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + } - } + t += 1; - t += 1; + } else if ( vid == RAJA_HIP ) { - if ( vid == RAJA_HIP ) { + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if (tune_idx == t) { + if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantBlockDevice(vid); + setBlockSize(block_size); + runHipVariantRAJA(vid); - } + } - t += 1; + t += 1; - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; + }); - } } }); @@ -442,17 +336,29 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, 
"blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 4a9db2831..58fe8387f 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -63,22 +63,14 @@ class REDUCE_SUM : public KernelBase void setHipTuningDefinitions(VariantID vid); void runCudaVariantCub(VariantID vid); void runHipVariantRocprim(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 2672c631b59b803899966978754d8e40751ceebd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 11:47:40 -0800 Subject: [PATCH 238/454] deduplicate DOT --- src/stream/DOT-Cuda.cpp | 203 +++++++++++----------------------------- src/stream/DOT-Hip.cpp | 202 +++++++++++---------------------------- src/stream/DOT.hpp | 24 ++--- 3 files changed, 119 insertions(+), 310 deletions(-) diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 0fbd862fb..86f94d0ec 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -52,71 +54,10 @@ __global__ void dot(Real_ptr a, Real_ptr b, } - -template < size_t block_size > -void DOT::runCudaVariantBlockAtomic(VariantID vid) +template < size_t block_size, bool direct > +void DOT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - DOT_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
- RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchCudaKernel( (dot), - grid_size, block_size, - shmem, res.get_stream(), - a, b, dprod, m_dot_init, iend ); - - Real_type rdprod; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); - m_dot += rdprod; - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod); - - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - - } else { - getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -128,8 +69,10 @@ void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (dot), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (dot), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -153,64 +96,22 @@ void DOT::runCudaVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_CUDA_REDUCER_TEARDOWN(dprod, hdprod); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - } else { getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; } } -template < size_t block_size > -void DOT::runCudaVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void DOT::runCudaVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - DOT_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); + using reduction_policy = std::conditional_t; - } - stopTimer(); + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; - } else { - getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void DOT::runCudaVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -224,9 +125,9 @@ void DOT::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum dot(m_dot_init); + RAJA::ReduceSum dot(m_dot_init); - 
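RPlaunchCudaKernel is assumed here to forward its grid, block, shared-memory, and stream arguments to an ordinary kernel launch; under that assumption the DOT base launch amounts to the following (CUDA syntax, using the suite's dot kernel and typedefs):

    // Assumed expansion of the RPlaunchCudaKernel call in the Base_CUDA
    // path: dot<block_size> reduces a[i]*b[i] into *dprod from m_dot_init.
    template < size_t block_size >
    void launch_dot_sketch(Real_ptr a, Real_ptr b, Real_ptr dprod,
                           Real_type dot_init, Index_type iend,
                           size_t grid_size, size_t shmem, cudaStream_t stream)
    {
      dot<block_size><<<grid_size, block_size, shmem, stream>>>(
          a, b, dprod, dot_init, iend);
    }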
RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { DOT_BODY; }); @@ -252,45 +153,40 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runCudaVariantBlockAtomic(vid); + if ( vid == Base_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_CUDA ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_CUDA ) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantRAJA(vid); - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); + } - } + t += 1; - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -313,23 +209,36 @@ void DOT::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + if ( vid == Base_CUDA ) { - if ( vid == RAJA_CUDA ) { + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + } else if ( vid == RAJA_CUDA ) { - } + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } + + }); } }); } + } } // end namespace stream diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index eee153865..89fac768e 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -52,71 +54,10 @@ __global__ void dot(Real_ptr a, Real_ptr b, } - -template < size_t block_size > -void DOT::runHipVariantBlockAtomic(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - DOT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = sizeof(Real_type)*block_size; - - RPlaunchHipKernel( (dot), - grid_size, block_size, - shmem, res.get_stream(), - a, b, dprod, m_dot_init, iend ); - - Real_type rdprod; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); - m_dot += rdprod; - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod); - - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep 
= 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - - } else { - getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) +template < size_t block_size, bool direct > +void DOT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -128,8 +69,10 @@ void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (dot), block_size, shmem); + const size_t max_grid_size = direct + ? std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (dot), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -153,64 +96,22 @@ void DOT::runHipVariantBlockAtomicOccGS(VariantID vid) RAJAPERF_HIP_REDUCER_TEARDOWN(dprod, hdprod); - } else if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::hip_exec_occ_calc >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - } else { getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > -void DOT::runHipVariantBlockDevice(VariantID vid) +template < size_t block_size, bool atomic, bool direct > +void DOT::runHipVariantRAJA(VariantID vid) { - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - DOT_DATA_SETUP; - - if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DOT_BODY; - }); + using reduction_policy = std::conditional_t; - m_dot += static_cast(dot.get()); - - } - stopTimer(); + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; - } else { - getCout() << "\n DOT : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void DOT::runHipVariantBlockDeviceOccGS(VariantID vid) -{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -224,9 +125,9 @@ void DOT::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum dot(m_dot_init); + RAJA::ReduceSum dot(m_dot_init); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { DOT_BODY; }); @@ -252,45 +153,40 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runHipVariantBlockAtomic(vid); + if ( vid == Base_HIP ) { - } + if (tune_idx 
== t) { - t += 1; + setBlockSize(block_size); + runHipVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runHipVariantBlockAtomicOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_HIP ) { - t += 1; + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - if ( vid == RAJA_HIP ) { + if (tune_idx == t) { - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDevice(vid); - - } + setBlockSize(block_size); + runHipVariantRAJA(vid); - t += 1; + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + }); } - t += 1; - - } + }); } @@ -313,17 +209,29 @@ void DOT::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - addVariantTuningName(vid, "blkatm_"+std::to_string(block_size)); + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + if ( vid == Base_HIP ) { + + auto algorithm_helper = gpu_algorithm::block_atomic_helper{}; - addVariantTuningName(vid, "blkatm_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) { - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + }); + + } - } + }); } diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 50391939b..04fd10727 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -55,22 +55,14 @@ class DOT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomic(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockAtomicOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool atomic, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 83f3c4f7b53277b094faa1c5cf1b17ccca9d802c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 28 Dec 2023 11:59:05 -0800 Subject: [PATCH 239/454] deduplicate FIRST_MIN --- src/lcals/FIRST_MIN-Cuda.cpp | 179 ++++++++++------------------------- src/lcals/FIRST_MIN-Hip.cpp | 175 ++++++++++------------------------ src/lcals/FIRST_MIN.hpp | 24 ++--- 3 files changed, 106 insertions(+), 272 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp 
b/src/lcals/FIRST_MIN-Cuda.cpp index 6dff6f4d7..68f6a43e8 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -58,90 +60,8 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runCudaVariantBlockHost(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - FIRST_MIN_DATA_SETUP; - - if ( vid == Base_CUDA ) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - FIRST_MIN_MINLOC_INIT; - RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); - - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - - RPlaunchCudaKernel( (first_min), - grid_size, block_size, - shmem, res.get_stream(), - x, dminloc, mymin, - iend ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); - for (Index_type i = 0; i < static_cast(grid_size); i++) { - if ( mymin_block[i].val < mymin.val ) { - mymin = mymin_block[i]; - } - } - m_minloc = RAJA_MAX(m_minloc, mymin.loc); - - } - stopTimer(); - - RAJAPERF_CUDA_REDUCER_TEARDOWN(dminloc, mymin_block); - - } else { - getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void FIRST_MIN::runCudaVariantBlockDevice(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - FIRST_MIN_DATA_SETUP; - - if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); - - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - FIRST_MIN_BODY_RAJA; - }); - - m_minloc = loc.getLoc(); - - } - stopTimer(); - - } else { - getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) +template < size_t block_size, bool direct > +void FIRST_MIN::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = getActualProblemSize(); @@ -153,8 +73,10 @@ void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) if ( vid == Base_CUDA ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = detail::getCudaOccupancyMaxBlocks( - (first_min), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getCudaOccupancyMaxBlocks( + (first_min), block_size, shmem); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -179,7 +101,7 @@ void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) mymin = mymin_block[i]; } } - m_minloc = RAJA_MAX(m_minloc, mymin.loc); + m_minloc = mymin.loc; } stopTimer(); @@ -191,9 +113,15 @@ void FIRST_MIN::runCudaVariantBlockHostOccGS(VariantID vid) } } -template < size_t block_size > -void FIRST_MIN::runCudaVariantBlockDeviceOccGS(VariantID vid) +template < size_t block_size, bool direct > +void FIRST_MIN::runCudaVariantRAJA(VariantID vid) { + using reduction_policy = RAJA::cuda_reduce; + + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -207,10 +135,10 @@ void FIRST_MIN::runCudaVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::cuda_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -236,49 +164,35 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_CUDA ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantBlockHost(vid); - - } - - t += 1; + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - if (tune_idx == t) { + if ( vid == Base_CUDA ) { - setBlockSize(block_size); - runCudaVariantBlockHostOccGS(vid); + if (tune_idx == t) { - } - - t += 1; - - } + setBlockSize(block_size); + runCudaVariantBase(vid); - if ( vid == RAJA_CUDA ) { + } - if (tune_idx == t) { + t += 1; - setBlockSize(block_size); - runCudaVariantBlockDevice(vid); + } else if ( vid == RAJA_CUDA ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runCudaVariantRAJA(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runCudaVariantBlockDeviceOccGS(vid); + t += 1; } - t += 1; - - } + }); } @@ -301,27 +215,34 @@ void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_CUDA ) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkhst_"+std::to_string(block_size)); + if ( vid == Base_CUDA ) { - addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_host_helper{}; - } + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_CUDA ) { + } else if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - } + } + + }); } }); } + } } // end namespace lcals diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index bbdea1605..53936fbed 100644 --- 
a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -16,6 +16,8 @@ #include #include +#include +#include namespace rajaperf @@ -58,90 +60,8 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size > -void FIRST_MIN::runHipVariantBlockHost(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - FIRST_MIN_DATA_SETUP; - - if ( vid == Base_HIP ) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - FIRST_MIN_MINLOC_INIT; - RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); - - constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - - RPlaunchHipKernel( (first_min), - grid_size, block_size, - shmem, res.get_stream(), - x, dminloc, mymin, - iend ); - - RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); - for (Index_type i = 0; i < static_cast(grid_size); i++) { - if ( mymin_block[i].val < mymin.val ) { - mymin = mymin_block[i]; - } - } - m_minloc = mymin.loc; - - } - stopTimer(); - - RAJAPERF_HIP_REDUCER_TEARDOWN(dminloc, mymin_block); - - } else { - getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void FIRST_MIN::runHipVariantBlockDevice(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - FIRST_MIN_DATA_SETUP; - - if ( vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); - - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - FIRST_MIN_BODY_RAJA; - }); - - m_minloc = loc.getLoc(); - - } - stopTimer(); - - } else { - getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; - } -} - -template < size_t block_size > -void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid) +template < size_t block_size, bool direct > +void FIRST_MIN::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type iend = getActualProblemSize(); @@ -153,8 +73,10 @@ void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid) if ( vid == Base_HIP ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = detail::getHipOccupancyMaxBlocks( - (first_min), block_size, shmem); + const size_t max_grid_size = direct + ? 
std::numeric_limits::max() + : detail::getHipOccupancyMaxBlocks( + (first_min), block_size, shmem); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -191,9 +113,15 @@ void FIRST_MIN::runHipVariantBlockHostOccGS(VariantID vid) } } -template < size_t block_size > -void FIRST_MIN::runHipVariantBlockDeviceOccGS(VariantID vid) +template < size_t block_size, bool direct > +void FIRST_MIN::runHipVariantRAJA(VariantID vid) { + using reduction_policy = RAJA::hip_reduce; + + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -207,10 +135,10 @@ void FIRST_MIN::runHipVariantBlockDeviceOccGS(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( + RAJA::ReduceMinLoc loc( m_xmin_init, m_initloc); - RAJA::forall< RAJA::hip_exec_occ_calc >( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { FIRST_MIN_BODY_RAJA; }); @@ -236,49 +164,36 @@ void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_HIP ) { - - if (tune_idx == t) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - setBlockSize(block_size); - runHipVariantBlockHost(vid); + if ( vid == Base_HIP ) { - } + if (tune_idx == t) { - t += 1; + setBlockSize(block_size); + runHipVariantBase(vid); - if (tune_idx == t) { + } - setBlockSize(block_size); - runHipVariantBlockHostOccGS(vid); + t += 1; - } + } else if ( vid == RAJA_HIP ) { - t += 1; + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantRAJA(vid); - if ( vid == RAJA_HIP ) { + } - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDevice(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantBlockDeviceOccGS(vid); + t += 1; } - t += 1; + }); - } } }); @@ -300,21 +215,27 @@ void FIRST_MIN::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - if ( vid == Base_HIP ) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - addVariantTuningName(vid, "blkhst_"+std::to_string(block_size)); + if ( vid == Base_HIP ) { - addVariantTuningName(vid, "blkhst_occgs_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_host_helper{}; - } + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - if ( vid == RAJA_HIP ) { + } else if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "blkdev_"+std::to_string(block_size)); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; - addVariantTuningName(vid, "blkdev_occgs_"+std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - } + } + + }); } diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 28a7e760e..27b4ce8e6 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -83,22 +83,14 @@ class FIRST_MIN : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void 
runCudaVariantBlockHost(VariantID vid); - template < size_t block_size > - void runHipVariantBlockHost(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDevice(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockHostOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockHostOccGS(VariantID vid); - template < size_t block_size > - void runCudaVariantBlockDeviceOccGS(VariantID vid); - template < size_t block_size > - void runHipVariantBlockDeviceOccGS(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantBase(VariantID vid); + template < size_t block_size, bool direct > + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, bool direct > + void runHipVariantRAJA(VariantID vid); private: static const size_t default_gpu_block_size = 256; From f7fce996703e38851abe806cbc4782a0ee42dc0d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Jan 2024 10:49:06 -0800 Subject: [PATCH 240/454] Update src/common/GPUUtils.hpp Co-authored-by: Rich Hornung --- src/common/GPUUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 5ee735e4f..f946bdf6a 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -164,7 +164,7 @@ using reducer_helpers = camp::list< block_atomic_helper, block_device_helper >; -} // closing brace for gpu_mapping namespace +} // closing brace for gpu_algorithm namespace namespace gpu_mapping { From 8d4f7653e216255083ab0dbacdb8ece0d7f5c2a3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Jan 2024 13:23:19 -0800 Subject: [PATCH 241/454] Pass Atomic and Direct through template This improves type safety over using bools. 
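For example, here is a minimal standalone sketch of the pattern (the tag type names below are illustrative stand-ins, not the suite's actual gpu_algorithm/gpu_mapping helpers):

#include <cstddef>
#include <iostream>
#include <string>

// Illustrative stand-in tag types; names assumed for this sketch only.
struct block_atomic_helper { static std::string get_name() { return "blkatm"; } };
struct block_device_helper { static std::string get_name() { return "blkdev"; } };
struct direct_helper { static constexpr bool direct = true;  static std::string get_name() { return "direct"; } };
struct occgs_helper  { static constexpr bool direct = false; static std::string get_name() { return "occgs"; } };

// Bool form: both flags are plain 'bool', so run_bools<256, false, true>()
// compiles silently even when the caller meant run_bools<256, true, false>().
template < std::size_t block_size, bool atomic, bool direct >
void run_bools() { }

// Tag form: the mapping parameter must provide a ::direct member, so
// transposing the two tag arguments is a compile-time error, and each tag
// also carries the string used to build tuning names.
template < std::size_t block_size, typename AlgorithmHelper, typename MappingHelper >
void run_tags()
{
  constexpr bool use_occ_calc = !MappingHelper::direct;
  std::cout << AlgorithmHelper::get_name() << "_"
            << MappingHelper::get_name() << "_"
            << block_size
            << (use_occ_calc ? "  [grid size via occupancy calculator]" : "")
            << "\n";
}

int main()
{
  run_tags<256, block_atomic_helper, direct_helper>(); // prints "blkatm_direct_256"
  run_tags<256, block_device_helper, occgs_helper>();  // takes the occupancy-calculator path
  return 0;
}

The same transposition with run_bools goes unnoticed by the compiler; that is the hole the tag types close.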
--- src/algorithm/REDUCE_SUM-Cuda.cpp | 16 ++++++++-------- src/algorithm/REDUCE_SUM-Hip.cpp | 16 ++++++++-------- src/algorithm/REDUCE_SUM.hpp | 8 ++++---- src/basic/PI_REDUCE-Cuda.cpp | 16 ++++++++-------- src/basic/PI_REDUCE-Hip.cpp | 16 ++++++++-------- src/basic/PI_REDUCE.hpp | 8 ++++---- src/basic/REDUCE3_INT-Cuda.cpp | 16 ++++++++-------- src/basic/REDUCE3_INT-Hip.cpp | 16 ++++++++-------- src/basic/REDUCE3_INT.hpp | 8 ++++---- src/basic/REDUCE_STRUCT-Cuda.cpp | 16 ++++++++-------- src/basic/REDUCE_STRUCT-Hip.cpp | 16 ++++++++-------- src/basic/REDUCE_STRUCT.hpp | 8 ++++---- src/basic/TRAP_INT-Cuda.cpp | 16 ++++++++-------- src/basic/TRAP_INT-Hip.cpp | 16 ++++++++-------- src/basic/TRAP_INT.hpp | 8 ++++---- src/lcals/FIRST_MIN-Cuda.cpp | 12 ++++++------ src/lcals/FIRST_MIN-Hip.cpp | 12 ++++++------ src/lcals/FIRST_MIN.hpp | 8 ++++---- src/stream/DOT-Cuda.cpp | 16 ++++++++-------- src/stream/DOT-Hip.cpp | 16 ++++++++-------- src/stream/DOT.hpp | 8 ++++---- 21 files changed, 136 insertions(+), 136 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index f117db199..7af4297bb 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -128,7 +128,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE_SUM::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -143,7 +143,7 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (reduce_sum), block_size, shmem); @@ -176,14 +176,14 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE_SUM::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -251,7 +251,7 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -265,8 +265,8 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 445d29dd4..b26bb08a5 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -155,7 +155,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE_SUM::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -170,7 +170,7 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (reduce_sum), block_size, shmem); @@ -203,14 +203,14 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE_SUM::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -278,7 +278,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -292,8 +292,8 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 58fe8387f..e94400dbb 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -63,13 +63,13 @@ class REDUCE_SUM : public KernelBase void setHipTuningDefinitions(VariantID vid); void runCudaVariantCub(VariantID vid); void runHipVariantRocprim(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 17d11cf41..56872c4e2 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -55,7 +55,7 @@ __global__ void pi_reduce(Real_type dx, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void PI_REDUCE::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -70,7 +70,7 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (pi_reduce), block_size, shmem); @@ -104,14 +104,14 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void PI_REDUCE::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -164,7 +164,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -178,8 +178,8 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index ba461485e..ca06e5cac 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -55,7 +55,7 @@ __global__ void pi_reduce(Real_type dx, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void PI_REDUCE::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -70,7 +70,7 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (pi_reduce), block_size, shmem); @@ -104,14 +104,14 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void PI_REDUCE::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -164,7 +164,7 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -178,8 +178,8 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index fa1bf6816..e2275409a 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -59,13 +59,13 @@ class PI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 5e80bd7cf..711c61576 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -68,7 +68,7 @@ 
__global__ void reduce3int(Int_ptr vec, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE3_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -83,7 +83,7 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (reduce3int), block_size, shmem); @@ -122,14 +122,14 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -186,7 +186,7 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -200,8 +200,8 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index a495444ee..590e51dee 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -68,7 +68,7 @@ __global__ void reduce3int(Int_ptr vec, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE3_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -83,7 +83,7 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (reduce3int), block_size, shmem); @@ -122,14 +122,14 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE3_INT::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -186,7 +186,7 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -200,8 +200,8 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 55821efb7..749d62009 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -74,13 +74,13 @@ class REDUCE3_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 53bf4d1b3..3e925e60a 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -98,7 +98,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -113,7 +113,7 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (reduce_struct), block_size, shmem); @@ -156,14 +156,14 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -228,7 +228,7 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -242,8 +242,8 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index dbebaeddc..2ad47e938 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -98,7 +98,7 @@ __global__ void reduce_struct(Real_ptr x, Real_ptr y, } } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void REDUCE_STRUCT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -113,7 +113,7 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (reduce_struct), block_size, shmem); @@ -156,14 +156,14 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -228,7 +228,7 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -242,8 +242,8 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index beebd8692..9bb83f661 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -89,13 +89,13 @@ class REDUCE_STRUCT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); struct PointsType { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 
578115678..45a79b004 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -74,7 +74,7 @@ __global__ void trapint(Real_type x0, Real_type xp, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void TRAP_INT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -89,7 +89,7 @@ void TRAP_INT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (trapint), block_size, shmem); @@ -125,14 +125,14 @@ void TRAP_INT::runCudaVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void TRAP_INT::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -185,7 +185,7 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -199,8 +199,8 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 83af19e55..28aa7e1aa 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -74,7 +74,7 @@ __global__ void trapint(Real_type x0, Real_type xp, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void TRAP_INT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -89,7 +89,7 @@ void TRAP_INT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (trapint), block_size, shmem); @@ -125,14 +125,14 @@ void TRAP_INT::runHipVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void TRAP_INT::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -185,7 +185,7 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -199,8 +199,8 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 3e3eebf9c..4f705d008 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -71,13 +71,13 @@ class TRAP_INT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index 68f6a43e8..b08fd9b4c 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -60,7 +60,7 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void FIRST_MIN::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -73,7 +73,7 @@ void FIRST_MIN::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? 
std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (first_min), block_size, shmem); @@ -113,12 +113,12 @@ void FIRST_MIN::runCudaVariantBase(VariantID vid) } } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void FIRST_MIN::runCudaVariantRAJA(VariantID vid) { using reduction_policy = RAJA::cuda_reduce; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -172,7 +172,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -184,7 +184,7 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(mapping_helper)>(vid); } diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index 53936fbed..def368366 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -60,7 +60,7 @@ __global__ void first_min(Real_ptr x, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void FIRST_MIN::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -73,7 +73,7 @@ void FIRST_MIN::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (first_min), block_size, shmem); @@ -113,12 +113,12 @@ void FIRST_MIN::runHipVariantBase(VariantID vid) } } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void FIRST_MIN::runHipVariantRAJA(VariantID vid) { using reduction_policy = RAJA::hip_reduce; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -172,7 +172,7 @@ void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -184,7 +184,7 @@ void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(mapping_helper)>(vid); } diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 27b4ce8e6..afe90a554 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -83,13 +83,13 @@ class FIRST_MIN : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 86f94d0ec..b01f0fcd8 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -54,7 +54,7 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void DOT::runCudaVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -69,7 +69,7 @@ void 
DOT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getCudaOccupancyMaxBlocks( (dot), block_size, shmem); @@ -101,14 +101,14 @@ void DOT::runCudaVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void DOT::runCudaVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::cuda_exec_occ_calc>; @@ -161,7 +161,7 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -175,8 +175,8 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runCudaVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 89fac768e..2f2b0dbca 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -54,7 +54,7 @@ __global__ void dot(Real_ptr a, Real_ptr b, } -template < size_t block_size, bool direct > +template < size_t block_size, typename MappingHelper > void DOT::runHipVariantBase(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -69,7 +69,7 @@ void DOT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = direct + const size_t max_grid_size = MappingHelper::direct ? std::numeric_limits::max() : detail::getHipOccupancyMaxBlocks( (dot), block_size, shmem); @@ -101,14 +101,14 @@ void DOT::runHipVariantBase(VariantID vid) } } -template < size_t block_size, bool atomic, bool direct > +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void DOT::runHipVariantRAJA(VariantID vid) { - using reduction_policy = std::conditional_t; - using exec_policy = std::conditional_t, RAJA::hip_exec_occ_calc>; @@ -161,7 +161,7 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantBase(vid); + decltype(mapping_helper)>(vid); } @@ -175,8 +175,8 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) setBlockSize(block_size); runHipVariantRAJA(vid); + decltype(algorithm_helper), + decltype(mapping_helper)>(vid); } diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 04fd10727..05e304973 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -55,13 +55,13 @@ class DOT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, bool direct > + template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, bool atomic, bool direct > + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); private: From ead1842bfbde231eed4f2d1f572d54afd21e9580 Mon Sep 17 00:00:00 2001 
From: Jason Burmark Date: Tue, 2 Jan 2024 13:41:55 -0800 Subject: [PATCH 242/454] Deduplicate occupancy calculator call logic --- src/algorithm/REDUCE_SUM-Cuda.cpp | 6 ++---- src/algorithm/REDUCE_SUM-Hip.cpp | 6 ++---- src/basic/PI_REDUCE-Cuda.cpp | 6 ++---- src/basic/PI_REDUCE-Hip.cpp | 6 ++---- src/basic/REDUCE3_INT-Cuda.cpp | 6 ++---- src/basic/REDUCE3_INT-Hip.cpp | 6 ++---- src/basic/REDUCE_STRUCT-Cuda.cpp | 6 ++---- src/basic/REDUCE_STRUCT-Hip.cpp | 6 ++---- src/basic/TRAP_INT-Cuda.cpp | 6 ++---- src/basic/TRAP_INT-Hip.cpp | 6 ++---- src/common/GPUUtils.hpp | 15 +++++++++++++++ src/lcals/FIRST_MIN-Cuda.cpp | 6 ++---- src/lcals/FIRST_MIN-Hip.cpp | 6 ++---- src/stream/DOT-Cuda.cpp | 6 ++---- src/stream/DOT-Hip.cpp | 6 ++---- 15 files changed, 43 insertions(+), 56 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 7af4297bb..dfbc7a478 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -143,10 +143,8 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (reduce_sum), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index b26bb08a5..44084c6c3 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -170,10 +170,8 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (reduce_sum), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_sum), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 56872c4e2..f79259e32 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -70,10 +70,8 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (pi_reduce), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index ca06e5cac..6bd9385b8 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -70,10 +70,8 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ?
std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (pi_reduce), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (pi_reduce), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 711c61576..be72022f4 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -83,10 +83,8 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (reduce3int), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 590e51dee..6cbee2fa0 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -83,10 +83,8 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (reduce3int), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce3int), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 3e925e60a..9be3d8643 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -113,10 +113,8 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (reduce_struct), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 2ad47e938..88679b598 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -113,10 +113,8 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (reduce_struct), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (reduce_struct), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 45a79b004..f0e0565db 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -89,10 +89,8 @@ void TRAP_INT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? 
std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (trapint), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 28aa7e1aa..ad291ddf4 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -89,10 +89,8 @@ void TRAP_INT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (trapint), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (trapint), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index f946bdf6a..e117993d6 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -188,6 +188,21 @@ using reducer_helpers = camp::list< } // closing brace for rajaperf namespace +// Get the max number of blocks to launch with the given MappingHelper +// for kernel func with the given block_size and shmem. +// This will use the occupancy calculator if MappingHelper::direct is false +#define RAJAPERF_CUDA_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \ + MappingHelper::direct \ + ? std::numeric_limits::max() \ + : detail::getCudaOccupancyMaxBlocks( \ + (func), (block_size), (shmem)); +/// +#define RAJAPERF_HIP_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \ + MappingHelper::direct \ + ? std::numeric_limits::max() \ + : detail::getHipOccupancyMaxBlocks( \ + (func), (block_size), (shmem)); + // allocate pointer of pointer_type with length // device_ptr_name gets memory in the reduction data space for the current variant // host_ptr_name is set to either device_ptr_name if the reduction data space is diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index b08fd9b4c..7a70c8991 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -73,10 +73,8 @@ void FIRST_MIN::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (first_min), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index def368366..ef577aa72 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -73,10 +73,8 @@ void FIRST_MIN::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { constexpr size_t shmem = sizeof(MyMinLoc)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? 
std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (first_min), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (first_min), block_size, shmem); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index b01f0fcd8..cce51a90c 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -69,10 +69,8 @@ void DOT::runCudaVariantBase(VariantID vid) RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getCudaOccupancyMaxBlocks( - (dot), block_size, shmem); + const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, (dot), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 2f2b0dbca..000331381 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -69,10 +69,8 @@ void DOT::runHipVariantBase(VariantID vid) RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; - const size_t max_grid_size = MappingHelper::direct - ? std::numeric_limits::max() - : detail::getHipOccupancyMaxBlocks( - (dot), block_size, shmem); + const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, (dot), block_size, shmem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From 197e5387f3017b7dc573a060358edb7d0f151aaa Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 2 Jan 2024 15:17:40 -0800 Subject: [PATCH 243/454] Update copyright year to 2024 --- .gitlab/custom-jobs-and-variables.yml | 2 +- .gitlab/jobs/corona.yml | 2 +- .gitlab/jobs/lassen.yml | 2 +- .gitlab/jobs/poodle.yml | 2 +- .gitlab/jobs/ruby.yml | 2 +- .gitlab/jobs/tioga.yml | 2 +- .gitlab/subscribed-pipelines.yml | 2 +- CMakeLists.txt | 2 +- Dockerfile | 2 +- LICENSE | 2 +- README.md | 2 +- RELEASE | 2 +- TODO/WIP-COUPLE.cpp | 2 +- TODO/WIP-COUPLE.hpp | 2 +- docs/CMakeLists.txt | 2 +- docs/conf.py | 6 +++--- docs/index.rst | 2 +- docs/sphinx/dev_guide/branch_development.rst | 2 +- docs/sphinx/dev_guide/build_configurations.rst | 2 +- docs/sphinx/dev_guide/ci.rst | 2 +- docs/sphinx/dev_guide/contributing.rst | 2 +- docs/sphinx/dev_guide/index.rst | 2 +- docs/sphinx/dev_guide/kernel_class.rst | 2 +- docs/sphinx/dev_guide/kernel_class_impl.rst | 2 +- docs/sphinx/dev_guide/release_process.rst | 2 +- docs/sphinx/dev_guide/structure.rst | 2 +- docs/sphinx/rajaperf_license.rst | 4 ++-- docs/sphinx/user_guide/CMakeLists.txt | 2 +- docs/sphinx/user_guide/build.rst | 2 +- docs/sphinx/user_guide/index.rst | 2 +- docs/sphinx/user_guide/output.rst | 2 +- docs/sphinx/user_guide/run.rst | 2 +- scripts/gitlab/build_and_test.sh | 2 +- scripts/install_llvm.sh | 2 +- scripts/lc-builds/blueos_clang.sh | 2 +- scripts/lc-builds/blueos_clang_omptarget.sh | 2 +- scripts/lc-builds/blueos_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_nvcc_clang_caliper.sh | 2 +- scripts/lc-builds/blueos_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_nvcc_xl.sh | 2 +- scripts/lc-builds/blueos_pgi.sh | 2 +- scripts/lc-builds/blueos_spectrum_nvcc_clang.sh | 2 +- scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh | 2 +- scripts/lc-builds/blueos_xl.sh | 2 +- 
scripts/lc-builds/blueos_xl_omptarget.sh | 2 +- scripts/lc-builds/corona_sycl.sh | 2 +- scripts/lc-builds/toss4_amdclang.sh | 2 +- scripts/lc-builds/toss4_amdclang_asan.sh | 2 +- scripts/lc-builds/toss4_cce_hip.sh | 2 +- scripts/lc-builds/toss4_clang.sh | 2 +- scripts/lc-builds/toss4_clang_caliper.sh | 2 +- scripts/lc-builds/toss4_cray-mpich_amdclang.sh | 2 +- scripts/lc-builds/toss4_gcc.sh | 2 +- scripts/lc-builds/toss4_gcc_caliper.sh | 2 +- scripts/lc-builds/toss4_hipcc.sh | 2 +- scripts/lc-builds/toss4_icpc-classic.sh | 2 +- scripts/lc-builds/toss4_icpc.sh | 2 +- scripts/lc-builds/toss4_icpx.sh | 2 +- scripts/lc-builds/toss4_mvapich2_icpx.sh | 2 +- scripts/make_release_tarball.sh | 2 +- scripts/travis_build_and_test.sh | 2 +- scripts/ubuntu-builds/ubuntu_clang.sh | 2 +- scripts/ubuntu-builds/ubuntu_gcc.sh | 2 +- scripts/update_copyright.sh | 13 +++++++------ src/CMakeLists.txt | 2 +- src/RAJAPerfSuiteDriver.cpp | 2 +- src/algorithm/CMakeLists.txt | 2 +- src/algorithm/MEMCPY-Cuda.cpp | 2 +- src/algorithm/MEMCPY-Hip.cpp | 2 +- src/algorithm/MEMCPY-OMP.cpp | 2 +- src/algorithm/MEMCPY-OMPTarget.cpp | 2 +- src/algorithm/MEMCPY-Seq.cpp | 2 +- src/algorithm/MEMCPY.cpp | 2 +- src/algorithm/MEMCPY.hpp | 2 +- src/algorithm/MEMSET-Cuda.cpp | 2 +- src/algorithm/MEMSET-Hip.cpp | 2 +- src/algorithm/MEMSET-OMP.cpp | 2 +- src/algorithm/MEMSET-OMPTarget.cpp | 2 +- src/algorithm/MEMSET-Seq.cpp | 2 +- src/algorithm/MEMSET.cpp | 2 +- src/algorithm/MEMSET.hpp | 2 +- src/algorithm/REDUCE_SUM-Cuda.cpp | 2 +- src/algorithm/REDUCE_SUM-Hip.cpp | 2 +- src/algorithm/REDUCE_SUM-OMP.cpp | 2 +- src/algorithm/REDUCE_SUM-OMPTarget.cpp | 2 +- src/algorithm/REDUCE_SUM-Seq.cpp | 2 +- src/algorithm/REDUCE_SUM.cpp | 2 +- src/algorithm/REDUCE_SUM.hpp | 2 +- src/algorithm/SORT-Cuda.cpp | 2 +- src/algorithm/SORT-Hip.cpp | 2 +- src/algorithm/SORT-OMP.cpp | 2 +- src/algorithm/SORT-Seq.cpp | 2 +- src/algorithm/SORT.cpp | 2 +- src/algorithm/SORT.hpp | 2 +- src/algorithm/SORTPAIRS-Cuda.cpp | 2 +- src/algorithm/SORTPAIRS-Hip.cpp | 2 +- src/algorithm/SORTPAIRS-OMP.cpp | 2 +- src/algorithm/SORTPAIRS-Seq.cpp | 2 +- src/algorithm/SORTPAIRS.cpp | 2 +- src/algorithm/SORTPAIRS.hpp | 2 +- src/apps/AppsData.cpp | 2 +- src/apps/AppsData.hpp | 2 +- src/apps/CMakeLists.txt | 2 +- src/apps/CONVECTION3DPA-Cuda.cpp | 2 +- src/apps/CONVECTION3DPA-Hip.cpp | 2 +- src/apps/CONVECTION3DPA-OMP.cpp | 2 +- src/apps/CONVECTION3DPA-OMPTarget.cpp | 2 +- src/apps/CONVECTION3DPA-Seq.cpp | 2 +- src/apps/CONVECTION3DPA.cpp | 2 +- src/apps/CONVECTION3DPA.hpp | 2 +- src/apps/DEL_DOT_VEC_2D-Cuda.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMP.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp | 2 +- src/apps/DEL_DOT_VEC_2D-Seq.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.hpp | 2 +- src/apps/DIFFUSION3DPA-Cuda.cpp | 2 +- src/apps/DIFFUSION3DPA-Hip.cpp | 2 +- src/apps/DIFFUSION3DPA-OMP.cpp | 2 +- src/apps/DIFFUSION3DPA-OMPTarget.cpp | 2 +- src/apps/DIFFUSION3DPA-Seq.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/DIFFUSION3DPA.hpp | 2 +- src/apps/EDGE3D-Cuda.cpp | 2 +- src/apps/EDGE3D-Hip.cpp | 2 +- src/apps/EDGE3D-OMP.cpp | 2 +- src/apps/EDGE3D-OMPTarget.cpp | 2 +- src/apps/EDGE3D-Seq.cpp | 2 +- src/apps/EDGE3D.cpp | 2 +- src/apps/EDGE3D.hpp | 2 +- src/apps/ENERGY-Cuda.cpp | 2 +- src/apps/ENERGY-Hip.cpp | 2 +- src/apps/ENERGY-OMP.cpp | 2 +- src/apps/ENERGY-OMPTarget.cpp | 2 +- src/apps/ENERGY-Seq.cpp | 2 +- src/apps/ENERGY.cpp | 2 +- src/apps/ENERGY.hpp | 2 +- src/apps/FEM_MACROS.hpp | 2 +- 
src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR.cpp | 2 +- src/apps/FIR.hpp | 2 +- src/apps/LTIMES-Cuda.cpp | 2 +- src/apps/LTIMES-Hip.cpp | 2 +- src/apps/LTIMES-OMP.cpp | 2 +- src/apps/LTIMES-OMPTarget.cpp | 2 +- src/apps/LTIMES-Seq.cpp | 2 +- src/apps/LTIMES.cpp | 2 +- src/apps/LTIMES.hpp | 2 +- src/apps/LTIMES_NOVIEW-Cuda.cpp | 2 +- src/apps/LTIMES_NOVIEW-Hip.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMP.cpp | 2 +- src/apps/LTIMES_NOVIEW-OMPTarget.cpp | 2 +- src/apps/LTIMES_NOVIEW-Seq.cpp | 2 +- src/apps/LTIMES_NOVIEW.cpp | 2 +- src/apps/LTIMES_NOVIEW.hpp | 2 +- src/apps/MASS3DEA-Cuda.cpp | 2 +- src/apps/MASS3DEA-Hip.cpp | 2 +- src/apps/MASS3DEA-OMP.cpp | 2 +- src/apps/MASS3DEA-OMPTarget.cpp | 2 +- src/apps/MASS3DEA-Seq.cpp | 2 +- src/apps/MASS3DEA.cpp | 2 +- src/apps/MASS3DEA.hpp | 2 +- src/apps/MASS3DPA-Cuda.cpp | 2 +- src/apps/MASS3DPA-Hip.cpp | 2 +- src/apps/MASS3DPA-OMP.cpp | 2 +- src/apps/MASS3DPA-OMPTarget.cpp | 2 +- src/apps/MASS3DPA-Seq.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- src/apps/MASS3DPA.hpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Hip.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.hpp | 2 +- src/apps/PRESSURE-Cuda.cpp | 2 +- src/apps/PRESSURE-Hip.cpp | 2 +- src/apps/PRESSURE-OMP.cpp | 2 +- src/apps/PRESSURE-OMPTarget.cpp | 2 +- src/apps/PRESSURE-Seq.cpp | 2 +- src/apps/PRESSURE.cpp | 2 +- src/apps/PRESSURE.hpp | 2 +- src/apps/VOL3D-Cuda.cpp | 2 +- src/apps/VOL3D-Hip.cpp | 2 +- src/apps/VOL3D-OMP.cpp | 2 +- src/apps/VOL3D-OMPTarget.cpp | 2 +- src/apps/VOL3D-Seq.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/apps/VOL3D.hpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D.hpp | 2 +- src/apps/mixed_fem_helper.hpp | 2 +- src/basic-kokkos/CMakeLists.txt | 2 +- src/basic-kokkos/DAXPY-Kokkos.cpp | 2 +- src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp | 2 +- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 2 +- src/basic-kokkos/INIT3-Kokkos.cpp | 2 +- src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 2 +- src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp | 2 +- src/basic-kokkos/MULADDSUB-Kokkos.cpp | 2 +- src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 2 +- src/basic-kokkos/PI_ATOMIC-Kokkos.cpp | 2 +- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 2 +- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 2 +- src/basic/ARRAY_OF_PTRS-Cuda.cpp | 2 +- src/basic/ARRAY_OF_PTRS-Hip.cpp | 2 +- src/basic/ARRAY_OF_PTRS-OMP.cpp | 2 +- src/basic/ARRAY_OF_PTRS-OMPTarget.cpp | 2 +- src/basic/ARRAY_OF_PTRS-Seq.cpp | 2 +- src/basic/ARRAY_OF_PTRS.cpp | 2 +- src/basic/ARRAY_OF_PTRS.hpp | 2 +- src/basic/CMakeLists.txt | 2 +- src/basic/COPY8-Cuda.cpp | 2 +- src/basic/COPY8-Hip.cpp | 2 +- src/basic/COPY8-OMP.cpp | 2 +- src/basic/COPY8-OMPTarget.cpp | 2 +- src/basic/COPY8-Seq.cpp | 2 +- src/basic/COPY8.cpp | 2 +- src/basic/COPY8.hpp | 2 +- src/basic/DAXPY-Cuda.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY-OMP.cpp | 2 +- src/basic/DAXPY-OMPTarget.cpp | 2 +- src/basic/DAXPY-Seq.cpp | 2 +- src/basic/DAXPY.cpp | 2 +- src/basic/DAXPY.hpp | 2 +- 
src/basic/DAXPY_ATOMIC-Cuda.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMP.cpp | 2 +- src/basic/DAXPY_ATOMIC-OMPTarget.cpp | 2 +- src/basic/DAXPY_ATOMIC-Seq.cpp | 2 +- src/basic/DAXPY_ATOMIC.cpp | 2 +- src/basic/DAXPY_ATOMIC.hpp | 2 +- src/basic/IF_QUAD-Cuda.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/IF_QUAD-OMP.cpp | 2 +- src/basic/IF_QUAD-OMPTarget.cpp | 2 +- src/basic/IF_QUAD-Seq.cpp | 2 +- src/basic/IF_QUAD.cpp | 2 +- src/basic/IF_QUAD.hpp | 2 +- src/basic/INIT3-Cuda.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT3-OMP.cpp | 2 +- src/basic/INIT3-OMPTarget.cpp | 2 +- src/basic/INIT3-Seq.cpp | 2 +- src/basic/INIT3.cpp | 2 +- src/basic/INIT3.hpp | 2 +- src/basic/INIT_VIEW1D-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-OMP.cpp | 2 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D-Seq.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D.hpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMP.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Seq.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 2 +- src/basic/MULADDSUB-Cuda.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/MULADDSUB-OMP.cpp | 2 +- src/basic/MULADDSUB-OMPTarget.cpp | 2 +- src/basic/MULADDSUB-Seq.cpp | 2 +- src/basic/MULADDSUB.cpp | 2 +- src/basic/MULADDSUB.hpp | 2 +- src/basic/NESTED_INIT-Cuda.cpp | 2 +- src/basic/NESTED_INIT-Hip.cpp | 2 +- src/basic/NESTED_INIT-OMP.cpp | 2 +- src/basic/NESTED_INIT-OMPTarget.cpp | 2 +- src/basic/NESTED_INIT-Seq.cpp | 2 +- src/basic/NESTED_INIT.cpp | 2 +- src/basic/NESTED_INIT.hpp | 2 +- src/basic/PI_ATOMIC-Cuda.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/basic/PI_ATOMIC-OMP.cpp | 2 +- src/basic/PI_ATOMIC-OMPTarget.cpp | 2 +- src/basic/PI_ATOMIC-Seq.cpp | 2 +- src/basic/PI_ATOMIC.cpp | 2 +- src/basic/PI_ATOMIC.hpp | 2 +- src/basic/PI_REDUCE-Cuda.cpp | 2 +- src/basic/PI_REDUCE-Hip.cpp | 2 +- src/basic/PI_REDUCE-OMP.cpp | 2 +- src/basic/PI_REDUCE-OMPTarget.cpp | 2 +- src/basic/PI_REDUCE-Seq.cpp | 2 +- src/basic/PI_REDUCE.cpp | 2 +- src/basic/PI_REDUCE.hpp | 2 +- src/basic/REDUCE3_INT-Cuda.cpp | 2 +- src/basic/REDUCE3_INT-Hip.cpp | 2 +- src/basic/REDUCE3_INT-OMP.cpp | 2 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 2 +- src/basic/REDUCE3_INT-Seq.cpp | 2 +- src/basic/REDUCE3_INT.cpp | 2 +- src/basic/REDUCE3_INT.hpp | 2 +- src/basic/REDUCE_STRUCT-Cuda.cpp | 2 +- src/basic/REDUCE_STRUCT-Hip.cpp | 2 +- src/basic/REDUCE_STRUCT-OMP.cpp | 2 +- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 2 +- src/basic/REDUCE_STRUCT-Seq.cpp | 2 +- src/basic/REDUCE_STRUCT.cpp | 2 +- src/basic/REDUCE_STRUCT.hpp | 2 +- src/basic/TRAP_INT-Cuda.cpp | 2 +- src/basic/TRAP_INT-Hip.cpp | 2 +- src/basic/TRAP_INT-OMP.cpp | 2 +- src/basic/TRAP_INT-OMPTarget.cpp | 2 +- src/basic/TRAP_INT-Seq.cpp | 2 +- src/basic/TRAP_INT.cpp | 2 +- src/basic/TRAP_INT.hpp | 2 +- src/comm/CMakeLists.txt | 2 +- src/comm/HALO_EXCHANGE-Cuda.cpp | 2 +- src/comm/HALO_EXCHANGE-Hip.cpp | 2 +- src/comm/HALO_EXCHANGE-OMP.cpp | 2 +- src/comm/HALO_EXCHANGE-OMPTarget.cpp | 2 +- src/comm/HALO_EXCHANGE-Seq.cpp | 2 +- src/comm/HALO_EXCHANGE.cpp | 2 +- src/comm/HALO_EXCHANGE.hpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-Hip.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-OMP.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED-Seq.cpp | 2 
+- src/comm/HALO_EXCHANGE_FUSED.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED.hpp | 2 +- src/comm/HALO_PACKING-Cuda.cpp | 2 +- src/comm/HALO_PACKING-Hip.cpp | 2 +- src/comm/HALO_PACKING-OMP.cpp | 2 +- src/comm/HALO_PACKING-OMPTarget.cpp | 2 +- src/comm/HALO_PACKING-Seq.cpp | 2 +- src/comm/HALO_PACKING.cpp | 2 +- src/comm/HALO_PACKING.hpp | 2 +- src/comm/HALO_PACKING_FUSED-Cuda.cpp | 2 +- src/comm/HALO_PACKING_FUSED-Hip.cpp | 2 +- src/comm/HALO_PACKING_FUSED-OMP.cpp | 2 +- src/comm/HALO_PACKING_FUSED-OMPTarget.cpp | 2 +- src/comm/HALO_PACKING_FUSED-Seq.cpp | 2 +- src/comm/HALO_PACKING_FUSED.cpp | 2 +- src/comm/HALO_PACKING_FUSED.hpp | 2 +- src/comm/HALO_SENDRECV-Cuda.cpp | 2 +- src/comm/HALO_SENDRECV-Hip.cpp | 2 +- src/comm/HALO_SENDRECV-OMP.cpp | 2 +- src/comm/HALO_SENDRECV-OMPTarget.cpp | 2 +- src/comm/HALO_SENDRECV-Seq.cpp | 2 +- src/comm/HALO_SENDRECV.cpp | 2 +- src/comm/HALO_SENDRECV.hpp | 2 +- src/comm/HALO_base.cpp | 2 +- src/comm/HALO_base.hpp | 2 +- src/common/CMakeLists.txt | 2 +- src/common/CudaDataUtils.hpp | 2 +- src/common/DataUtils.cpp | 2 +- src/common/DataUtils.hpp | 2 +- src/common/Executor.cpp | 2 +- src/common/Executor.hpp | 2 +- src/common/GPUUtils.hpp | 2 +- src/common/HipDataUtils.hpp | 2 +- src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 2 +- src/common/KokkosViewUtils.hpp | 2 +- src/common/OpenMPTargetDataUtils.hpp | 2 +- src/common/OutputUtils.cpp | 2 +- src/common/OutputUtils.hpp | 2 +- src/common/RAJAPerfSuite.cpp | 2 +- src/common/RAJAPerfSuite.hpp | 2 +- src/common/RPTypes.hpp | 2 +- src/common/RunParams.cpp | 2 +- src/common/RunParams.hpp | 2 +- src/lcals-kokkos/CMakeLists.txt | 2 +- src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp | 2 +- src/lcals-kokkos/EOS-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_MIN-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_SUM-Kokkos.cpp | 2 +- src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp | 2 +- src/lcals-kokkos/HYDRO_1D-Kokkos.cpp | 2 +- src/lcals-kokkos/HYDRO_2D-Kokkos.cpp | 2 +- src/lcals-kokkos/INT_PREDICT-Kokkos.cpp | 2 +- src/lcals-kokkos/PLANCKIAN-Kokkos.cpp | 2 +- src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp | 2 +- src/lcals/CMakeLists.txt | 2 +- src/lcals/DIFF_PREDICT-Cuda.cpp | 2 +- src/lcals/DIFF_PREDICT-Hip.cpp | 2 +- src/lcals/DIFF_PREDICT-OMP.cpp | 2 +- src/lcals/DIFF_PREDICT-OMPTarget.cpp | 2 +- src/lcals/DIFF_PREDICT-Seq.cpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 2 +- src/lcals/DIFF_PREDICT.hpp | 2 +- src/lcals/EOS-Cuda.cpp | 2 +- src/lcals/EOS-Hip.cpp | 2 +- src/lcals/EOS-OMP.cpp | 2 +- src/lcals/EOS-OMPTarget.cpp | 2 +- src/lcals/EOS-Seq.cpp | 2 +- src/lcals/EOS.cpp | 2 +- src/lcals/EOS.hpp | 2 +- src/lcals/FIRST_DIFF-Cuda.cpp | 2 +- src/lcals/FIRST_DIFF-Hip.cpp | 2 +- src/lcals/FIRST_DIFF-OMP.cpp | 2 +- src/lcals/FIRST_DIFF-OMPTarget.cpp | 2 +- src/lcals/FIRST_DIFF-Seq.cpp | 2 +- src/lcals/FIRST_DIFF.cpp | 2 +- src/lcals/FIRST_DIFF.hpp | 2 +- src/lcals/FIRST_MIN-Cuda.cpp | 2 +- src/lcals/FIRST_MIN-Hip.cpp | 2 +- src/lcals/FIRST_MIN-OMP.cpp | 2 +- src/lcals/FIRST_MIN-OMPTarget.cpp | 2 +- src/lcals/FIRST_MIN-Seq.cpp | 2 +- src/lcals/FIRST_MIN.cpp | 2 +- src/lcals/FIRST_MIN.hpp | 2 +- src/lcals/FIRST_SUM-Cuda.cpp | 2 +- src/lcals/FIRST_SUM-Hip.cpp | 2 +- src/lcals/FIRST_SUM-OMP.cpp | 2 +- src/lcals/FIRST_SUM-OMPTarget.cpp | 2 +- src/lcals/FIRST_SUM-Seq.cpp | 2 +- src/lcals/FIRST_SUM.cpp | 2 +- src/lcals/FIRST_SUM.hpp | 2 +- src/lcals/GEN_LIN_RECUR-Cuda.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Hip.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMP.cpp | 2 +- src/lcals/GEN_LIN_RECUR-OMPTarget.cpp | 2 +- 
src/lcals/GEN_LIN_RECUR-Seq.cpp | 2 +- src/lcals/GEN_LIN_RECUR.cpp | 2 +- src/lcals/GEN_LIN_RECUR.hpp | 2 +- src/lcals/HYDRO_1D-Cuda.cpp | 2 +- src/lcals/HYDRO_1D-Hip.cpp | 2 +- src/lcals/HYDRO_1D-OMP.cpp | 2 +- src/lcals/HYDRO_1D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_1D-Seq.cpp | 2 +- src/lcals/HYDRO_1D.cpp | 2 +- src/lcals/HYDRO_1D.hpp | 2 +- src/lcals/HYDRO_2D-Cuda.cpp | 2 +- src/lcals/HYDRO_2D-Hip.cpp | 2 +- src/lcals/HYDRO_2D-OMP.cpp | 2 +- src/lcals/HYDRO_2D-OMPTarget.cpp | 2 +- src/lcals/HYDRO_2D-Seq.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 2 +- src/lcals/HYDRO_2D.hpp | 2 +- src/lcals/INT_PREDICT-Cuda.cpp | 2 +- src/lcals/INT_PREDICT-Hip.cpp | 2 +- src/lcals/INT_PREDICT-OMP.cpp | 2 +- src/lcals/INT_PREDICT-OMPTarget.cpp | 2 +- src/lcals/INT_PREDICT-Seq.cpp | 2 +- src/lcals/INT_PREDICT.cpp | 2 +- src/lcals/INT_PREDICT.hpp | 2 +- src/lcals/PLANCKIAN-Cuda.cpp | 2 +- src/lcals/PLANCKIAN-Hip.cpp | 2 +- src/lcals/PLANCKIAN-OMP.cpp | 2 +- src/lcals/PLANCKIAN-OMPTarget.cpp | 2 +- src/lcals/PLANCKIAN-Seq.cpp | 2 +- src/lcals/PLANCKIAN.cpp | 2 +- src/lcals/PLANCKIAN.hpp | 2 +- src/lcals/TRIDIAG_ELIM-Cuda.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Hip.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMP.cpp | 2 +- src/lcals/TRIDIAG_ELIM-OMPTarget.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Seq.cpp | 2 +- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/lcals/TRIDIAG_ELIM.hpp | 2 +- src/polybench/CMakeLists.txt | 2 +- src/polybench/POLYBENCH_2MM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_2MM-Hip.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_2MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_2MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_2MM.hpp | 2 +- src/polybench/POLYBENCH_3MM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_3MM-Hip.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMP.cpp | 2 +- src/polybench/POLYBENCH_3MM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_3MM-Seq.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.hpp | 2 +- src/polybench/POLYBENCH_ADI-Cuda.cpp | 2 +- src/polybench/POLYBENCH_ADI-Hip.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMP.cpp | 2 +- src/polybench/POLYBENCH_ADI-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ADI-Seq.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ADI.hpp | 2 +- src/polybench/POLYBENCH_ATAX-Cuda.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Hip.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMP.cpp | 2 +- src/polybench/POLYBENCH_ATAX-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_ATAX-Seq.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_ATAX.hpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Hip.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.hpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp | 2 +- .../POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 2 +- src/polybench/POLYBENCH_GEMM-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Hip.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMM-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMM-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 2 +- 
src/polybench/POLYBENCH_GEMM.hpp | 2 +- src/polybench/POLYBENCH_GEMVER-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Hip.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMP.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GEMVER-Seq.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.hpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Cuda.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Hip.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMP.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV-Seq.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.hpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Hip.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMP.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D-Seq.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_1D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.hpp | 2 +- src/polybench/POLYBENCH_MVT-Cuda.cpp | 2 +- src/polybench/POLYBENCH_MVT-Hip.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMP.cpp | 2 +- src/polybench/POLYBENCH_MVT-OMPTarget.cpp | 2 +- src/polybench/POLYBENCH_MVT-Seq.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 2 +- src/polybench/POLYBENCH_MVT.hpp | 2 +- src/rajaperf_config.hpp.in | 2 +- src/stream-kokkos/ADD-Kokkos.cpp | 2 +- src/stream-kokkos/CMakeLists.txt | 2 +- src/stream-kokkos/COPY-Kokkos.cpp | 2 +- src/stream-kokkos/DOT-Kokkos.cpp | 2 +- src/stream-kokkos/MUL-Kokkos.cpp | 2 +- src/stream-kokkos/TRIAD-Kokkos.cpp | 2 +- src/stream/ADD-Cuda.cpp | 2 +- src/stream/ADD-Hip.cpp | 2 +- src/stream/ADD-OMP.cpp | 2 +- src/stream/ADD-OMPTarget.cpp | 2 +- src/stream/ADD-Seq.cpp | 2 +- src/stream/ADD.cpp | 2 +- src/stream/ADD.hpp | 2 +- src/stream/CMakeLists.txt | 2 +- src/stream/COPY-Cuda.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/COPY-OMP.cpp | 2 +- src/stream/COPY-OMPTarget.cpp | 2 +- src/stream/COPY-Seq.cpp | 2 +- src/stream/COPY.cpp | 2 +- src/stream/COPY.hpp | 2 +- src/stream/DOT-Cuda.cpp | 2 +- src/stream/DOT-Hip.cpp | 2 +- src/stream/DOT-OMP.cpp | 2 +- src/stream/DOT-OMPTarget.cpp | 2 +- src/stream/DOT-Seq.cpp | 2 +- src/stream/DOT.cpp | 2 +- src/stream/DOT.hpp | 2 +- src/stream/MUL-Cuda.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/MUL-OMP.cpp | 2 +- src/stream/MUL-OMPTarget.cpp | 2 +- src/stream/MUL-Seq.cpp | 2 +- src/stream/MUL.cpp | 2 +- src/stream/MUL.hpp | 2 +- src/stream/TRIAD-Cuda.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- src/stream/TRIAD-OMP.cpp | 2 +- src/stream/TRIAD-OMPTarget.cpp | 2 +- src/stream/TRIAD-Seq.cpp | 2 +- src/stream/TRIAD.cpp | 2 +- src/stream/TRIAD.hpp | 2 +- test/CMakeLists.txt | 2 +- test/test-raja-perf-suite.cpp | 2 +- 606 files changed, 615 insertions(+), 614 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml 
b/.gitlab/custom-jobs-and-variables.yml index 9210db922..d4478afd7 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index dd5df056f..dc7de5077 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index 2cc28e193..112972606 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index adcf25494..484b385a4 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 45a0a62f8..e07f65dff 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 504d983bc..9cd06caa1 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 38b481c97..7e60a05e9 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/CMakeLists.txt b/CMakeLists.txt index 3106587df..4145d0c37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/Dockerfile b/Dockerfile index 0da428cc7..26b9453b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/LICENSE b/LICENSE index 039a20b01..27c1ef431 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2017-2023, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index bf2eee850..04aeea048 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [comment]: # (#################################################################) -[comment]: # (Copyright 2017-23, Lawrence Livermore National Security, LLC) +[comment]: # (Copyright 2017-24, Lawrence Livermore National Security, LLC) [comment]: # (and RAJA Performance Suite project contributors.) [comment]: # (See the RAJAPerf/LICENSE file for details.) [comment]: # diff --git a/RELEASE b/RELEASE index 4b8dcac50..61fc02251 100644 --- a/RELEASE +++ b/RELEASE @@ -2,7 +2,7 @@ RAJA Performance Suite -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. All rights reserved. See details in the RAJAPerf/LICENSE file. diff --git a/TODO/WIP-COUPLE.cpp b/TODO/WIP-COUPLE.cpp index 5769b04e6..6f0feeed8 100644 --- a/TODO/WIP-COUPLE.cpp +++ b/TODO/WIP-COUPLE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/TODO/WIP-COUPLE.hpp b/TODO/WIP-COUPLE.hpp index 33faa85cc..bf29503f3 100644 --- a/TODO/WIP-COUPLE.hpp +++ b/TODO/WIP-COUPLE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index ac86f5bcc..9b4df01d6 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/docs/conf.py b/docs/conf.py index 6673fa10f..abbac1ada 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,16 +79,16 @@ # General information about the project. project = u'RAJAPerf' -copyright = u'2017-2023, Lawrence Livermore National Security, LLNS' +copyright = u'2017-2024, Lawrence Livermore National Security, LLNS' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'2022.10' +version = u'2023.06' # The full version, including alpha/beta/rc tags. -release = u'2022.10.0' +release = u'2023.06.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index 12ec445a5..438c89f82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index 8d2e04437..318076584 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index 7ce70decf..4972d85a2 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 231b00ee3..1fdd1a55f 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 74f86d3cd..bdac32a30 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index c2c976ff3..d04aa25ab 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. 
## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class.rst b/docs/sphinx/dev_guide/kernel_class.rst index 5d544dd68..015b7592f 100644 --- a/docs/sphinx/dev_guide/kernel_class.rst +++ b/docs/sphinx/dev_guide/kernel_class.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/kernel_class_impl.rst b/docs/sphinx/dev_guide/kernel_class_impl.rst index 38d8274a0..05271dd8e 100644 --- a/docs/sphinx/dev_guide/kernel_class_impl.rst +++ b/docs/sphinx/dev_guide/kernel_class_impl.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 8b1942758..0542ec08e 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/dev_guide/structure.rst b/docs/sphinx/dev_guide/structure.rst index 5c25ef2a2..bc11f9941 100644 --- a/docs/sphinx/dev_guide/structure.rst +++ b/docs/sphinx/dev_guide/structure.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/rajaperf_license.rst b/docs/sphinx/rajaperf_license.rst index a7985861f..5233fff7b 100644 --- a/docs/sphinx/rajaperf_license.rst +++ b/docs/sphinx/rajaperf_license.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## @@ -12,7 +12,7 @@ RAJA Performance Suite Copyright and License Information ========================================================== -Copyright (c) 2017-23, Lawrence Livermore National Security, LLC. +Copyright (c) 2017-24, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. diff --git a/docs/sphinx/user_guide/CMakeLists.txt b/docs/sphinx/user_guide/CMakeLists.txt index 912f38a7a..e084390e8 100644 --- a/docs/sphinx/user_guide/CMakeLists.txt +++ b/docs/sphinx/user_guide/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA erformance Suite project contributors. # See the RAJAPerf/LICENSE file for details. 
# diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index 082fb9f4e..db8f0e663 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 0bd7d5570..33475a6b9 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst index 3d0879278..2af530e9a 100644 --- a/docs/sphinx/user_guide/output.rst +++ b/docs/sphinx/user_guide/output.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst index 19a8917bd..083263d61 100644 --- a/docs/sphinx/user_guide/run.rst +++ b/docs/sphinx/user_guide/run.rst @@ -1,5 +1,5 @@ .. ## -.. ## Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +.. ## Copyright (c) 2017-24, Lawrence Livermore National Security, LLC .. ## and RAJA Performance Suite project contributors. .. ## See the RAJAPerf/LICENSE file for details. .. ## diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 890ef9d2e..2f77a6bd0 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -7,7 +7,7 @@ then fi ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC and RAJA +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC and RAJA # project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh index f1a16dcfa..b264f59de 100755 --- a/scripts/install_llvm.sh +++ b/scripts/install_llvm.sh @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/lc-builds/blueos_clang.sh b/scripts/lc-builds/blueos_clang.sh index 011ac9522..15fde9bf1 100755 --- a/scripts/lc-builds/blueos_clang.sh +++ b/scripts/lc-builds/blueos_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh index 3a7d1f429..67ffdcf91 100755 --- a/scripts/lc-builds/blueos_clang_omptarget.sh +++ b/scripts/lc-builds/blueos_clang_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_gcc.sh b/scripts/lc-builds/blueos_gcc.sh index 9f94fda0c..fe71ddf77 100755 --- a/scripts/lc-builds/blueos_gcc.sh +++ b/scripts/lc-builds/blueos_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 526c4c763..59b74d923 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh index de2fb5548..c13a40c25 100755 --- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh +++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_gcc.sh b/scripts/lc-builds/blueos_nvcc_gcc.sh index f194e8121..d1e24fdac 100755 --- a/scripts/lc-builds/blueos_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_nvcc_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_nvcc_xl.sh b/scripts/lc-builds/blueos_nvcc_xl.sh index 6d30da64c..1950dcadc 100755 --- a/scripts/lc-builds/blueos_nvcc_xl.sh +++ b/scripts/lc-builds/blueos_nvcc_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_pgi.sh b/scripts/lc-builds/blueos_pgi.sh index 7ccfc3bb5..09e192fa5 100755 --- a/scripts/lc-builds/blueos_pgi.sh +++ b/scripts/lc-builds/blueos_pgi.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh index c5fa74cb2..d8a718229 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh index 4e1a68318..dd71dcc62 100755 --- a/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh +++ b/scripts/lc-builds/blueos_spectrum_nvcc_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_xl.sh b/scripts/lc-builds/blueos_xl.sh index 971015623..9729db57e 100755 --- a/scripts/lc-builds/blueos_xl.sh +++ b/scripts/lc-builds/blueos_xl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh index 809c2fd5c..559c59900 100755 --- a/scripts/lc-builds/blueos_xl_omptarget.sh +++ b/scripts/lc-builds/blueos_xl_omptarget.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 6d34ae5b6..ee0bbd23d 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index c8ac1dbe5..c571e568d 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index 28efdbff0..015416e8e 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index 030d65163..072443ff8 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index fd951b04a..64b11c012 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 6adf38f54..588405a03 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 36c4353b7..0b36e8817 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index eac77b71e..1d0a98af7 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 65d680fee..dad854b59 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index e95f7cf9e..71642e1f1 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index 3cc8b8ce1..dc042a369 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index a70b02015..77d81605f 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index ff13bde99..0a89683c3 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh index 7b0daa339..def610fb2 100755 --- a/scripts/lc-builds/toss4_mvapich2_icpx.sh +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index cd86cdc80..1956d0436 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh index 5ca692a49..027d41ed7 100755 --- a/scripts/travis_build_and_test.sh +++ b/scripts/travis_build_and_test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/scripts/ubuntu-builds/ubuntu_clang.sh b/scripts/ubuntu-builds/ubuntu_clang.sh index 77e8100f1..7ddba9a7d 100755 --- a/scripts/ubuntu-builds/ubuntu_clang.sh +++ b/scripts/ubuntu-builds/ubuntu_clang.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/ubuntu-builds/ubuntu_gcc.sh b/scripts/ubuntu-builds/ubuntu_gcc.sh index 741b2fa22..e40c65482 100755 --- a/scripts/ubuntu-builds/ubuntu_gcc.sh +++ b/scripts/ubuntu-builds/ubuntu_gcc.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJAPerf/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) diff --git a/scripts/update_copyright.sh b/scripts/update_copyright.sh index d3bdeb170..527e42d43 100755 --- a/scripts/update_copyright.sh +++ b/scripts/update_copyright.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -17,7 +17,8 @@ # as well. 
 #
 # IMPORTANT: Since this file is not modified (it is running the shell
-# script commands), you must EDIT THE COPYRIGHT DATES ABOVE MANUALLY.
+# script commands), you must EDIT THE COPYRIGHT DATES IN THE HEADER ABOVE
+# MANUALLY.
 #
 # Edit the 'find' command below to change the set of files that will be
 # modified.
@@ -46,18 +47,18 @@ for i in `cat files2change`
 do
   echo $i
   cp $i $i.sed.bak
-  sed "s/Copyright (c) 2017-22/Copyright (c) 2017-23/" $i.sed.bak > $i
+  sed "s/Copyright (c) 2017-23/Copyright (c) 2017-24/" $i.sed.bak > $i
 done
 
 echo LICENSE
 cp LICENSE LICENSE.sed.bak
-sed "s/Copyright (c) 2017-2022/Copyright (c) 2017-2023/" LICENSE.sed.bak > LICENSE
+sed "s/Copyright (c) 2017-2023/Copyright (c) 2017-2024/" LICENSE.sed.bak > LICENSE
 
-for i in RELEASE README.md
+for i in RELEASE README.md docs/conf.py
 do
   echo $i
   cp $i $i.sed.bak
-  sed "s/2017-22/2017-23/" $i.sed.bak > $i
+  sed "s/2017-23/2017-24/" $i.sed.bak > $i
 done
 
 #=============================================================================
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c74ef07e3..8091a6df8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp
index 3ce688d29..7aa549262 100644
--- a/src/RAJAPerfSuiteDriver.cpp
+++ b/src/RAJAPerfSuiteDriver.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt
index 54334242e..731bfdc76 100644
--- a/src/algorithm/CMakeLists.txt
+++ b/src/algorithm/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp
index e8d09119e..9f0fda034 100644
--- a/src/algorithm/MEMCPY-Cuda.cpp
+++ b/src/algorithm/MEMCPY-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp
index 129c3fb18..0e880c1b4 100644
--- a/src/algorithm/MEMCPY-Hip.cpp
+++ b/src/algorithm/MEMCPY-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/algorithm/MEMCPY-OMP.cpp b/src/algorithm/MEMCPY-OMP.cpp index 55b63afd6..184f897bf 100644 --- a/src/algorithm/MEMCPY-OMP.cpp +++ b/src/algorithm/MEMCPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY-OMPTarget.cpp b/src/algorithm/MEMCPY-OMPTarget.cpp index 4f4932793..0b3536d42 100644 --- a/src/algorithm/MEMCPY-OMPTarget.cpp +++ b/src/algorithm/MEMCPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY-Seq.cpp b/src/algorithm/MEMCPY-Seq.cpp index 02a24668f..57c3f219f 100644 --- a/src/algorithm/MEMCPY-Seq.cpp +++ b/src/algorithm/MEMCPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 49446a265..84203bf03 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 9fa46ae9e..f788a0e40 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp index a6dd198c0..d0c60e97d 100644 --- a/src/algorithm/MEMSET-Cuda.cpp +++ b/src/algorithm/MEMSET-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp index 5a78edfeb..c838aed28 100644 --- a/src/algorithm/MEMSET-Hip.cpp +++ b/src/algorithm/MEMSET-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-OMP.cpp b/src/algorithm/MEMSET-OMP.cpp index ebd931e4d..66a6e027c 100644 --- a/src/algorithm/MEMSET-OMP.cpp +++ b/src/algorithm/MEMSET-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-OMPTarget.cpp b/src/algorithm/MEMSET-OMPTarget.cpp index ec6d9c716..cee5a8577 100644 --- a/src/algorithm/MEMSET-OMPTarget.cpp +++ b/src/algorithm/MEMSET-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET-Seq.cpp b/src/algorithm/MEMSET-Seq.cpp index 145fd462e..3064e7cb1 100644 --- a/src/algorithm/MEMSET-Seq.cpp +++ b/src/algorithm/MEMSET-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 95d3d5321..ece31a1ad 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index ebf2f867b..8be682823 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index dfbc7a478..6f79928e2 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 44084c6c3..9999ea674 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp index 49d0d766e..ae5cc130c 100644 --- a/src/algorithm/REDUCE_SUM-OMP.cpp +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp index a8652099e..394f71b07 100644 --- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp index 8c7086057..9223c3ac5 100644 --- a/src/algorithm/REDUCE_SUM-Seq.cpp +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 3712f5ffa..233ef36d8 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index e94400dbb..bb244208d 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/SORT-Cuda.cpp b/src/algorithm/SORT-Cuda.cpp index 45cd40d63..4d77667d7 100644 --- a/src/algorithm/SORT-Cuda.cpp +++ b/src/algorithm/SORT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-Hip.cpp b/src/algorithm/SORT-Hip.cpp index d87445413..c464bae4e 100644 --- a/src/algorithm/SORT-Hip.cpp +++ b/src/algorithm/SORT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-OMP.cpp b/src/algorithm/SORT-OMP.cpp index 05b885d50..133b00a88 100644 --- a/src/algorithm/SORT-OMP.cpp +++ b/src/algorithm/SORT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT-Seq.cpp b/src/algorithm/SORT-Seq.cpp index c5e1503af..2d458ff4d 100644 --- a/src/algorithm/SORT-Seq.cpp +++ b/src/algorithm/SORT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index b7738f264..55441375b 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index b51bf12f9..9df61e411 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Cuda.cpp b/src/algorithm/SORTPAIRS-Cuda.cpp index 57176e3db..1f102eb91 100644 --- a/src/algorithm/SORTPAIRS-Cuda.cpp +++ b/src/algorithm/SORTPAIRS-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/algorithm/SORTPAIRS-Hip.cpp b/src/algorithm/SORTPAIRS-Hip.cpp index aece079d4..467a3cbf4 100644 --- a/src/algorithm/SORTPAIRS-Hip.cpp +++ b/src/algorithm/SORTPAIRS-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-OMP.cpp b/src/algorithm/SORTPAIRS-OMP.cpp index 39705af9a..cdf0f044a 100644 --- a/src/algorithm/SORTPAIRS-OMP.cpp +++ b/src/algorithm/SORTPAIRS-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS-Seq.cpp b/src/algorithm/SORTPAIRS-Seq.cpp index 91c094ce9..320e307f4 100644 --- a/src/algorithm/SORTPAIRS-Seq.cpp +++ b/src/algorithm/SORTPAIRS-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index a07f1e79b..0e903e116 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index 4cfc3eb36..fa53a15c3 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index bade73b59..f93a13154 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index a4b566c6b..36f033d65 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 9e35bef84..8534a1f57 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/apps/CONVECTION3DPA-Cuda.cpp b/src/apps/CONVECTION3DPA-Cuda.cpp index a553624e8..6160430c0 100644 --- a/src/apps/CONVECTION3DPA-Cuda.cpp +++ b/src/apps/CONVECTION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CONVECTION3DPA-Hip.cpp b/src/apps/CONVECTION3DPA-Hip.cpp index a3c3c0472..12300f940 100644 --- a/src/apps/CONVECTION3DPA-Hip.cpp +++ b/src/apps/CONVECTION3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CONVECTION3DPA-OMP.cpp b/src/apps/CONVECTION3DPA-OMP.cpp index b414122cb..2826defd0 100644 --- a/src/apps/CONVECTION3DPA-OMP.cpp +++ b/src/apps/CONVECTION3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CONVECTION3DPA-OMPTarget.cpp b/src/apps/CONVECTION3DPA-OMPTarget.cpp index e0317c930..6affba0c6 100644 --- a/src/apps/CONVECTION3DPA-OMPTarget.cpp +++ b/src/apps/CONVECTION3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CONVECTION3DPA-Seq.cpp b/src/apps/CONVECTION3DPA-Seq.cpp index a62a93409..9f18a2da8 100644 --- a/src/apps/CONVECTION3DPA-Seq.cpp +++ b/src/apps/CONVECTION3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index 43ed5d539..a7973a237 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 784b2d4cd..d59b7e319 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp index 43c9aa0a9..3c7edcd40 100644 --- a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 88e59dac3..79cef6b09 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-OMP.cpp b/src/apps/DEL_DOT_VEC_2D-OMP.cpp index 1fc9b5775..730b49887 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMP.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp index 8dfa12e6c..b3527802a 100644 --- a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/DEL_DOT_VEC_2D-Seq.cpp b/src/apps/DEL_DOT_VEC_2D-Seq.cpp index ffb533e3a..76b04a96f 100644 --- a/src/apps/DEL_DOT_VEC_2D-Seq.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index ffe5edeb2..b5c7bc52c 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index d82efc12f..14db3565a 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 6b205b70e..90a55905b 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 4a03eec93..15e27ed78 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index a1dcdbe04..04f27ec63 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/DIFFUSION3DPA-OMPTarget.cpp b/src/apps/DIFFUSION3DPA-OMPTarget.cpp index 03a1811a3..be5bf5ecf 100644 --- a/src/apps/DIFFUSION3DPA-OMPTarget.cpp +++ b/src/apps/DIFFUSION3DPA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 9e2818de1..c384b0695 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index e0cd0f6d0..5a645c53e 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 62967d5c0..5dff5e5ef 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D-Cuda.cpp b/src/apps/EDGE3D-Cuda.cpp index 2a6a64a78..5f212fb9b 100644 --- a/src/apps/EDGE3D-Cuda.cpp +++ b/src/apps/EDGE3D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D-Hip.cpp b/src/apps/EDGE3D-Hip.cpp index e8a4f2be3..56ff054d8 100644 --- a/src/apps/EDGE3D-Hip.cpp +++ b/src/apps/EDGE3D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D-OMP.cpp b/src/apps/EDGE3D-OMP.cpp index bb79de639..1671872b4 100644 --- a/src/apps/EDGE3D-OMP.cpp +++ b/src/apps/EDGE3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index 64c5f7dd4..57b2bc738 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D-Seq.cpp b/src/apps/EDGE3D-Seq.cpp index cebd426b7..5f7114127 100644 --- a/src/apps/EDGE3D-Seq.cpp +++ b/src/apps/EDGE3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 3b93d281b..a9335f727 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 82e07c6a5..3707f90ed 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp index e657a94a3..c33321ea7 100644 --- a/src/apps/ENERGY-Cuda.cpp +++ b/src/apps/ENERGY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Hip.cpp b/src/apps/ENERGY-Hip.cpp index 56a8126ee..e0424d55a 100644 --- a/src/apps/ENERGY-Hip.cpp +++ b/src/apps/ENERGY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-OMP.cpp b/src/apps/ENERGY-OMP.cpp index 235386ff0..687f69d25 100644 --- a/src/apps/ENERGY-OMP.cpp +++ b/src/apps/ENERGY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp index 83ce48357..786623a8f 100644 --- a/src/apps/ENERGY-OMPTarget.cpp +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY-Seq.cpp b/src/apps/ENERGY-Seq.cpp index c7e3ffdf2..bbf9d73c0 100644 --- a/src/apps/ENERGY-Seq.cpp +++ b/src/apps/ENERGY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index fd1988300..f24072f8b 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 22af34867..4a47e7912 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index f88e7b55d..258f75a0e 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index bdfd21aa5..32ebd761b 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 7d9db27a7..2627b2c5e 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 5b3cc2a35..5475e0061 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index 5715f884a..a7f476aa0 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp index b13d30818..59594798e 100644 --- a/src/apps/FIR-Seq.cpp +++ b/src/apps/FIR-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 7b51aaebc..2700bb487 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 3ca8a1cef..41933c4e8 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index b059cb1c6..c64d0d87d 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index de0bb44ac..87d1686c2 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/LTIMES-OMP.cpp b/src/apps/LTIMES-OMP.cpp index 93ce138ef..80c4a4a0e 100644 --- a/src/apps/LTIMES-OMP.cpp +++ b/src/apps/LTIMES-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp index 7ae4ee1e2..da7047d20 100644 --- a/src/apps/LTIMES-OMPTarget.cpp +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES-Seq.cpp b/src/apps/LTIMES-Seq.cpp index 33fd4b666..66503ed26 100644 --- a/src/apps/LTIMES-Seq.cpp +++ b/src/apps/LTIMES-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 0abb82d35..4db36b287 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 2f3f0ca6d..c45be3ac9 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index f201a4980..39dbe6c66 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 3c3d39469..722071f1d 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/apps/LTIMES_NOVIEW-OMP.cpp b/src/apps/LTIMES_NOVIEW-OMP.cpp index e9df87b83..900606076 100644 --- a/src/apps/LTIMES_NOVIEW-OMP.cpp +++ b/src/apps/LTIMES_NOVIEW-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp index 9a1f0bf06..1ffddaeaa 100644 --- a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW-Seq.cpp b/src/apps/LTIMES_NOVIEW-Seq.cpp index cd202004d..d4c6e4f41 100644 --- a/src/apps/LTIMES_NOVIEW-Seq.cpp +++ b/src/apps/LTIMES_NOVIEW-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index a106d5418..22c7bf43e 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 96a296366..61db05db4 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Cuda.cpp b/src/apps/MASS3DEA-Cuda.cpp index acdd029b9..649fd5b01 100644 --- a/src/apps/MASS3DEA-Cuda.cpp +++ b/src/apps/MASS3DEA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Hip.cpp b/src/apps/MASS3DEA-Hip.cpp index 02809f270..2eeabadeb 100644 --- a/src/apps/MASS3DEA-Hip.cpp +++ b/src/apps/MASS3DEA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-OMP.cpp b/src/apps/MASS3DEA-OMP.cpp index 7dd1ab122..2b77eeb6e 100644 --- a/src/apps/MASS3DEA-OMP.cpp +++ b/src/apps/MASS3DEA-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-OMPTarget.cpp b/src/apps/MASS3DEA-OMPTarget.cpp index b3d8aa75f..6f41914ab 100644 --- a/src/apps/MASS3DEA-OMPTarget.cpp +++ b/src/apps/MASS3DEA-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA-Seq.cpp b/src/apps/MASS3DEA-Seq.cpp index bc906f0f6..f3b0cfc99 100644 --- a/src/apps/MASS3DEA-Seq.cpp +++ b/src/apps/MASS3DEA-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index d6239222f..7dfbbb47c 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index df029299e..3726cd470 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 1b51c8e59..60092ef7f 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 7b07522b9..1fbd0dea9 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/apps/MASS3DPA-OMP.cpp b/src/apps/MASS3DPA-OMP.cpp
index f2e122fed..4c6b2867c 100644
--- a/src/apps/MASS3DPA-OMP.cpp
+++ b/src/apps/MASS3DPA-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/MASS3DPA-OMPTarget.cpp b/src/apps/MASS3DPA-OMPTarget.cpp
index e4cc02a4f..d74c14641 100644
--- a/src/apps/MASS3DPA-OMPTarget.cpp
+++ b/src/apps/MASS3DPA-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/MASS3DPA-Seq.cpp b/src/apps/MASS3DPA-Seq.cpp
index 39087834d..e22068169 100644
--- a/src/apps/MASS3DPA-Seq.cpp
+++ b/src/apps/MASS3DPA-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp
index 1c99e9f73..b95027f2f 100644
--- a/src/apps/MASS3DPA.cpp
+++ b/src/apps/MASS3DPA.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp
index 7365fa011..8a70e326d 100644
--- a/src/apps/MASS3DPA.hpp
+++ b/src/apps/MASS3DPA.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp
index 819aaf680..494d70bbb 100644
--- a/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp
index 33eee5dc1..da8f5dd12 100644
--- a/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp
index d62b5527a..a17576c98 100644
--- a/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp
index a12a91efd..7d5c59614 100644
--- a/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp
index 9fbc2effa..cf176d4c1 100644
--- a/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp
index ed1bd2078..fc32eba04 100644
--- a/src/apps/NODAL_ACCUMULATION_3D.cpp
+++ b/src/apps/NODAL_ACCUMULATION_3D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp
index a8d194387..51edd3310 100644
--- a/src/apps/NODAL_ACCUMULATION_3D.hpp
+++ b/src/apps/NODAL_ACCUMULATION_3D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp
index bfb5e78df..c98e3a48a 100644
--- a/src/apps/PRESSURE-Cuda.cpp
+++ b/src/apps/PRESSURE-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE-Hip.cpp b/src/apps/PRESSURE-Hip.cpp
index ad7ff6bd9..18d4a1c88 100644
--- a/src/apps/PRESSURE-Hip.cpp
+++ b/src/apps/PRESSURE-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE-OMP.cpp b/src/apps/PRESSURE-OMP.cpp
index 0f0dd2e4e..ceab0286c 100644
--- a/src/apps/PRESSURE-OMP.cpp
+++ b/src/apps/PRESSURE-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp
index 830859f07..4d0b61f58 100644
--- a/src/apps/PRESSURE-OMPTarget.cpp
+++ b/src/apps/PRESSURE-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE-Seq.cpp b/src/apps/PRESSURE-Seq.cpp
index 4506eb7a2..18fc0929e 100644
--- a/src/apps/PRESSURE-Seq.cpp
+++ b/src/apps/PRESSURE-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp
index c772a0989..2e344b843 100644
--- a/src/apps/PRESSURE.cpp
+++ b/src/apps/PRESSURE.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp
index c0568a8e0..82ab50aa0 100644
--- a/src/apps/PRESSURE.hpp
+++ b/src/apps/PRESSURE.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp
index cc9cf6da6..378aa36bc 100644
--- a/src/apps/VOL3D-Cuda.cpp
+++ b/src/apps/VOL3D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D-Hip.cpp b/src/apps/VOL3D-Hip.cpp
index 027ea2c22..704008006 100644
--- a/src/apps/VOL3D-Hip.cpp
+++ b/src/apps/VOL3D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D-OMP.cpp b/src/apps/VOL3D-OMP.cpp
index 44e3cdcf4..7c6562c47 100644
--- a/src/apps/VOL3D-OMP.cpp
+++ b/src/apps/VOL3D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp
index 538fb47c7..d97a5889a 100644
--- a/src/apps/VOL3D-OMPTarget.cpp
+++ b/src/apps/VOL3D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D-Seq.cpp b/src/apps/VOL3D-Seq.cpp
index 631c545bc..d174ad43c 100644
--- a/src/apps/VOL3D-Seq.cpp
+++ b/src/apps/VOL3D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp
index 11051adc5..dd388e178 100644
--- a/src/apps/VOL3D.cpp
+++ b/src/apps/VOL3D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp
index f3d296440..dce286a89 100644
--- a/src/apps/VOL3D.hpp
+++ b/src/apps/VOL3D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp
index e81347af5..f33c8656d 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp
index d182fdfa4..c92d7b6db 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp
index eea4614cf..9b0890ef4 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp
index 573765e30..0a4f96119 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp
index 1bd904088..ea9be8c17 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp
index 267190132..be5c93000 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D.cpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp
index 1c823ea2a..6adedd04e 100644
--- a/src/apps/ZONAL_ACCUMULATION_3D.hpp
+++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/apps/mixed_fem_helper.hpp b/src/apps/mixed_fem_helper.hpp
index 88f7d3b64..6ee3b1a06 100644
--- a/src/apps/mixed_fem_helper.hpp
+++ b/src/apps/mixed_fem_helper.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt
index 4b47c7b48..25969c207 100644
--- a/src/basic-kokkos/CMakeLists.txt
+++ b/src/basic-kokkos/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp
index b8ab91cd1..eb2cc1e83 100644
--- a/src/basic-kokkos/DAXPY-Kokkos.cpp
+++ b/src/basic-kokkos/DAXPY-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp
index 9e74c4e0c..aadcf9401 100644
--- a/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp
+++ b/src/basic-kokkos/DAXPY_ATOMIC-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp
index 19e916dac..a67b041c7 100644
--- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp
+++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp
index 661180c7b..b07c6f881 100644
--- a/src/basic-kokkos/INIT3-Kokkos.cpp
+++ b/src/basic-kokkos/INIT3-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp
index 8c775a3b0..efd1ed118 100644
--- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp
+++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp
index 9df018264..996ec6225 100644
--- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp
+++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp
index 49e890315..fb2f59ac2 100644
--- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp
+++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp
index 36929cead..fb5b5ba98 100644
--- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp
+++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp
index 233ca71af..6247970b1 100644
--- a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp
+++ b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp
index 23c0ab6f4..e461c00a1 100644
--- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp
+++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp
index 5cdb9060f..43b629cad 100644
--- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp
+++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS-Cuda.cpp b/src/basic/ARRAY_OF_PTRS-Cuda.cpp
index ca3b8360f..e9cdd2349 100644
--- a/src/basic/ARRAY_OF_PTRS-Cuda.cpp
+++ b/src/basic/ARRAY_OF_PTRS-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS-Hip.cpp b/src/basic/ARRAY_OF_PTRS-Hip.cpp
index cb1336058..aa5777fb4 100644
--- a/src/basic/ARRAY_OF_PTRS-Hip.cpp
+++ b/src/basic/ARRAY_OF_PTRS-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS-OMP.cpp b/src/basic/ARRAY_OF_PTRS-OMP.cpp
index 3e05e929a..774fd8f98 100644
--- a/src/basic/ARRAY_OF_PTRS-OMP.cpp
+++ b/src/basic/ARRAY_OF_PTRS-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp
index 7a7642b4e..02301ca1e 100644
--- a/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp
+++ b/src/basic/ARRAY_OF_PTRS-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS-Seq.cpp b/src/basic/ARRAY_OF_PTRS-Seq.cpp
index ba728d775..d03fb7ac4 100644
--- a/src/basic/ARRAY_OF_PTRS-Seq.cpp
+++ b/src/basic/ARRAY_OF_PTRS-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp
index 2a88e5005..095e52781 100644
--- a/src/basic/ARRAY_OF_PTRS.cpp
+++ b/src/basic/ARRAY_OF_PTRS.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp
index ee8a44862..fca763190 100644
--- a/src/basic/ARRAY_OF_PTRS.hpp
+++ b/src/basic/ARRAY_OF_PTRS.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt
index d21d46e5a..bab953cc3 100644
--- a/src/basic/CMakeLists.txt
+++ b/src/basic/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/basic/COPY8-Cuda.cpp b/src/basic/COPY8-Cuda.cpp
index 202bcbc54..f8f1aeb31 100644
--- a/src/basic/COPY8-Cuda.cpp
+++ b/src/basic/COPY8-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8-Hip.cpp b/src/basic/COPY8-Hip.cpp
index 9c3054611..714a00a0b 100644
--- a/src/basic/COPY8-Hip.cpp
+++ b/src/basic/COPY8-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8-OMP.cpp b/src/basic/COPY8-OMP.cpp
index 8ba6699c6..a8dec3228 100644
--- a/src/basic/COPY8-OMP.cpp
+++ b/src/basic/COPY8-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8-OMPTarget.cpp b/src/basic/COPY8-OMPTarget.cpp
index 88d8e3cac..63a207ba8 100644
--- a/src/basic/COPY8-OMPTarget.cpp
+++ b/src/basic/COPY8-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8-Seq.cpp b/src/basic/COPY8-Seq.cpp
index 1ae6af854..32bf188d6 100644
--- a/src/basic/COPY8-Seq.cpp
+++ b/src/basic/COPY8-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp
index 7a75daa40..4d22c4336 100644
--- a/src/basic/COPY8.cpp
+++ b/src/basic/COPY8.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp
index f98784d16..7d047eba4 100644
--- a/src/basic/COPY8.hpp
+++ b/src/basic/COPY8.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY-Cuda.cpp b/src/basic/DAXPY-Cuda.cpp
index 2461f8eaf..d58468ba3 100644
--- a/src/basic/DAXPY-Cuda.cpp
+++ b/src/basic/DAXPY-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp
index c437eea15..f08dba1fc 100644
--- a/src/basic/DAXPY-Hip.cpp
+++ b/src/basic/DAXPY-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY-OMP.cpp b/src/basic/DAXPY-OMP.cpp
index afc0e653c..8f1b95641 100644
--- a/src/basic/DAXPY-OMP.cpp
+++ b/src/basic/DAXPY-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY-OMPTarget.cpp b/src/basic/DAXPY-OMPTarget.cpp
index 387a4c40d..fc36ad257 100644
--- a/src/basic/DAXPY-OMPTarget.cpp
+++ b/src/basic/DAXPY-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp
index 7b024ca49..e23cc5e6f 100644
--- a/src/basic/DAXPY-Seq.cpp
+++ b/src/basic/DAXPY-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp
index 8aa05e66a..524c4f809 100644
--- a/src/basic/DAXPY.cpp
+++ b/src/basic/DAXPY.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp
index bcaca8054..c61be7e9a 100644
--- a/src/basic/DAXPY.hpp
+++ b/src/basic/DAXPY.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC-Cuda.cpp b/src/basic/DAXPY_ATOMIC-Cuda.cpp
index df4776ce7..c4cee2dd2 100644
--- a/src/basic/DAXPY_ATOMIC-Cuda.cpp
+++ b/src/basic/DAXPY_ATOMIC-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp
index 93f6b4d0d..258c979b6 100644
--- a/src/basic/DAXPY_ATOMIC-Hip.cpp
+++ b/src/basic/DAXPY_ATOMIC-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC-OMP.cpp b/src/basic/DAXPY_ATOMIC-OMP.cpp
index 4d2f4db87..a41c6c049 100644
--- a/src/basic/DAXPY_ATOMIC-OMP.cpp
+++ b/src/basic/DAXPY_ATOMIC-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp
index bc6b08932..ae7319e25 100644
--- a/src/basic/DAXPY_ATOMIC-OMPTarget.cpp
+++ b/src/basic/DAXPY_ATOMIC-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC-Seq.cpp b/src/basic/DAXPY_ATOMIC-Seq.cpp
index 1c33c45f8..9fd78fecf 100644
--- a/src/basic/DAXPY_ATOMIC-Seq.cpp
+++ b/src/basic/DAXPY_ATOMIC-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp
index a9f709276..a311589f5 100644
--- a/src/basic/DAXPY_ATOMIC.cpp
+++ b/src/basic/DAXPY_ATOMIC.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp
index 9c2890e48..ffaa4cc4e 100644
--- a/src/basic/DAXPY_ATOMIC.hpp
+++ b/src/basic/DAXPY_ATOMIC.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp
index d110a036a..da959e485 100644
--- a/src/basic/IF_QUAD-Cuda.cpp
+++ b/src/basic/IF_QUAD-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp
index 4cd5b482b..259306d0f 100644
--- a/src/basic/IF_QUAD-Hip.cpp
+++ b/src/basic/IF_QUAD-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD-OMP.cpp b/src/basic/IF_QUAD-OMP.cpp
index e952f05fb..297decc78 100644
--- a/src/basic/IF_QUAD-OMP.cpp
+++ b/src/basic/IF_QUAD-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp
index d6232ec13..bedec322c 100644
--- a/src/basic/IF_QUAD-OMPTarget.cpp
+++ b/src/basic/IF_QUAD-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD-Seq.cpp b/src/basic/IF_QUAD-Seq.cpp
index aa2448a1b..14735ecd8 100644
--- a/src/basic/IF_QUAD-Seq.cpp
+++ b/src/basic/IF_QUAD-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp
index c31dc79d4..85094d643 100644
--- a/src/basic/IF_QUAD.cpp
+++ b/src/basic/IF_QUAD.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp
index f1f3e12a8..ce47ec332 100644
--- a/src/basic/IF_QUAD.hpp
+++ b/src/basic/IF_QUAD.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp
index 898d3b28b..8b3cb9bb7 100644
--- a/src/basic/INIT3-Cuda.cpp
+++ b/src/basic/INIT3-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp
index c3b67130f..be0f2e74f 100644
--- a/src/basic/INIT3-Hip.cpp
+++ b/src/basic/INIT3-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3-OMP.cpp b/src/basic/INIT3-OMP.cpp
index 25d31585c..346a92399 100644
--- a/src/basic/INIT3-OMP.cpp
+++ b/src/basic/INIT3-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp
index 825730bdc..0caee8c80 100644
--- a/src/basic/INIT3-OMPTarget.cpp
+++ b/src/basic/INIT3-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp
index 398e986b1..20feb79a4 100644
--- a/src/basic/INIT3-Seq.cpp
+++ b/src/basic/INIT3-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp
index bbf90da80..d77786037 100644
--- a/src/basic/INIT3.cpp
+++ b/src/basic/INIT3.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp
index aed67bfeb..89451433a 100644
--- a/src/basic/INIT3.hpp
+++ b/src/basic/INIT3.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp
index 747bceb6d..e535ac041 100644
--- a/src/basic/INIT_VIEW1D-Cuda.cpp
+++ b/src/basic/INIT_VIEW1D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp
index b7db081ec..130a62a42 100644
--- a/src/basic/INIT_VIEW1D-Hip.cpp
+++ b/src/basic/INIT_VIEW1D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D-OMP.cpp b/src/basic/INIT_VIEW1D-OMP.cpp
index 742270ff6..52160ab13 100644
--- a/src/basic/INIT_VIEW1D-OMP.cpp
+++ b/src/basic/INIT_VIEW1D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp
index d9ad636e1..825fcd569 100644
--- a/src/basic/INIT_VIEW1D-OMPTarget.cpp
+++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp
index 59c494c49..b1c761a5c 100644
--- a/src/basic/INIT_VIEW1D-Seq.cpp
+++ b/src/basic/INIT_VIEW1D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp
index 018811f34..88004ec31 100644
--- a/src/basic/INIT_VIEW1D.cpp
+++ b/src/basic/INIT_VIEW1D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp
index f3770f69a..7512a6d81 100644
--- a/src/basic/INIT_VIEW1D.hpp
+++ b/src/basic/INIT_VIEW1D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp
index 822de3c70..3a13f5210 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp
index 83b1ae339..2940bb59d 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
index 8fb7c0129..bb6834c17 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
index d045462d7..f87fa2625 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
index c25511aa1..b7588350a 100644
--- a/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp
index 4daa109a6..ce187611b 100644
--- a/src/basic/INIT_VIEW1D_OFFSET.cpp
+++ b/src/basic/INIT_VIEW1D_OFFSET.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp
index d32f59c7b..75e13923a 100644
--- a/src/basic/INIT_VIEW1D_OFFSET.hpp
+++ b/src/basic/INIT_VIEW1D_OFFSET.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp
index 260189990..e39cd5a77 100644
--- a/src/basic/MULADDSUB-Cuda.cpp
+++ b/src/basic/MULADDSUB-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp
index 9e4d0d741..bb846eef5 100644
--- a/src/basic/MULADDSUB-Hip.cpp
+++ b/src/basic/MULADDSUB-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB-OMP.cpp b/src/basic/MULADDSUB-OMP.cpp
index 6c9bb2038..28f5edab1 100644
--- a/src/basic/MULADDSUB-OMP.cpp
+++ b/src/basic/MULADDSUB-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp
index af691d008..f4d0e716a 100644
--- a/src/basic/MULADDSUB-OMPTarget.cpp
+++ b/src/basic/MULADDSUB-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp
index 59ddf1ea1..40ede6d64 100644
--- a/src/basic/MULADDSUB-Seq.cpp
+++ b/src/basic/MULADDSUB-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp
index a5deb6049..1d6634ce9 100644
--- a/src/basic/MULADDSUB.cpp
+++ b/src/basic/MULADDSUB.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp
index e604a34c8..778e23838 100644
--- a/src/basic/MULADDSUB.hpp
+++ b/src/basic/MULADDSUB.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp
index 3cefef7de..74e4136d2 100644
--- a/src/basic/NESTED_INIT-Cuda.cpp
+++ b/src/basic/NESTED_INIT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT-Hip.cpp b/src/basic/NESTED_INIT-Hip.cpp
index 20ce5c382..f7ea66dd4 100644
--- a/src/basic/NESTED_INIT-Hip.cpp
+++ b/src/basic/NESTED_INIT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT-OMP.cpp b/src/basic/NESTED_INIT-OMP.cpp
index 3b1e07767..3fa73fa5b 100644
--- a/src/basic/NESTED_INIT-OMP.cpp
+++ b/src/basic/NESTED_INIT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp
index 607c8befe..6e6538dfd 100644
--- a/src/basic/NESTED_INIT-OMPTarget.cpp
+++ b/src/basic/NESTED_INIT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT-Seq.cpp b/src/basic/NESTED_INIT-Seq.cpp
index d3ce50d65..bc277ce27 100644
--- a/src/basic/NESTED_INIT-Seq.cpp
+++ b/src/basic/NESTED_INIT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp
index fc64f5a0d..5bc652e8c 100644
--- a/src/basic/NESTED_INIT.cpp
+++ b/src/basic/NESTED_INIT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp
index ccaf7079e..f26b9cba4 100644
--- a/src/basic/NESTED_INIT.hpp
+++ b/src/basic/NESTED_INIT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp
index 0c57353ed..cb1c2a0bc 100644
--- a/src/basic/PI_ATOMIC-Cuda.cpp
+++ b/src/basic/PI_ATOMIC-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp
index 2033e01b8..f65f36fdd 100644
--- a/src/basic/PI_ATOMIC-Hip.cpp
+++ b/src/basic/PI_ATOMIC-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC-OMP.cpp b/src/basic/PI_ATOMIC-OMP.cpp
index 2c0228089..d1f0eb784 100644
--- a/src/basic/PI_ATOMIC-OMP.cpp
+++ b/src/basic/PI_ATOMIC-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC-OMPTarget.cpp b/src/basic/PI_ATOMIC-OMPTarget.cpp
index c685e026d..5f3fe4c82 100644
--- a/src/basic/PI_ATOMIC-OMPTarget.cpp
+++ b/src/basic/PI_ATOMIC-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC-Seq.cpp b/src/basic/PI_ATOMIC-Seq.cpp
index b3db76b21..698361107 100644
--- a/src/basic/PI_ATOMIC-Seq.cpp
+++ b/src/basic/PI_ATOMIC-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp
index b1293b531..40321c919 100644
--- a/src/basic/PI_ATOMIC.cpp
+++ b/src/basic/PI_ATOMIC.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp
index 38a0b62a6..71327ce6b 100644
--- a/src/basic/PI_ATOMIC.hpp
+++ b/src/basic/PI_ATOMIC.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index f79259e32..c9c165321 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp
index 6bd9385b8..7df62b8c8 100644
--- a/src/basic/PI_REDUCE-Hip.cpp
+++ b/src/basic/PI_REDUCE-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp
index 44da3e5b5..cbf32359e 100644
--- a/src/basic/PI_REDUCE-OMP.cpp
+++ b/src/basic/PI_REDUCE-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp
index f4c20a665..47b64fec6 100644
--- a/src/basic/PI_REDUCE-OMPTarget.cpp
+++ b/src/basic/PI_REDUCE-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp
index 4bd888dd0..9fc2ed0b5 100644
--- a/src/basic/PI_REDUCE-Seq.cpp
+++ b/src/basic/PI_REDUCE-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index 84c38ce67..998ecf5f1 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp
index e2275409a..48a2fd519 100644
--- a/src/basic/PI_REDUCE.hpp
+++ b/src/basic/PI_REDUCE.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp
index be72022f4..015693eee 100644
--- a/src/basic/REDUCE3_INT-Cuda.cpp
+++ b/src/basic/REDUCE3_INT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp
index 6cbee2fa0..460b68a2d 100644
--- a/src/basic/REDUCE3_INT-Hip.cpp
+++ b/src/basic/REDUCE3_INT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp
index 5428d6087..32658fcf1 100644
--- a/src/basic/REDUCE3_INT-OMP.cpp
+++ b/src/basic/REDUCE3_INT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp
index 0d261edec..a625ef842 100644
--- a/src/basic/REDUCE3_INT-OMPTarget.cpp
+++ b/src/basic/REDUCE3_INT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp
index f204bd345..a5f5965a2 100644
--- a/src/basic/REDUCE3_INT-Seq.cpp
+++ b/src/basic/REDUCE3_INT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp
index 975bf8f24..cee83519a 100644
--- a/src/basic/REDUCE3_INT.cpp
+++ b/src/basic/REDUCE3_INT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp
index 749d62009..e56a7bc20 100644
--- a/src/basic/REDUCE3_INT.hpp
+++ b/src/basic/REDUCE3_INT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 9be3d8643..2c20b2488 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index 88679b598..db31819f5 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 7ac22faa2..730134b95 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index cfbcba44a..0617ccaca 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 377b19b84..b83722a3e 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index f18319eb2..2fd5f9a3f 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 9bb83f661..00c1400a4 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index f0e0565db..233563269 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index ad291ddf4..51b6f0da3 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index dadaa5baa..9f89ba0b6 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index b9bdcd6a6..acf52a61d 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 9b1264b4d..967d3e93e 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index eaac3ffda..162dbfa0a 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 4f705d008..841d25a40 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/CMakeLists.txt b/src/comm/CMakeLists.txt index 467436594..9298e7bce 100644 --- a/src/comm/CMakeLists.txt +++ b/src/comm/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/comm/HALO_EXCHANGE-Cuda.cpp b/src/comm/HALO_EXCHANGE-Cuda.cpp index e709562d1..ad5482d18 100644 --- a/src/comm/HALO_EXCHANGE-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE-Hip.cpp b/src/comm/HALO_EXCHANGE-Hip.cpp index 63244d8af..1b7ffb04a 100644 --- a/src/comm/HALO_EXCHANGE-Hip.cpp +++ b/src/comm/HALO_EXCHANGE-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE-OMP.cpp b/src/comm/HALO_EXCHANGE-OMP.cpp index 1ce83af12..922151704 100644 --- a/src/comm/HALO_EXCHANGE-OMP.cpp +++ b/src/comm/HALO_EXCHANGE-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE-OMPTarget.cpp b/src/comm/HALO_EXCHANGE-OMPTarget.cpp index c3100e996..f83eb2826 100644 --- a/src/comm/HALO_EXCHANGE-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_EXCHANGE-Seq.cpp b/src/comm/HALO_EXCHANGE-Seq.cpp index e494e374a..b5cbbf6f6 100644 --- a/src/comm/HALO_EXCHANGE-Seq.cpp +++ b/src/comm/HALO_EXCHANGE-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp index 87cab85ab..abfed942b 100644 --- a/src/comm/HALO_EXCHANGE.cpp +++ b/src/comm/HALO_EXCHANGE.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp index 7d46cedd7..d0eea9f86 100644 --- a/src/comm/HALO_EXCHANGE.hpp +++ b/src/comm/HALO_EXCHANGE.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp index b3f71af8c..a9d161183 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp index d1f34b1a3..2ac30479b 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp index 05c314e20..1af5d4bb9 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp index 04efe9b4d..18c32437d 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp index f7c5169f7..bca51de0d 100644 --- a/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp index 855868ba8..11651755f 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_EXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp index 2373935c3..efc3a9501 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.hpp +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING-Cuda.cpp b/src/comm/HALO_PACKING-Cuda.cpp index 49b8a30ac..6e09d0805 100644 --- a/src/comm/HALO_PACKING-Cuda.cpp +++ b/src/comm/HALO_PACKING-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING-Hip.cpp b/src/comm/HALO_PACKING-Hip.cpp index a465e2c48..583804396 100644 --- a/src/comm/HALO_PACKING-Hip.cpp +++ b/src/comm/HALO_PACKING-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_PACKING-OMP.cpp b/src/comm/HALO_PACKING-OMP.cpp index d88433ba0..bb760f479 100644 --- a/src/comm/HALO_PACKING-OMP.cpp +++ b/src/comm/HALO_PACKING-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING-OMPTarget.cpp b/src/comm/HALO_PACKING-OMPTarget.cpp index a96c240ae..d25f4f747 100644 --- a/src/comm/HALO_PACKING-OMPTarget.cpp +++ b/src/comm/HALO_PACKING-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING-Seq.cpp b/src/comm/HALO_PACKING-Seq.cpp index da77fb0a3..066116433 100644 --- a/src/comm/HALO_PACKING-Seq.cpp +++ b/src/comm/HALO_PACKING-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index 7575bae43..4c0326b1f 100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp index 2ae609e72..0cf329b92 100644 --- a/src/comm/HALO_PACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED-Cuda.cpp b/src/comm/HALO_PACKING_FUSED-Cuda.cpp index 3542e1fda..7541a30ef 100644 --- a/src/comm/HALO_PACKING_FUSED-Cuda.cpp +++ b/src/comm/HALO_PACKING_FUSED-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_PACKING_FUSED-Hip.cpp b/src/comm/HALO_PACKING_FUSED-Hip.cpp index c30703af2..7b4d9b064 100644 --- a/src/comm/HALO_PACKING_FUSED-Hip.cpp +++ b/src/comm/HALO_PACKING_FUSED-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED-OMP.cpp b/src/comm/HALO_PACKING_FUSED-OMP.cpp index a8700bc03..143a65501 100644 --- a/src/comm/HALO_PACKING_FUSED-OMP.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp index 950c94bfd..ab0b075b4 100644 --- a/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp +++ b/src/comm/HALO_PACKING_FUSED-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp index ba3482528..2b25adcd0 100644 --- a/src/comm/HALO_PACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index 54ed08ec3..2e683b90b 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp index 559632142..d89444104 100644 --- a/src/comm/HALO_PACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_SENDRECV-Cuda.cpp b/src/comm/HALO_SENDRECV-Cuda.cpp index e0e360675..6d8d1bf56 100644 --- a/src/comm/HALO_SENDRECV-Cuda.cpp +++ b/src/comm/HALO_SENDRECV-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_SENDRECV-Hip.cpp b/src/comm/HALO_SENDRECV-Hip.cpp index 8cf25b4e3..7db6baf83 100644 --- a/src/comm/HALO_SENDRECV-Hip.cpp +++ b/src/comm/HALO_SENDRECV-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_SENDRECV-OMP.cpp b/src/comm/HALO_SENDRECV-OMP.cpp index 663a1dd0b..347756d81 100644 --- a/src/comm/HALO_SENDRECV-OMP.cpp +++ b/src/comm/HALO_SENDRECV-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_SENDRECV-OMPTarget.cpp b/src/comm/HALO_SENDRECV-OMPTarget.cpp index d04814125..42f62289f 100644 --- a/src/comm/HALO_SENDRECV-OMPTarget.cpp +++ b/src/comm/HALO_SENDRECV-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_SENDRECV-Seq.cpp b/src/comm/HALO_SENDRECV-Seq.cpp index 050b6da70..ab64c9415 100644 --- a/src/comm/HALO_SENDRECV-Seq.cpp +++ b/src/comm/HALO_SENDRECV-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp index 1aaddb2af..3e0990248 100644 --- a/src/comm/HALO_SENDRECV.cpp +++ b/src/comm/HALO_SENDRECV.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/comm/HALO_SENDRECV.hpp b/src/comm/HALO_SENDRECV.hpp index fbefaadec..da2a1d1cc 100644 --- a/src/comm/HALO_SENDRECV.hpp +++ b/src/comm/HALO_SENDRECV.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_base.cpp b/src/comm/HALO_base.cpp index 904fbfc69..f72e95179 100644 --- a/src/comm/HALO_base.cpp +++ b/src/comm/HALO_base.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index 5943dd7a9..1c179966a 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9dff522bd..f14076398 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp index 69f8aa825..dea54acf2 100644 --- a/src/common/CudaDataUtils.hpp +++ b/src/common/CudaDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index df911783a..c9c4d73bd 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 4d512bf6d..225d8233f 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index c1aa14710..dee369d98 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index c16286700..348fb44b7 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index e117993d6..4ccb20495 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index a668bb807..14c1b7381 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index d2851c713..48daaff2d 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 08309ebdd..32f32f64b 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/KokkosViewUtils.hpp b/src/common/KokkosViewUtils.hpp index 856fcb6f1..65a700030 100644 --- a/src/common/KokkosViewUtils.hpp +++ b/src/common/KokkosViewUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp index b24cbd7c4..b6875c7f7 100644 --- a/src/common/OpenMPTargetDataUtils.hpp +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/OutputUtils.cpp b/src/common/OutputUtils.cpp index 87648a545..fbd7f3653 100644 --- a/src/common/OutputUtils.cpp +++ b/src/common/OutputUtils.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/OutputUtils.hpp b/src/common/OutputUtils.hpp index 5641401e9..197721133 100644 --- a/src/common/OutputUtils.hpp +++ b/src/common/OutputUtils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 71f577dd9..dc9d0e20b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index bf667fa66..a112a44d1 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index 3b095afec..9ec2566eb 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 7b3dcef2e..061e143cf 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index b9023ffbc..10ae761a0 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/CMakeLists.txt b/src/lcals-kokkos/CMakeLists.txt index 47e5b48c8..7cb6706e8 100644 --- a/src/lcals-kokkos/CMakeLists.txt +++ b/src/lcals-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index 4c7dd6b39..b8d8311ba 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp index be30c0b60..2046b540d 100644 --- a/src/lcals-kokkos/EOS-Kokkos.cpp +++ b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp index 02ae5097e..071e2687c 100644 --- a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp index cd2957436..ebc31ddff 100644 --- a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp index b7da76fd0..37b2d0c41 100644 --- a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp index 00960c3aa..8dce97c22 100644 --- a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp index 20e05fde4..a2fdcfd02 100644 --- a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp index 45761b11e..e9b388105 100644 --- a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp index 451e6fe77..7609b3f3c 100644 --- a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp index b2c582790..e5263cf07 100644 --- a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp index ac0943dd8..f0ec388e7 100644 --- a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp +++ b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index f767bbd0b..267f6756e 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp index a7c42aa0e..a33f1aecf 100644 --- a/src/lcals/DIFF_PREDICT-Cuda.cpp +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-Hip.cpp b/src/lcals/DIFF_PREDICT-Hip.cpp index 7a01975cb..6d25a6f42 100644 --- a/src/lcals/DIFF_PREDICT-Hip.cpp +++ b/src/lcals/DIFF_PREDICT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMP.cpp b/src/lcals/DIFF_PREDICT-OMP.cpp index 09da23262..6e2110edb 100644 --- a/src/lcals/DIFF_PREDICT-OMP.cpp +++ b/src/lcals/DIFF_PREDICT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp index e04b1e07d..3509b6aaa 100644 --- a/src/lcals/DIFF_PREDICT-OMPTarget.cpp +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
 //
diff --git a/src/lcals/DIFF_PREDICT-Seq.cpp b/src/lcals/DIFF_PREDICT-Seq.cpp
index eae7cda8f..9dcd9a035 100644
--- a/src/lcals/DIFF_PREDICT-Seq.cpp
+++ b/src/lcals/DIFF_PREDICT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp
index b5ddc90e4..734e1d1f9 100644
--- a/src/lcals/DIFF_PREDICT.cpp
+++ b/src/lcals/DIFF_PREDICT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp
index 3a583381b..79ff8f9ac 100644
--- a/src/lcals/DIFF_PREDICT.hpp
+++ b/src/lcals/DIFF_PREDICT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp
index 1a6b3eb43..fafbdef56 100644
--- a/src/lcals/EOS-Cuda.cpp
+++ b/src/lcals/EOS-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS-Hip.cpp b/src/lcals/EOS-Hip.cpp
index 2f6079fbb..35c80e320 100644
--- a/src/lcals/EOS-Hip.cpp
+++ b/src/lcals/EOS-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS-OMP.cpp b/src/lcals/EOS-OMP.cpp
index 7ac9cdb8f..88e8e9da1 100644
--- a/src/lcals/EOS-OMP.cpp
+++ b/src/lcals/EOS-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp
index 16a6b841b..b9bf454eb 100644
--- a/src/lcals/EOS-OMPTarget.cpp
+++ b/src/lcals/EOS-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS-Seq.cpp b/src/lcals/EOS-Seq.cpp
index 083fc343e..384a9d260 100644
--- a/src/lcals/EOS-Seq.cpp
+++ b/src/lcals/EOS-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp
index 517d144f8..843b802d3 100644
--- a/src/lcals/EOS.cpp
+++ b/src/lcals/EOS.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp
index 9cc202a02..91a4c1f00 100644
--- a/src/lcals/EOS.hpp
+++ b/src/lcals/EOS.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp
index 8daf8e571..2101da14f 100644
--- a/src/lcals/FIRST_DIFF-Cuda.cpp
+++ b/src/lcals/FIRST_DIFF-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF-Hip.cpp b/src/lcals/FIRST_DIFF-Hip.cpp
index 1768e7851..666b9783d 100644
--- a/src/lcals/FIRST_DIFF-Hip.cpp
+++ b/src/lcals/FIRST_DIFF-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF-OMP.cpp b/src/lcals/FIRST_DIFF-OMP.cpp
index a3b814124..b664bfbf7 100644
--- a/src/lcals/FIRST_DIFF-OMP.cpp
+++ b/src/lcals/FIRST_DIFF-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp
index 341ef57f4..bf3a40ad9 100644
--- a/src/lcals/FIRST_DIFF-OMPTarget.cpp
+++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF-Seq.cpp b/src/lcals/FIRST_DIFF-Seq.cpp
index 54d2a8ce1..2382015e0 100644
--- a/src/lcals/FIRST_DIFF-Seq.cpp
+++ b/src/lcals/FIRST_DIFF-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp
index 3e8e42ec6..a49e05ed6 100644
--- a/src/lcals/FIRST_DIFF.cpp
+++ b/src/lcals/FIRST_DIFF.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp
index f3f6424f0..850f23852 100644
--- a/src/lcals/FIRST_DIFF.hpp
+++ b/src/lcals/FIRST_DIFF.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp
index 7a70c8991..f2433183d 100644
--- a/src/lcals/FIRST_MIN-Cuda.cpp
+++ b/src/lcals/FIRST_MIN-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp
index ef577aa72..fde3100e1 100644
--- a/src/lcals/FIRST_MIN-Hip.cpp
+++ b/src/lcals/FIRST_MIN-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp
index 1a7722570..7176bcacf 100644
--- a/src/lcals/FIRST_MIN-OMP.cpp
+++ b/src/lcals/FIRST_MIN-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp
index 5a4dccc69..dd39908d8 100644
--- a/src/lcals/FIRST_MIN-OMPTarget.cpp
+++ b/src/lcals/FIRST_MIN-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp
index 6e5de0437..3ac609753 100644
--- a/src/lcals/FIRST_MIN-Seq.cpp
+++ b/src/lcals/FIRST_MIN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp
index 875932958..89b741d88 100644
--- a/src/lcals/FIRST_MIN.cpp
+++ b/src/lcals/FIRST_MIN.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp
index afe90a554..f8d8192b9 100644
--- a/src/lcals/FIRST_MIN.hpp
+++ b/src/lcals/FIRST_MIN.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM-Cuda.cpp b/src/lcals/FIRST_SUM-Cuda.cpp
index 9869ccf80..b4a025a20 100644
--- a/src/lcals/FIRST_SUM-Cuda.cpp
+++ b/src/lcals/FIRST_SUM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM-Hip.cpp b/src/lcals/FIRST_SUM-Hip.cpp
index 9d8258a28..01c2082d5 100644
--- a/src/lcals/FIRST_SUM-Hip.cpp
+++ b/src/lcals/FIRST_SUM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM-OMP.cpp b/src/lcals/FIRST_SUM-OMP.cpp
index e545538fc..223379dbe 100644
--- a/src/lcals/FIRST_SUM-OMP.cpp
+++ b/src/lcals/FIRST_SUM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM-OMPTarget.cpp b/src/lcals/FIRST_SUM-OMPTarget.cpp
index 324b26d54..932f32fc4 100644
--- a/src/lcals/FIRST_SUM-OMPTarget.cpp
+++ b/src/lcals/FIRST_SUM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp
index 4d3ef658f..4eba8626e 100644
--- a/src/lcals/FIRST_SUM-Seq.cpp
+++ b/src/lcals/FIRST_SUM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp
index 046528e2b..3c2c0e03b 100644
--- a/src/lcals/FIRST_SUM.cpp
+++ b/src/lcals/FIRST_SUM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp
index 59c1c0bfd..ddf5d9c33 100644
--- a/src/lcals/FIRST_SUM.hpp
+++ b/src/lcals/FIRST_SUM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR-Cuda.cpp b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
index 9569f020b..17e56a2a0 100644
--- a/src/lcals/GEN_LIN_RECUR-Cuda.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR-Hip.cpp b/src/lcals/GEN_LIN_RECUR-Hip.cpp
index 86eb64884..5d428fa87 100644
--- a/src/lcals/GEN_LIN_RECUR-Hip.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR-OMP.cpp b/src/lcals/GEN_LIN_RECUR-OMP.cpp
index 660d47273..d4ac65995 100644
--- a/src/lcals/GEN_LIN_RECUR-OMP.cpp
+++ b/src/lcals/GEN_LIN_RECUR-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp
index 3838a2af0..9932469cb 100644
--- a/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp
+++ b/src/lcals/GEN_LIN_RECUR-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR-Seq.cpp b/src/lcals/GEN_LIN_RECUR-Seq.cpp
index 9d728a9f7..b1d6c3be5 100644
--- a/src/lcals/GEN_LIN_RECUR-Seq.cpp
+++ b/src/lcals/GEN_LIN_RECUR-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp
index 9c132a3db..f9c21ebd3 100644
--- a/src/lcals/GEN_LIN_RECUR.cpp
+++ b/src/lcals/GEN_LIN_RECUR.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp
index 9586a69b4..1bcc39cf6 100644
--- a/src/lcals/GEN_LIN_RECUR.hpp
+++ b/src/lcals/GEN_LIN_RECUR.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp
index 66f038750..4d89cc5b6 100644
--- a/src/lcals/HYDRO_1D-Cuda.cpp
+++ b/src/lcals/HYDRO_1D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D-Hip.cpp b/src/lcals/HYDRO_1D-Hip.cpp
index c79786d21..aa2a12c99 100644
--- a/src/lcals/HYDRO_1D-Hip.cpp
+++ b/src/lcals/HYDRO_1D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D-OMP.cpp b/src/lcals/HYDRO_1D-OMP.cpp
index f2088205a..d3ac150a4 100644
--- a/src/lcals/HYDRO_1D-OMP.cpp
+++ b/src/lcals/HYDRO_1D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp
index b5fbe0657..b5cf82420 100644
--- a/src/lcals/HYDRO_1D-OMPTarget.cpp
+++ b/src/lcals/HYDRO_1D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D-Seq.cpp b/src/lcals/HYDRO_1D-Seq.cpp
index 47ca2aedd..22f257e8d 100644
--- a/src/lcals/HYDRO_1D-Seq.cpp
+++ b/src/lcals/HYDRO_1D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp
index d92267fc9..b26279013 100644
--- a/src/lcals/HYDRO_1D.cpp
+++ b/src/lcals/HYDRO_1D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp
index dd61f112c..980b92281 100644
--- a/src/lcals/HYDRO_1D.hpp
+++ b/src/lcals/HYDRO_1D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D-Cuda.cpp b/src/lcals/HYDRO_2D-Cuda.cpp
index d10875074..ad09dd2a3 100644
--- a/src/lcals/HYDRO_2D-Cuda.cpp
+++ b/src/lcals/HYDRO_2D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D-Hip.cpp b/src/lcals/HYDRO_2D-Hip.cpp
index aeb4bdf29..58b530ba8 100644
--- a/src/lcals/HYDRO_2D-Hip.cpp
+++ b/src/lcals/HYDRO_2D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D-OMP.cpp b/src/lcals/HYDRO_2D-OMP.cpp
index e153dbdca..92f1bb080 100644
--- a/src/lcals/HYDRO_2D-OMP.cpp
+++ b/src/lcals/HYDRO_2D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D-OMPTarget.cpp b/src/lcals/HYDRO_2D-OMPTarget.cpp
index 43304884b..f830feefb 100644
--- a/src/lcals/HYDRO_2D-OMPTarget.cpp
+++ b/src/lcals/HYDRO_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D-Seq.cpp b/src/lcals/HYDRO_2D-Seq.cpp
index cf43e885f..522dba679 100644
--- a/src/lcals/HYDRO_2D-Seq.cpp
+++ b/src/lcals/HYDRO_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp
index fd1dd9406..071811c16 100644
--- a/src/lcals/HYDRO_2D.cpp
+++ b/src/lcals/HYDRO_2D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp
index b6ad936ca..e65af1320 100644
--- a/src/lcals/HYDRO_2D.hpp
+++ b/src/lcals/HYDRO_2D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp
index b3c70cdda..3ec139130 100644
--- a/src/lcals/INT_PREDICT-Cuda.cpp
+++ b/src/lcals/INT_PREDICT-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-Hip.cpp b/src/lcals/INT_PREDICT-Hip.cpp
index ccf226523..1e2741cd7 100644
--- a/src/lcals/INT_PREDICT-Hip.cpp
+++ b/src/lcals/INT_PREDICT-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-OMP.cpp b/src/lcals/INT_PREDICT-OMP.cpp
index 29b167881..b33e5cd2b 100644
--- a/src/lcals/INT_PREDICT-OMP.cpp
+++ b/src/lcals/INT_PREDICT-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp
index 4172c1822..a7e257532 100644
--- a/src/lcals/INT_PREDICT-OMPTarget.cpp
+++ b/src/lcals/INT_PREDICT-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT-Seq.cpp b/src/lcals/INT_PREDICT-Seq.cpp
index de167bc11..1d8e52fda 100644
--- a/src/lcals/INT_PREDICT-Seq.cpp
+++ b/src/lcals/INT_PREDICT-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp
index eb56b5725..b43393ddd 100644
--- a/src/lcals/INT_PREDICT.cpp
+++ b/src/lcals/INT_PREDICT.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp
index a81ae6fb2..ff7a834d9 100644
--- a/src/lcals/INT_PREDICT.hpp
+++ b/src/lcals/INT_PREDICT.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp
index 68265ab4e..40a8bf7f0 100644
--- a/src/lcals/PLANCKIAN-Cuda.cpp
+++ b/src/lcals/PLANCKIAN-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Hip.cpp b/src/lcals/PLANCKIAN-Hip.cpp
index b345e0ec3..00323115d 100644
--- a/src/lcals/PLANCKIAN-Hip.cpp
+++ b/src/lcals/PLANCKIAN-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-OMP.cpp b/src/lcals/PLANCKIAN-OMP.cpp
index cc90067eb..e82f9eccd 100644
--- a/src/lcals/PLANCKIAN-OMP.cpp
+++ b/src/lcals/PLANCKIAN-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp
index fb0f41cef..c69732531 100644
--- a/src/lcals/PLANCKIAN-OMPTarget.cpp
+++ b/src/lcals/PLANCKIAN-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN-Seq.cpp b/src/lcals/PLANCKIAN-Seq.cpp
index 25ff3ff2b..56c57971b 100644
--- a/src/lcals/PLANCKIAN-Seq.cpp
+++ b/src/lcals/PLANCKIAN-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp
index 2bb8d3f7b..06e74f59f 100644
--- a/src/lcals/PLANCKIAN.cpp
+++ b/src/lcals/PLANCKIAN.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp
index 92b55fc95..dbbf9ceef 100644
--- a/src/lcals/PLANCKIAN.hpp
+++ b/src/lcals/PLANCKIAN.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Cuda.cpp b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
index 691817124..18cc284ea 100644
--- a/src/lcals/TRIDIAG_ELIM-Cuda.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Hip.cpp b/src/lcals/TRIDIAG_ELIM-Hip.cpp
index 9afd6bb44..1b0db7e7f 100644
--- a/src/lcals/TRIDIAG_ELIM-Hip.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-OMP.cpp b/src/lcals/TRIDIAG_ELIM-OMP.cpp
index 8f31c9493..22673b5f7 100644
--- a/src/lcals/TRIDIAG_ELIM-OMP.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
index 59a8a323c..5433879a5 100644
--- a/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
+++ b/src/lcals/TRIDIAG_ELIM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM-Seq.cpp b/src/lcals/TRIDIAG_ELIM-Seq.cpp
index 5c0003d93..0b23c9143 100644
--- a/src/lcals/TRIDIAG_ELIM-Seq.cpp
+++ b/src/lcals/TRIDIAG_ELIM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp
index 710927c3e..efd58a02a 100644
--- a/src/lcals/TRIDIAG_ELIM.cpp
+++ b/src/lcals/TRIDIAG_ELIM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp
index c95685de9..b4e101103 100644
--- a/src/lcals/TRIDIAG_ELIM.hpp
+++ b/src/lcals/TRIDIAG_ELIM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt
index f9cd2c1c2..ff076b8f9 100644
--- a/src/polybench/CMakeLists.txt
+++ b/src/polybench/CMakeLists.txt
@@ -1,5 +1,5 @@
 ###############################################################################
-# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 # and RAJA Performance Suite project contributors.
 # See the RAJAPerf/LICENSE file for details.
 #
diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp
index 5cb0f48c8..28b49c779 100644
--- a/src/polybench/POLYBENCH_2MM-Cuda.cpp
+++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Hip.cpp b/src/polybench/POLYBENCH_2MM-Hip.cpp
index 5dac8fa96..28308ef32 100644
--- a/src/polybench/POLYBENCH_2MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_2MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-OMP.cpp b/src/polybench/POLYBENCH_2MM-OMP.cpp
index 8b6cdb290..b73813df8 100644
--- a/src/polybench/POLYBENCH_2MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
index 79d6f96c0..781139422 100644
--- a/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_2MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp
index 1a3120246..5cffa3207 100644
--- a/src/polybench/POLYBENCH_2MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_2MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index 5c0ebe484..af5a4450b 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp
index e11d4889b..944a88bda 100644
--- a/src/polybench/POLYBENCH_2MM.hpp
+++ b/src/polybench/POLYBENCH_2MM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp
index e20c71aa2..401660aca 100644
--- a/src/polybench/POLYBENCH_3MM-Cuda.cpp
+++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Hip.cpp b/src/polybench/POLYBENCH_3MM-Hip.cpp
index a6fd1ba59..53e106ad1 100644
--- a/src/polybench/POLYBENCH_3MM-Hip.cpp
+++ b/src/polybench/POLYBENCH_3MM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-OMP.cpp b/src/polybench/POLYBENCH_3MM-OMP.cpp
index 966853d7d..19b15098a 100644
--- a/src/polybench/POLYBENCH_3MM-OMP.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
index c25a49dee..d165426d1 100644
--- a/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_3MM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM-Seq.cpp b/src/polybench/POLYBENCH_3MM-Seq.cpp
index 619b2ff10..24098e109 100644
--- a/src/polybench/POLYBENCH_3MM-Seq.cpp
+++ b/src/polybench/POLYBENCH_3MM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp
index a649e2e89..d05332c8f 100644
--- a/src/polybench/POLYBENCH_3MM.cpp
+++ b/src/polybench/POLYBENCH_3MM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp
index 4331e3930..c95c9b000 100644
--- a/src/polybench/POLYBENCH_3MM.hpp
+++ b/src/polybench/POLYBENCH_3MM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI-Cuda.cpp b/src/polybench/POLYBENCH_ADI-Cuda.cpp
index a43da0efc..ff5216dc9 100644
--- a/src/polybench/POLYBENCH_ADI-Cuda.cpp
+++ b/src/polybench/POLYBENCH_ADI-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI-Hip.cpp b/src/polybench/POLYBENCH_ADI-Hip.cpp
index 151d0d81c..51a69a986 100644
--- a/src/polybench/POLYBENCH_ADI-Hip.cpp
+++ b/src/polybench/POLYBENCH_ADI-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI-OMP.cpp b/src/polybench/POLYBENCH_ADI-OMP.cpp
index 022888a54..08df1e9f7 100644
--- a/src/polybench/POLYBENCH_ADI-OMP.cpp
+++ b/src/polybench/POLYBENCH_ADI-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp
index c67e5a20a..8cee39e2c 100644
--- a/src/polybench/POLYBENCH_ADI-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_ADI-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI-Seq.cpp b/src/polybench/POLYBENCH_ADI-Seq.cpp
index 899a9a57b..3ec703a50 100644
--- a/src/polybench/POLYBENCH_ADI-Seq.cpp
+++ b/src/polybench/POLYBENCH_ADI-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp
index 7a31468a6..54997a63b 100644
--- a/src/polybench/POLYBENCH_ADI.cpp
+++ b/src/polybench/POLYBENCH_ADI.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp
index 848fb9dc4..dcbd3573e 100644
--- a/src/polybench/POLYBENCH_ADI.hpp
+++ b/src/polybench/POLYBENCH_ADI.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX-Cuda.cpp b/src/polybench/POLYBENCH_ATAX-Cuda.cpp
index e244b08db..e83dc9590 100644
--- a/src/polybench/POLYBENCH_ATAX-Cuda.cpp
+++ b/src/polybench/POLYBENCH_ATAX-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX-Hip.cpp b/src/polybench/POLYBENCH_ATAX-Hip.cpp
index f918d355e..bbed90a83 100644
--- a/src/polybench/POLYBENCH_ATAX-Hip.cpp
+++ b/src/polybench/POLYBENCH_ATAX-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX-OMP.cpp b/src/polybench/POLYBENCH_ATAX-OMP.cpp
index fda8ab7fd..a3880cc8f 100644
--- a/src/polybench/POLYBENCH_ATAX-OMP.cpp
+++ b/src/polybench/POLYBENCH_ATAX-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp
index e9e13e9cd..ce7ba4843 100644
--- a/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_ATAX-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX-Seq.cpp b/src/polybench/POLYBENCH_ATAX-Seq.cpp
index 05a19093d..4791b7018 100644
--- a/src/polybench/POLYBENCH_ATAX-Seq.cpp
+++ b/src/polybench/POLYBENCH_ATAX-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp
index 5a9d15e89..693420ca1 100644
--- a/src/polybench/POLYBENCH_ATAX.cpp
+++ b/src/polybench/POLYBENCH_ATAX.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp
index f94ade140..ea948fbe1 100644
--- a/src/polybench/POLYBENCH_ATAX.hpp
+++ b/src/polybench/POLYBENCH_ATAX.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp
index 51c1f1f90..b71772a99 100644
--- a/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp
index 4b5e4c4ec..ddb62d9a5 100644
--- a/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp
index 28d06bdc7..6eaf696f3 100644
--- a/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp
index c34d939ad..be8c0491e 100644
--- a/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp
index cc23d5a18..ed8e43833 100644
--- a/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp
index ed2432d87..575c40e3f 100644
--- a/src/polybench/POLYBENCH_FDTD_2D.cpp
+++ b/src/polybench/POLYBENCH_FDTD_2D.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp
index e1d1b67c3..19d0a3db2 100644
--- a/src/polybench/POLYBENCH_FDTD_2D.hpp
+++ b/src/polybench/POLYBENCH_FDTD_2D.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp
index 87ebd0adf..5edb43b97 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp
index 144abf695..ec408396a 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp
index 1b2f57e4d..8aab52a55 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp
index e0e5d6d54..f3af4a088 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp
index 36ac66a84..8b088db8e 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
index 03c1e65ba..614ba9ed1 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp
index e8a067377..c2901a838 100644
--- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp
+++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM-Cuda.cpp b/src/polybench/POLYBENCH_GEMM-Cuda.cpp
index a1cd2ec2f..2307900dd 100644
--- a/src/polybench/POLYBENCH_GEMM-Cuda.cpp
+++ b/src/polybench/POLYBENCH_GEMM-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM-Hip.cpp b/src/polybench/POLYBENCH_GEMM-Hip.cpp
index ce1b73c0d..f92beaed0 100644
--- a/src/polybench/POLYBENCH_GEMM-Hip.cpp
+++ b/src/polybench/POLYBENCH_GEMM-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM-OMP.cpp b/src/polybench/POLYBENCH_GEMM-OMP.cpp
index 21f63e7f2..444af4df7 100644
--- a/src/polybench/POLYBENCH_GEMM-OMP.cpp
+++ b/src/polybench/POLYBENCH_GEMM-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp
index a1d618b5b..a660ec35e 100644
--- a/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_GEMM-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM-Seq.cpp b/src/polybench/POLYBENCH_GEMM-Seq.cpp
index 84fa70002..1aaaab119 100644
--- a/src/polybench/POLYBENCH_GEMM-Seq.cpp
+++ b/src/polybench/POLYBENCH_GEMM-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp
index 6094ce908..63897bd37 100644
--- a/src/polybench/POLYBENCH_GEMM.cpp
+++ b/src/polybench/POLYBENCH_GEMM.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp
index 33ea77997..ef9e6121d 100644
--- a/src/polybench/POLYBENCH_GEMM.hpp
+++ b/src/polybench/POLYBENCH_GEMM.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp
index 0b02c7e57..d5119ede5 100644
--- a/src/polybench/POLYBENCH_GEMVER-Cuda.cpp
+++ b/src/polybench/POLYBENCH_GEMVER-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER-Hip.cpp b/src/polybench/POLYBENCH_GEMVER-Hip.cpp
index 5d1fa4be3..f51e15d42 100644
--- a/src/polybench/POLYBENCH_GEMVER-Hip.cpp
+++ b/src/polybench/POLYBENCH_GEMVER-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp
index a20872867..5af84061d 100644
--- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp
+++ b/src/polybench/POLYBENCH_GEMVER-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp
index b12be578a..29f487d73 100644
--- a/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_GEMVER-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp
index 5a4f9199a..c1524a2ef 100644
--- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp
+++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp
index 7223f85fd..657beb3e0 100644
--- a/src/polybench/POLYBENCH_GEMVER.cpp
+++ b/src/polybench/POLYBENCH_GEMVER.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp
index 07ecae962..a30448c7b 100644
--- a/src/polybench/POLYBENCH_GEMVER.hpp
+++ b/src/polybench/POLYBENCH_GEMVER.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp
index c2772e506..24ed43947 100644
--- a/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp
index 2105ce55c..5a156b799 100644
--- a/src/polybench/POLYBENCH_GESUMMV-Hip.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp
index f9efd4d31..bc59ada36 100644
--- a/src/polybench/POLYBENCH_GESUMMV-OMP.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV-OMP.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp
index 86e73b293..8f572c16a 100644
--- a/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV-OMPTarget.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp
index d7ba3fc70..34b70708f 100644
--- a/src/polybench/POLYBENCH_GESUMMV-Seq.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV-Seq.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp
index ea8e2224f..d227756da 100644
--- a/src/polybench/POLYBENCH_GESUMMV.cpp
+++ b/src/polybench/POLYBENCH_GESUMMV.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp
index 32a1b0eae..507e8baaa 100644
--- a/src/polybench/POLYBENCH_GESUMMV.hpp
+++ b/src/polybench/POLYBENCH_GESUMMV.hpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp
index 9bf147bdd..70fa00a76 100644
--- a/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D-Cuda.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
 //
diff --git a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp
index 224630f63..6a2fb3329 100644
--- a/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D-Hip.cpp
@@ -1,5 +1,5 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
 // and RAJA Performance Suite project contributors.
 // See the RAJAPerf/LICENSE file for details.
// diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp index 1b9380a15..ba80f5022 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp index 7a70c3f87..1c3999279 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp index 25af09240..115661cc7 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 4f14b54f9..2c3487c14 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 03150a267..28e860a02 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp index 9ff945c92..570792bbd 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp index ce9336999..d77497459 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp index 42ae4a0d5..0c7cbae57 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp index 39a2423df..35089be71 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 20c8c9b73..383822a87 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index b2beb0dfd..cd56dd74c 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 5c84e0682..52ba1e3ca 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp index 5826a90a4..2620d0654 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp index 7c4a3f835..8aac79440 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp index 51f3cb146..d3d7b0471 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp index 97806cfac..e711660cc 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp index 107dd4ec4..18cc343cd 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9fe51e5c1..4216c115a 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index fe77836cb..aed073955 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Cuda.cpp b/src/polybench/POLYBENCH_MVT-Cuda.cpp index d1990d8fe..83ea50512 100644 --- a/src/polybench/POLYBENCH_MVT-Cuda.cpp +++ b/src/polybench/POLYBENCH_MVT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Hip.cpp b/src/polybench/POLYBENCH_MVT-Hip.cpp index 7d33627bd..636ad234c 100644 --- a/src/polybench/POLYBENCH_MVT-Hip.cpp +++ b/src/polybench/POLYBENCH_MVT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-OMP.cpp b/src/polybench/POLYBENCH_MVT-OMP.cpp index 159a86274..bb9c8f221 100644 --- a/src/polybench/POLYBENCH_MVT-OMP.cpp +++ b/src/polybench/POLYBENCH_MVT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp index c9ff17751..5b278628d 100644 --- a/src/polybench/POLYBENCH_MVT-OMPTarget.cpp +++ b/src/polybench/POLYBENCH_MVT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT-Seq.cpp b/src/polybench/POLYBENCH_MVT-Seq.cpp index 9d63fd997..efa2ec452 100644 --- a/src/polybench/POLYBENCH_MVT-Seq.cpp +++ b/src/polybench/POLYBENCH_MVT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index c0a5b8bb9..41caad0d4 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 518d75dd8..44d645648 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index d0e41694f..582482471 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -9,7 +9,7 @@ */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp index 51e5bdf81..1a4280cc9 100644 --- a/src/stream-kokkos/ADD-Kokkos.cpp +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream-kokkos/CMakeLists.txt b/src/stream-kokkos/CMakeLists.txt index 4cd38bdf5..6ba8dbbb6 100644 --- a/src/stream-kokkos/CMakeLists.txt +++ b/src/stream-kokkos/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp index d363cd944..3312a57fa 100644 --- a/src/stream-kokkos/COPY-Kokkos.cpp +++ b/src/stream-kokkos/COPY-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp index ca6b0e304..ff1124068 100644 --- a/src/stream-kokkos/DOT-Kokkos.cpp +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp index aa53b0d66..e1f17be92 100644 --- a/src/stream-kokkos/MUL-Kokkos.cpp +++ b/src/stream-kokkos/MUL-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index 3b897a46a..2d5465939 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp index c70add7db..7b79f1b10 100644 --- a/src/stream/ADD-Cuda.cpp +++ b/src/stream/ADD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 0ff31e74f..fe470d391 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMP.cpp b/src/stream/ADD-OMP.cpp index ddd24eb30..22f850da3 100644 --- a/src/stream/ADD-OMP.cpp +++ b/src/stream/ADD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp index 6e4302446..c1a1480cf 100644 --- a/src/stream/ADD-OMPTarget.cpp +++ b/src/stream/ADD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. 
// See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index a07fe24d6..516fe61a6 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 02cf25107..82a2ef653 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 49e09a602..5b17d398a 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index 03351ff5d..b730791fc 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp index 5977b8926..a45d45a16 100644 --- a/src/stream/COPY-Cuda.cpp +++ b/src/stream/COPY-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 51d66459f..f5e19fac0 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-OMP.cpp b/src/stream/COPY-OMP.cpp index d9a0aa2a9..1718ff5ac 100644 --- a/src/stream/COPY-OMP.cpp +++ b/src/stream/COPY-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp index a9250c4cd..f1dd5017d 100644 --- a/src/stream/COPY-OMPTarget.cpp +++ b/src/stream/COPY-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 311f9754d..25b897707 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index c92018c63..b1b946664 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 0544e0d2f..0e639e68d 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index cce51a90c..45a2a5a8d 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 000331381..0badd32fb 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 6b7d67e0e..59e4cdf22 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index 7ab5d578e..f9a049770 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index fe7568191..715d99cce 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index cc32be5f2..f9500c992 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 05e304973..ca3330d2b 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp index 2fa56fd1a..55731255b 100644 --- a/src/stream/MUL-Cuda.cpp +++ b/src/stream/MUL-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index d2cd11b7c..0990ac09b 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-OMP.cpp b/src/stream/MUL-OMP.cpp index 3369d0f3d..e5a17864e 100644 --- a/src/stream/MUL-OMP.cpp +++ b/src/stream/MUL-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp index c5f20d6b3..07edb732f 100644 --- a/src/stream/MUL-OMPTarget.cpp +++ b/src/stream/MUL-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index 9107945fd..8bffdb3ca 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index fba825bf6..fa9e2b995 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 3db59092a..aea2cd08b 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp index b5a3f2e34..89f931f6c 100644 --- a/src/stream/TRIAD-Cuda.cpp +++ b/src/stream/TRIAD-Cuda.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 6b42deabe..aebaa3ec1 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-OMP.cpp b/src/stream/TRIAD-OMP.cpp index 5d9832d95..abbadb240 100644 --- a/src/stream/TRIAD-OMP.cpp +++ b/src/stream/TRIAD-OMP.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/TRIAD-OMPTarget.cpp b/src/stream/TRIAD-OMPTarget.cpp index dfea3158d..5ec18d155 100644 --- a/src/stream/TRIAD-OMPTarget.cpp +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp index 96ab6ccea..132892f76 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index d9897618c..1d64f153d 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 3f65bf804..efc2e8d78 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 38329f98f..88b61bbe5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # diff --git a/test/test-raja-perf-suite.cpp b/test/test-raja-perf-suite.cpp index f5992f3a9..6f36958c0 100644 --- a/test/test-raja-perf-suite.cpp +++ b/test/test-raja-perf-suite.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
//

From 40b595c928ae0296671f86ce93b92329995c196f Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Mon, 8 Jan 2024 10:21:42 -0800
Subject: [PATCH 244/454] Update raja for camp support for rocm 6

---
 tpl/RAJA | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpl/RAJA b/tpl/RAJA
index 19116582a..f3e0fc5ed 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 19116582ad9ddd87b724656be36bb73e624b13f2
+Subproject commit f3e0fc5ed3bb0e8dd7dcb4d822a00c0875ad0e7a

From ce1ea95623f8feaaea93d693a0c92459cdfd502f Mon Sep 17 00:00:00 2001
From: Brian Homerding
Date: Fri, 19 Jan 2024 14:47:52 +0000
Subject: [PATCH 245/454] Add LCALS SYCL variants and updated NESTED_INIT
 policy

---
 src/basic/NESTED_INIT-Sycl.cpp   |  8 +--
 src/lcals/CMakeLists.txt         |  9 ++++
 src/lcals/DIFF_PREDICT-Sycl.cpp  | 38 ++++----------
 src/lcals/DIFF_PREDICT.cpp       |  3 ++
 src/lcals/DIFF_PREDICT.hpp       |  4 ++
 src/lcals/EOS-Sycl.cpp           | 43 ++++-----------
 src/lcals/EOS.cpp                |  3 ++
 src/lcals/EOS.hpp                |  4 ++
 src/lcals/FIRST_DIFF-Sycl.cpp    | 40 ++++----------
 src/lcals/FIRST_DIFF.cpp         |  3 ++
 src/lcals/FIRST_DIFF.hpp         |  4 ++
 src/lcals/FIRST_MIN.hpp          |  1 +
 src/lcals/FIRST_SUM.hpp          |  1 +
 src/lcals/GEN_LIN_RECUR-Sycl.cpp | 48 +++++------------
 src/lcals/GEN_LIN_RECUR.cpp      |  3 ++
 src/lcals/GEN_LIN_RECUR.hpp      |  4 ++
 src/lcals/HYDRO_1D-Sycl.cpp      | 42 ++++-----------
 src/lcals/HYDRO_1D.cpp           |  3 ++
 src/lcals/HYDRO_1D.hpp           |  4 ++
 src/lcals/HYDRO_2D-Sycl.cpp      | 89 ++++++--------------------------
 src/lcals/HYDRO_2D.cpp           |  3 ++
 src/lcals/HYDRO_2D.hpp           |  4 ++
 src/lcals/INT_PREDICT-Sycl.cpp   | 35 +++-----------
 src/lcals/INT_PREDICT.cpp        |  3 ++
 src/lcals/INT_PREDICT.hpp        |  4 ++
 src/lcals/PLANCKIAN-Sycl.cpp     | 45 ++++------------
 src/lcals/PLANCKIAN.cpp          |  3 ++
 src/lcals/PLANCKIAN.hpp          |  4 ++
 src/lcals/TRIDIAG_ELIM-Sycl.cpp  | 40 +++-----------
 src/lcals/TRIDIAG_ELIM.cpp       |  3 ++
 src/lcals/TRIDIAG_ELIM.hpp       |  4 ++
 31 files changed, 167 insertions(+), 335 deletions(-)

diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp
index b7351cedb..94abf0f7e 100644
--- a/src/basic/NESTED_INIT-Sycl.cpp
+++ b/src/basic/NESTED_INIT-Sycl.cpp
@@ -98,9 +98,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid)
     using EXEC_POL =
       RAJA::KernelPolicy<
         RAJA::statement::SyclKernelAsync<
-          RAJA::statement::For<2, RAJA::sycl_global_2<1>,      // k
+          RAJA::statement::For<0, RAJA::sycl_global_2,         // k
             RAJA::statement::For<1, RAJA::sycl_global_1<1>,    // j
-              RAJA::statement::For<0, RAJA::sycl_global_0,     // i
+              RAJA::statement::For<2, RAJA::sycl_global_0<1>,  // i
                 RAJA::statement::Lambda<0>
               >
             >
@@ -111,9 +111,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-      RAJA::kernel<EXEC_POL>( RAJA::make_tuple(RAJA::RangeSegment(0, ni),
+      RAJA::kernel<EXEC_POL>( RAJA::make_tuple(RAJA::RangeSegment(0, nk),
                                                RAJA::RangeSegment(0, nj),
-                                               RAJA::RangeSegment(0, nk)),
+                                               RAJA::RangeSegment(0, ni)),
         [=] (Index_type i, Index_type j, Index_type k) {
           NESTED_INIT_BODY;
         });
diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt
index f767bbd0b..b4eb6120c 100644
--- a/src/lcals/CMakeLists.txt
+++ b/src/lcals/CMakeLists.txt
@@ -14,18 +14,21 @@ blt_add_library(
   DIFF_PREDICT-Cuda.cpp
   DIFF_PREDICT-OMP.cpp
   DIFF_PREDICT-OMPTarget.cpp
+  DIFF_PREDICT-Sycl.cpp
   EOS.cpp
   EOS-Seq.cpp
   EOS-Hip.cpp
   EOS-Cuda.cpp
   EOS-OMP.cpp
   EOS-OMPTarget.cpp
+  EOS-Sycl.cpp
   FIRST_DIFF.cpp
   FIRST_DIFF-Seq.cpp
   FIRST_DIFF-Hip.cpp
   FIRST_DIFF-Cuda.cpp
   FIRST_DIFF-OMP.cpp
   FIRST_DIFF-OMPTarget.cpp
+  FIRST_DIFF-Sycl.cpp
   FIRST_MIN.cpp
   FIRST_MIN-Seq.cpp
   FIRST_MIN-Hip.cpp
@@
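A note on the NESTED_INIT hunks above. NESTED_INIT_BODY indexes array[i + ni*(j + nj*k)], so i is the unit-stride index; the reordered policy maps it to SYCL dimension 0, which varies fastest across work-items, instead of dimension 2. The following sketch shows the intended pairing of segments, policy dimensions, and lambda arguments, assuming the RAJA SYCL policies used in the patch. One caveat: RAJA::kernel binds lambda arguments in segment-tuple order, so once the tuple becomes (nk, nj, ni) the lambda arguments should read (k, j, i); the hunk's unchanged context line still reads (i, j, k), which would bind i to the [0, nk) segment.

    // Minimal sketch of the intended index-to-dimension mapping.
    using EXEC_POL =
      RAJA::KernelPolicy<
        RAJA::statement::SyclKernelAsync<
          RAJA::statement::For<0, RAJA::sycl_global_2,         // k -> dim 2 (slowest)
            RAJA::statement::For<1, RAJA::sycl_global_1<1>,    // j -> dim 1
              RAJA::statement::For<2, RAJA::sycl_global_0<1>,  // i -> dim 0 (fastest)
                RAJA::statement::Lambda<0>
              >
            >
          >
        >
      >;

    RAJA::kernel<EXEC_POL>(
        RAJA::make_tuple(RAJA::RangeSegment(0, nk),   // tuple slot 0 -> k
                         RAJA::RangeSegment(0, nj),   // tuple slot 1 -> j
                         RAJA::RangeSegment(0, ni)),  // tuple slot 2 -> i
        [=] (Index_type k, Index_type j, Index_type i) {
          NESTED_INIT_BODY;  // array[i + ni*(j + nj*k)] = ...
        });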
-44,35 +47,41 @@ blt_add_library( GEN_LIN_RECUR-Cuda.cpp GEN_LIN_RECUR-OMP.cpp GEN_LIN_RECUR-OMPTarget.cpp + GEN_LIN_RECUR-Sycl.cpp HYDRO_1D.cpp HYDRO_1D-Seq.cpp HYDRO_1D-Hip.cpp HYDRO_1D-Cuda.cpp HYDRO_1D-OMP.cpp HYDRO_1D-OMPTarget.cpp + HYDRO_1D-Sycl.cpp HYDRO_2D.cpp HYDRO_2D-Seq.cpp HYDRO_2D-Hip.cpp HYDRO_2D-Cuda.cpp HYDRO_2D-OMP.cpp HYDRO_2D-OMPTarget.cpp + HYDRO_2D-Sycl.cpp INT_PREDICT.cpp INT_PREDICT-Seq.cpp INT_PREDICT-Hip.cpp INT_PREDICT-Cuda.cpp INT_PREDICT-OMP.cpp INT_PREDICT-OMPTarget.cpp + INT_PREDICT-Sycl.cpp PLANCKIAN.cpp PLANCKIAN-Seq.cpp PLANCKIAN-Hip.cpp PLANCKIAN-Cuda.cpp PLANCKIAN-OMP.cpp PLANCKIAN-OMPTarget.cpp + PLANCKIAN-Sycl.cpp TRIDIAG_ELIM.cpp TRIDIAG_ELIM-Seq.cpp TRIDIAG_ELIM-Hip.cpp TRIDIAG_ELIM-Cuda.cpp TRIDIAG_ELIM-OMP.cpp TRIDIAG_ELIM-OMPTarget.cpp + TRIDIAG_ELIM-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index dbb33c171..28d01cd98 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,22 +28,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define DIFF_PREDICT_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(px, m_px, m_array_length, qu); \ - allocAndInitSyclDeviceData(cx, m_cx, m_array_length, qu); - -#define DIFF_PREDICT_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_px, px, m_array_length, qu); \ - deallocSyclDeviceData(px, qu); \ - deallocSyclDeviceData(cx, qu); - -void DIFF_PREDICT::runSyclVariant(VariantID vid) +template +void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -54,16 +39,15 @@ void DIFF_PREDICT::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - DIFF_PREDICT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item ) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -76,16 +60,12 @@ void DIFF_PREDICT::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - DIFF_PREDICT_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - DIFF_PREDICT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DIFF_PREDICT_BODY; }); @@ -94,13 +74,13 @@ void DIFF_PREDICT::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - DIFF_PREDICT_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n DIFF_PREDICT : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFF_PREDICT, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index b5ddc90e4..48ad0b9ec 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -50,6 +50,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setVariantDefined( 
Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 3a583381b..1ebc4bdfd 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -94,13 +94,17 @@ class DIFF_PREDICT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index c81cd163b..6b8beaacc 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -21,33 +21,15 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf { namespace lcals { - - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - -#define EOS_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \ - allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \ - allocAndInitSyclDeviceData(z, m_z, m_array_length, qu); \ - allocAndInitSyclDeviceData(u, m_u, m_array_length, qu); - -#define EOS_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_x, x, m_array_length, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); \ - deallocSyclDeviceData(z, qu); \ - deallocSyclDeviceData(u, qu); -void EOS::runSyclVariant(VariantID vid) +template +void EOS::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -57,16 +39,14 @@ void EOS::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - EOS_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); - qu->submit([&] (sycl::handler& h) - { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item ) { + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -79,16 +59,12 @@ void EOS::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - EOS_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - EOS_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { EOS_BODY; }); @@ -97,12 +73,11 @@ void EOS::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - EOS_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n EOS : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EOS, Sycl) } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 517d144f8..14e3ea7de 
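For reference, the Base_SYCL variants in this patch all share one launch pattern, distilled here from the DIFF_PREDICT and EOS hunks above into a short sketch. The global size is rounded up to the next multiple of the work-group size, which is why the bounds check inside the kernel is required; qu is the KernelBase queue pointer, and the kernel body macro is illustrative.

    // Round the global size up so it is an exact multiple of the
    // work-group size, as sycl::nd_range requires.
    const size_t global_size =
        work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);

    qu->submit([&] (sycl::handler& h) {
      h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                     [=] (sycl::nd_item<1> item) {
        Index_type i = item.get_global_id(0);
        if (i < iend) {   // guard: global_size may overshoot iend
          EOS_BODY;
        }
      });
    });
    qu->wait();  // the launch is asynchronous; wait before stopping the timer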
100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -58,6 +58,9 @@ EOS::EOS(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 9cc202a02..c80831082 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -63,13 +63,17 @@ class EOS : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index fa32b4e35..323a0313b 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,22 +28,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define FIRST_DIFF_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(x, m_x, m_N, qu); \ - allocAndInitSyclDeviceData(y, m_y, m_N, qu); - -#define FIRST_DIFF_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_x, x, m_N, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); - -void FIRST_DIFF::runSyclVariant(VariantID vid) +template +void FIRST_DIFF::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -54,16 +39,13 @@ void FIRST_DIFF::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - FIRST_DIFF_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); - qu->submit([&] (sycl::handler& h) - { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item) { + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -76,16 +58,12 @@ void FIRST_DIFF::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - FIRST_DIFF_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - FIRST_DIFF_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { FIRST_DIFF_BODY; }); @@ -94,13 +72,13 @@ void FIRST_DIFF::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - FIRST_DIFF_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n FIRST_DIFF : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_DIFF, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.cpp 
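The RAJA_SYCL variants and the new tuning hooks follow the suite's GPU block-size tuning scheme: each kernel implementation becomes a template over work_group_size, and RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(KERNEL, Sycl) instantiates it for each size in the tuning list. A sketch of that shape using FIRST_DIFF follows; the sycl_exec template arguments (work-group size plus an asynchrony flag) are an assumption here, since the text above appears to have lost content between angle brackets, leaving the foralls reading "RAJA::forall< RAJA::sycl_exec >".

    template < size_t work_group_size >
    void FIRST_DIFF::runSyclVariantImpl(VariantID vid)
    {
      const Index_type run_reps = getRunReps();
      const Index_type ibegin = 0;
      const Index_type iend = getActualProblemSize();

      FIRST_DIFF_DATA_SETUP;

      if ( vid == RAJA_SYCL ) {
        startTimer();
        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
          // assumed policy signature: sycl_exec<work_group_size, async>
          RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
            RAJA::RangeSegment(ibegin, iend),
            [=] (Index_type i) { FIRST_DIFF_BODY; });
        }
        qu->wait();
        stopTimer();
      }
    }

    RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_DIFF, Sycl)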
b/src/lcals/FIRST_DIFF.cpp index 3e8e42ec6..4310ae285 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -54,6 +54,9 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index f3f6424f0..42127ea97 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -53,13 +53,17 @@ class FIRST_DIFF : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 1431dad62..30069584a 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -80,6 +80,7 @@ class FIRST_MIN : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); +// void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 59c1c0bfd..221fcfab3 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -56,6 +56,7 @@ class FIRST_SUM : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); +// void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index 9c78c87b1..6c98c8908 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -21,26 +21,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define GEN_LIN_RECUR_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(b5, m_b5, m_N, qu); \ - allocAndInitSyclDeviceData(stb5, m_stb5, m_N, qu); \ - allocAndInitSyclDeviceData(sa, m_sa, m_N, qu); \ - allocAndInitSyclDeviceData(sb, m_sb, m_N, qu); - -#define GEN_LIN_RECUR_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_b5, b5, m_N, qu); \ - deallocSyclDeviceData(b5, qu); \ - deallocSyclDeviceData(stb5, qu); \ - deallocSyclDeviceData(sa, qu); \ - deallocSyclDeviceData(sb, qu); - -void GEN_LIN_RECUR::runSyclVariant(VariantID vid) +template +void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -48,15 +30,13 @@ void GEN_LIN_RECUR::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - GEN_LIN_RECUR_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size1 = block_size * RAJA_DIVIDE_CEILING_INT(N, block_size); + const size_t 
global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size1, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type k = item.get_global_id(0); if (k < N) { @@ -66,10 +46,10 @@ void GEN_LIN_RECUR::runSyclVariant(VariantID vid) }); }); - const size_t grid_size2 = block_size * RAJA_DIVIDE_CEILING_INT(N+1, block_size); + const size_t global_size2 = work_group_size * RAJA_DIVIDE_CEILING_INT(N+1, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size2, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (global_size2, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i > 0 && i < N+1) { @@ -82,21 +62,17 @@ void GEN_LIN_RECUR::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - GEN_LIN_RECUR_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - GEN_LIN_RECUR_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(0, N), [=] (Index_type k) { GEN_LIN_RECUR_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(1, N+1), [=] (Index_type i) { GEN_LIN_RECUR_BODY2; }); @@ -105,13 +81,13 @@ void GEN_LIN_RECUR::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - GEN_LIN_RECUR_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n GEN_LIN_RECUR : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(GEN_LIN_RECUR, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 9c132a3db..061b20a80 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -58,6 +58,9 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 9586a69b4..6ca26fb32 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -77,13 +77,17 @@ class GEN_LIN_RECUR : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index ab2ab8899..12d29b335 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -21,7 +21,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -29,24 +28,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define HYDRO_1D_DATA_SETUP_SYCL 
\ - allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \ - allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \ - allocAndInitSyclDeviceData(z, m_z, m_array_length, qu); - -#define HYDRO_1D_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_x, x, m_array_length, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); \ - deallocSyclDeviceData(z, qu); - -void HYDRO_1D::runSyclVariant(VariantID vid) +template +void HYDRO_1D::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,16 +39,13 @@ void HYDRO_1D::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - HYDRO_1D_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); - qu->submit([&] (sycl::handler& h) - { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item ) { + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -78,16 +58,12 @@ void HYDRO_1D::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - HYDRO_1D_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - HYDRO_1D_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { HYDRO_1D_BODY; }); @@ -96,13 +72,13 @@ void HYDRO_1D::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - HYDRO_1D_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n HYDRO_1D : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_1D, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index d92267fc9..4c6d3a35c 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -57,6 +57,9 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index dd61f112c..3ba97458b 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -58,13 +58,17 @@ class HYDRO_1D : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 3e605ed49..a737c97f7 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -29,42 +29,12 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // 
- constexpr size_t j_block_sz = 32; - constexpr size_t k_block_sz = 8; - -#define HYDRO_2D_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(zadat, m_za, m_array_length, qu); \ - allocAndInitSyclDeviceData(zbdat, m_zb, m_array_length, qu); \ - allocAndInitSyclDeviceData(zmdat, m_zm, m_array_length, qu); \ - allocAndInitSyclDeviceData(zpdat, m_zp, m_array_length, qu); \ - allocAndInitSyclDeviceData(zqdat, m_zq, m_array_length, qu); \ - allocAndInitSyclDeviceData(zrdat, m_zr, m_array_length, qu); \ - allocAndInitSyclDeviceData(zudat, m_zu, m_array_length, qu); \ - allocAndInitSyclDeviceData(zvdat, m_zv, m_array_length, qu); \ - allocAndInitSyclDeviceData(zzdat, m_zz, m_array_length, qu); \ - allocAndInitSyclDeviceData(zroutdat, m_zrout, m_array_length, qu); \ - allocAndInitSyclDeviceData(zzoutdat, m_zzout, m_array_length, qu); - -#define HYDRO_2D_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_zrout, zroutdat, m_array_length, qu); \ - getSyclDeviceData(m_zzout, zzoutdat, m_array_length, qu); \ - deallocSyclDeviceData(zadat, qu); \ - deallocSyclDeviceData(zbdat, qu); \ - deallocSyclDeviceData(zmdat, qu); \ - deallocSyclDeviceData(zpdat, qu); \ - deallocSyclDeviceData(zqdat, qu); \ - deallocSyclDeviceData(zrdat, qu); \ - deallocSyclDeviceData(zudat, qu); \ - deallocSyclDeviceData(zvdat, qu); \ - deallocSyclDeviceData(zzdat, qu); \ - deallocSyclDeviceData(zroutdat, qu); \ - deallocSyclDeviceData(zzoutdat, qu); - -void HYDRO_2D::runSyclVariant(VariantID vid) -{ +#define j_block_sz (32) +#define k_block_sz (work_group_size / j_block_sz) + +template +void HYDRO_2D::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); const Index_type kbeg = 1; const Index_type kend = m_kn - 1; @@ -74,17 +44,15 @@ void HYDRO_2D::runSyclVariant(VariantID vid) HYDRO_2D_DATA_SETUP; if ( vid == Base_SYCL ) { - - HYDRO_2D_DATA_SETUP_SYCL; - auto kn_grid_size = k_block_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_block_sz); - auto jn_grid_size = j_block_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz); + auto kn_global_size = k_block_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_block_sz); + auto jn_global_size = j_block_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size), + h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), sycl::range<2>(k_block_sz,j_block_sz)), [=] (sycl::nd_item<2> item) { @@ -98,7 +66,7 @@ void HYDRO_2D::runSyclVariant(VariantID vid) }); }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size), + h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), sycl::range<2>(k_block_sz,j_block_sz)), [=] (sycl::nd_item<2> item) { @@ -113,7 +81,7 @@ void HYDRO_2D::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_grid_size, jn_grid_size), + h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), sycl::range<2>(k_block_sz,j_block_sz)), [=] (sycl::nd_item<2> item) { @@ -126,45 +94,20 @@ void HYDRO_2D::runSyclVariant(VariantID vid) }); }); -/* qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<2>(kn-2, jn-2), - sycl::id<2>(1, 1), // offset to start a idx 1 - [=] (sycl::item<2> item ) { - int j = item.get_id(1); - int k = item.get_id(0); - HYDRO_2D_BODY2 - - }); - }); - - qu->submit([&] (sycl::handler& h) { - 
h.parallel_for(sycl::range<2>(kn-2, jn-2), - sycl::id<2>(1, 1), // offset to start a idx 1 - [=] (sycl::item<2> item ) { - int j = item.get_id(1); - int k = item.get_id(0); - HYDRO_2D_BODY3 - - }); - });*/ } qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - HYDRO_2D_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - HYDRO_2D_DATA_SETUP_SYCL; - HYDRO_2D_VIEWS_RAJA; using EXECPOL = RAJA::KernelPolicy< - RAJA::statement::SyclKernel< - RAJA::statement::For<0, RAJA::sycl_global_1<8>, // k - RAJA::statement::For<1, RAJA::sycl_global_2<32>, // j + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1<k_block_sz>, // k + RAJA::statement::For<1, RAJA::sycl_global_2<j_block_sz>, // j RAJA::statement::Lambda<0> > > @@ -199,13 +142,13 @@ void HYDRO_2D::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - HYDRO_2D_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n HYDRO_2D : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(HYDRO_2D, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index fd1dd9406..84f0fa597 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -72,6 +72,9 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index b6ad936ca..120048aad 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -152,13 +152,17 @@ class HYDRO_2D : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index 371be9ff7..5d09278e2 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -21,7 +21,6 @@ #include <iostream> -#include <CL/sycl.hpp> #include "common/SyclDataUtils.hpp" namespace rajaperf { namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define INT_PREDICT_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(px, m_px, m_array_length, qu); -#define INT_PREDICT_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_px, px, m_array_length, qu); \ - deallocSyclDeviceData(px, qu); - -void INT_PREDICT::runSyclVariant(VariantID vid) +template <size_t work_group_size > +void INT_PREDICT::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -52,16 +39,14 @@ void INT_PREDICT::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - INT_PREDICT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
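      // Editorial note, kept as a C++ comment so it reads in place (not part of
      // the patch): sycl::nd_range<1> requires the global size to be a whole
      // multiple of the work-group size, which is why every Base_SYCL variant in
      // this series rounds up with RAJA_DIVIDE_CEILING_INT and then masks the
      // overhang with `if (i < iend)` inside the kernel. A worked example,
      // assuming iend = 1000 and work_group_size = 256:
      //   RAJA_DIVIDE_CEILING_INT(1000, 256) = 4, so global_size = 4 * 256 = 1024,
      //   and work-items 1000..1023 fail the bounds check and do no work.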
qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -74,16 +59,12 @@ void INT_PREDICT::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - INT_PREDICT_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - INT_PREDICT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { INT_PREDICT_BODY; }); @@ -92,13 +73,13 @@ void INT_PREDICT::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - INT_PREDICT_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n INT_PREDICT : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INT_PREDICT, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index eb56b5725..f1583d625 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -50,6 +50,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index a81ae6fb2..6493ec815 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -73,13 +73,17 @@ class INT_PREDICT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index e3f71316a..9a2d29d6c 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -30,28 +30,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define PLANCKIAN_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(x, m_x, iend, qu); \ - allocAndInitSyclDeviceData(y, m_y, iend, qu); \ - allocAndInitSyclDeviceData(u, m_u, iend, qu); \ - allocAndInitSyclDeviceData(v, m_v, iend, qu); \ - allocAndInitSyclDeviceData(w, m_w, iend, qu); - -#define PLANCKIAN_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_w, w, iend, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); \ - deallocSyclDeviceData(u, qu); \ - deallocSyclDeviceData(v, qu); \ - deallocSyclDeviceData(w, qu); - -void PLANCKIAN::runSyclVariant(VariantID vid) +template +void PLANCKIAN::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -63,16 +43,13 @@ void PLANCKIAN::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - PLANCKIAN_DATA_SETUP_SYCL; - startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); - qu->submit([&] (sycl::handler& h) - { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -85,16 +62,12 @@ void PLANCKIAN::runSyclVariant(VariantID vid) qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); - PLANCKIAN_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - PLANCKIAN_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PLANCKIAN_BODY; }); @@ -103,13 +76,13 @@ void PLANCKIAN::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - PLANCKIAN_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n PLANCKIAN : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PLANCKIAN, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 2bb8d3f7b..353ddca86 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -50,6 +50,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 92b55fc95..3d3c4145f 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -58,13 +58,17 @@ class PLANCKIAN : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index 06b513d03..e8237882f 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -21,26 +21,8 @@ namespace rajaperf namespace lcals { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define TRIDIAG_ELIM_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(xout, m_xout, m_N, qu); \ - allocAndInitSyclDeviceData(xin, m_xin, m_N, qu); \ - allocAndInitSyclDeviceData(y, m_y, m_N, qu); \ - allocAndInitSyclDeviceData(z, m_z, m_N, qu); - -#define TRIDIAG_ELIM_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_xout, xout, m_N, qu); \ - deallocSyclDeviceData(xout, qu); \ - deallocSyclDeviceData(xin, qu); \ - deallocSyclDeviceData(y, qu); \ - deallocSyclDeviceData(z, qu); - -void TRIDIAG_ELIM::runSyclVariant(VariantID vid) +template +void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = 
getRunReps(); const Index_type ibegin = 1; @@ -50,15 +32,13 @@ void TRIDIAG_ELIM::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - TRIDIAG_ELIM_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i > 0 && i < iend) { @@ -71,16 +51,12 @@ void TRIDIAG_ELIM::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - TRIDIAG_ELIM_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { TRIDIAG_ELIM_BODY; }); @@ -89,13 +65,13 @@ void TRIDIAG_ELIM::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - TRIDIAG_ELIM_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n TRIDIAG_ELIM : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(TRIDIAG_ELIM, Sycl) + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 710927c3e..10512c9e0 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -52,6 +52,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index c95685de9..07f5ef276 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -58,13 +58,17 @@ class TRIDIAG_ELIM : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From d109a8b7b0d7badb84074a2faec87d3f06f7a3a7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 9 Feb 2024 12:51:17 -0800 Subject: [PATCH 246/454] Remove --allow-problematic-implementations We no longer have any problematic implementations so remove this flag --- src/basic/INDEXLIST.cpp | 8 ++------ src/common/RunParams.cpp | 11 ----------- src/common/RunParams.hpp | 5 ----- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index 27aaec8b4..cb559c8b2 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -46,13 +46,9 @@ INDEXLIST::INDEXLIST(const RunParams& params) setVariantDefined( Base_OpenMPTarget ); #endif - if (params.getAllowProblematicImplementations()) { - // these may deadlock depending on the order 
that blocks are scheduled + setVariantDefined( Base_CUDA ); - setVariantDefined( Base_CUDA ); - - setVariantDefined( Base_HIP ); - } + setVariantDefined( Base_HIP ); } INDEXLIST::~INDEXLIST() diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 061e143cf..57dbab075 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -71,7 +71,6 @@ RunParams::RunParams(int argc, char** argv) add_to_spot_config(), #endif disable_warmup(false), - allow_problematic_implementations(false), run_kernels(), run_variants() { @@ -142,8 +141,6 @@ void RunParams::print(std::ostream& str) const #endif str << "\n disable_warmup = " << disable_warmup; - str << "\n allow_problematic_implementations = " - << allow_problematic_implementations; str << "\n seq data space = " << getDataSpaceName(seqDataSpace); str << "\n omp data space = " << getDataSpaceName(ompDataSpace); @@ -799,11 +796,6 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) disable_warmup = true; - } else if ( std::string(argv[i]) == - std::string("--allow-problematic-implementations") ) { - - allow_problematic_implementations = true; - } else if ( std::string(argv[i]) == std::string("--checkrun") ) { input_state = CheckRun; @@ -993,9 +985,6 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --disable-warmup (disable warmup kernels) [Default is run warmup kernels that are relevant to kernels selected to run]\n\n"; - str << "\t --allow-problematic-implementations (allow problematic kernel implementations) [Default is to not allow problematic kernel implementations to run]\n" - << "\t These implementations may deadlock causing the code to hang indefinitely.\n\n"; - str << "\t --kernels, -k [Default is run all]\n" << "\t (names of individual kernels and/or groups of kernels to run)\n" << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 10ae761a0..798d64e0a 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -181,9 +181,6 @@ class RunParams { bool getDisableWarmup() const { return disable_warmup; } - bool getAllowProblematicImplementations() const - { return allow_problematic_implementations; } - const std::set& getKernelIDsToRun() const { return run_kernels; } const std::set& getVariantIDsToRun() const { return run_variants; } VariantID getReferenceVariantID() const { return reference_vid; } @@ -303,8 +300,6 @@ class RunParams { bool disable_warmup; - bool allow_problematic_implementations; - std::set run_kernels; std::set run_variants; From 001738331fd76a881161038846c5f9d5c358b996 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 9 Feb 2024 13:00:02 -0800 Subject: [PATCH 247/454] Fixup comment in build script --- scripts/lc-builds/toss4_cray-mpich_amdclang.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 0b36e8817..0a8dea853 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -104,11 +104,10 @@ echo echo " module unload rocm" echo " srun -n1 make" echo -echo " Please note that cray-mpich requires libmodules.so.1 from cce and" -echo " libpgmath.so from rocm/llvm to run." +echo " Please note that rocm requires libpgmath.so from rocm/llvm to run." 
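# Editorial aside (shell comment, not part of the patch): a hypothetical way to
# check that the runtime dependency noted above actually resolves once
# LD_LIBRARY_PATH is set -- the binary name and path here are illustrative only:
#   ldd ./bin/raja-perf.exe | grep libpgmath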
echo " Until this is handled transparently in the build system you may add " -echo " cce and rocm/llvm to your LD_LIBRARY_PATH." +echo " rocm/llvm to your LD_LIBRARY_PATH." echo -echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/cce-tce/cce-13.0.2/cce/x86_64/lib/:/usr/rocm-5.7.0/llvm/lib" +echo " export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm-${COMP_VER}/llvm/lib" echo echo "***********************************************************************" From ec276fd4f54812d2a510a945b44cdcf0197684e7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 12 Feb 2024 09:49:35 -0800 Subject: [PATCH 248/454] generalize gpu_block_size helpers Rename gpu_block_size namespace to integer and make_list_type to make_gpu_block_size_list_type --- src/algorithm/MEMCPY.hpp | 2 +- src/algorithm/MEMSET.hpp | 2 +- src/algorithm/REDUCE_SUM.hpp | 2 +- src/apps/CONVECTION3DPA.hpp | 2 +- src/apps/DEL_DOT_VEC_2D.hpp | 2 +- src/apps/DIFFUSION3DPA.hpp | 2 +- src/apps/EDGE3D.hpp | 2 +- src/apps/ENERGY.hpp | 2 +- src/apps/FIR.hpp | 2 +- src/apps/LTIMES-Cuda.cpp | 4 ++-- src/apps/LTIMES-Hip.cpp | 4 ++-- src/apps/LTIMES.hpp | 4 ++-- src/apps/LTIMES_NOVIEW-Cuda.cpp | 4 ++-- src/apps/LTIMES_NOVIEW-Hip.cpp | 4 ++-- src/apps/LTIMES_NOVIEW.hpp | 4 ++-- src/apps/MASS3DEA.hpp | 2 +- src/apps/MASS3DPA.hpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.hpp | 2 +- src/apps/PRESSURE.hpp | 2 +- src/apps/VOL3D.hpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D.hpp | 2 +- src/basic/ARRAY_OF_PTRS.hpp | 2 +- src/basic/COPY8.hpp | 2 +- src/basic/DAXPY.hpp | 2 +- src/basic/DAXPY_ATOMIC.hpp | 2 +- src/basic/IF_QUAD.hpp | 2 +- src/basic/INDEXLIST.hpp | 2 +- src/basic/INDEXLIST_3LOOP.hpp | 2 +- src/basic/INIT3.hpp | 2 +- src/basic/INIT_VIEW1D.hpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 2 +- src/basic/MAT_MAT_SHARED-Cuda.cpp | 2 +- src/basic/MAT_MAT_SHARED-Hip.cpp | 2 +- src/basic/MAT_MAT_SHARED.hpp | 2 +- src/basic/MULADDSUB.hpp | 2 +- src/basic/NESTED_INIT.hpp | 4 ++-- src/basic/PI_ATOMIC.hpp | 2 +- src/basic/PI_REDUCE.hpp | 2 +- src/basic/REDUCE3_INT.hpp | 2 +- src/basic/REDUCE_STRUCT.hpp | 2 +- src/basic/TRAP_INT.hpp | 2 +- src/comm/HALO_EXCHANGE.hpp | 2 +- src/comm/HALO_EXCHANGE_FUSED.hpp | 2 +- src/comm/HALO_PACKING.hpp | 2 +- src/comm/HALO_PACKING_FUSED.hpp | 2 +- src/common/GPUUtils.hpp | 6 +++--- src/lcals/DIFF_PREDICT.hpp | 2 +- src/lcals/EOS.hpp | 2 +- src/lcals/FIRST_DIFF.hpp | 2 +- src/lcals/FIRST_MIN.hpp | 2 +- src/lcals/FIRST_SUM.hpp | 2 +- src/lcals/GEN_LIN_RECUR.hpp | 2 +- src/lcals/HYDRO_1D.hpp | 2 +- src/lcals/HYDRO_2D.hpp | 4 ++-- src/lcals/INT_PREDICT.hpp | 2 +- src/lcals/PLANCKIAN.hpp | 2 +- src/lcals/TRIDIAG_ELIM.hpp | 2 +- src/polybench/POLYBENCH_2MM.hpp | 4 ++-- src/polybench/POLYBENCH_3MM.hpp | 4 ++-- src/polybench/POLYBENCH_ADI.hpp | 2 +- src/polybench/POLYBENCH_ATAX.hpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.hpp | 4 ++-- src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 4 ++-- src/polybench/POLYBENCH_GEMM.hpp | 4 ++-- src/polybench/POLYBENCH_GEMVER.hpp | 4 ++-- src/polybench/POLYBENCH_GESUMMV.hpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.hpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.hpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.hpp | 4 ++-- src/polybench/POLYBENCH_MVT.hpp | 2 +- src/rajaperf_config.hpp.in | 6 +++--- src/stream/ADD.hpp | 2 +- src/stream/COPY.hpp | 2 +- src/stream/DOT.hpp | 2 +- src/stream/MUL.hpp | 2 +- src/stream/TRIAD.hpp | 2 +- 76 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index f788a0e40..b6cd49038 100644 --- 
a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -71,7 +71,7 @@ class MEMCPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_y; diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index 8be682823..0266c9e1a 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -71,7 +71,7 @@ class MEMSET : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_type m_val; diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index bb244208d..716794930 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -74,7 +74,7 @@ class REDUCE_SUM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_type m_sum_init; diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index d59b7e319..9d8eea6e8 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -388,7 +388,7 @@ class CONVECTION3DPA : public KernelBase private: static const size_t default_gpu_block_size = CPA_Q1D * CPA_Q1D * CPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 14db3565a..a7f85a279 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -128,7 +128,7 @@ class DEL_DOT_VEC_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 5dff5e5ef..0c5271ddc 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -491,7 +491,7 @@ class DIFFUSION3DPA : public KernelBase private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 3707f90ed..ac87bf331 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -427,7 +427,7 @@ class EDGE3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 4a47e7912..8cee73a4c 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -213,7 +213,7 @@ class ENERGY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>; Real_ptr m_e_new; Real_ptr m_e_old; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 41933c4e8..1686e3250 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp
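(Editorial aside, not part of the patch: every header hunk in this commit makes
the same mechanical substitution, so a single before/after sketch covers them
all. `default_gpu_block_size` is the static member each kernel class declares
just above the alias:

    // before this commit
    using gpu_block_sizes_type =
        gpu_block_size::make_list_type<default_gpu_block_size>;

    // after this commit: the namespace is now `integer` and the alias name
    // says what the list is for
    using gpu_block_sizes_type =
        integer::make_gpu_block_size_list_type<default_gpu_block_size>;

Per the commit message, only the names change; the generated list type and the
kernels' tuning behavior are unchanged.)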
@@ -88,7 +88,7 @@ class FIR : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index c64d0d87d..a0142d1aa 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index 87d1686c2..949694d10 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index c45be3ac9..be270804e 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -126,8 +126,8 @@ class LTIMES : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 39dbe6c66..9486f20e2 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for CUDA execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index 722071f1d..be6e2d756 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -25,8 +25,8 @@ namespace apps // Define thread block shape for Hip execution // #define m_block_sz (32) -#define g_block_sz (gpu_block_size::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (gpu_block_size::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) +#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) #define 
LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 61db05db4..b85a96497 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -76,8 +76,8 @@ class LTIMES_NOVIEW : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 3726cd470..300af975b 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -163,7 +163,7 @@ class MASS3DEA : public KernelBase { private: static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; using gpu_block_sizes_type = - gpu_block_size::list_type; + integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 8a70e326d..7489ee7af 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -373,7 +373,7 @@ class MASS3DPA : public KernelBase private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; Real_ptr m_Bt; diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 51edd3310..18a4864f9 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -107,7 +107,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 82ab50aa0..0ba273c34 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -82,7 +82,7 @@ class PRESSURE : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index dce286a89..f341b5739 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -183,7 +183,7 @@ class VOL3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 6adedd04e..0e9c292d6 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -91,7 +91,7 @@ class ZONAL_ACCUMULATION_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_vol; diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index fca763190..f94528fa9 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -83,7 +83,7 @@ class ARRAY_OF_PTRS : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using 
gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index 7d047eba4..cd5ee83bc 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -90,7 +90,7 @@ class COPY8 : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x0; Real_ptr m_x1; diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index c61be7e9a..4bd653b15 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -63,7 +63,7 @@ class DAXPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index ffaa4cc4e..8cc44ffe2 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -66,7 +66,7 @@ class DAXPY_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index ce47ec332..81edf4437 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -80,7 +80,7 @@ class IF_QUAD : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 0836d8197..efcf94aa8 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -70,7 +70,7 @@ class INDEXLIST : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_x; Int_ptr m_list; diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index e19ee5508..c298b8e45 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -81,7 +81,7 @@ class INDEXLIST_3LOOP : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_x; Int_ptr m_list; diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 89451433a..9be3544d8 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -66,7 +66,7 @@ class INIT3 : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 7512a6d81..11a0b75f7 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -77,7 +77,7 @@ class INIT_VIEW1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp 
b/src/basic/INIT_VIEW1D_OFFSET.hpp index 75e13923a..4d592bc26 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -76,7 +76,7 @@ class INIT_VIEW1D_OFFSET : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index 23e317815..926c5f979 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index acf08168b..9c58d9267 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -50,7 +50,7 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, template < size_t block_size > void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) { - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + constexpr Index_type tile_size = integer::sqrt(block_size); static_assert(tile_size*tile_size == block_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 095721c27..5022a2c84 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -149,7 +149,7 @@ class MAT_MAT_SHARED : public KernelBase { private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_A; Real_ptr m_B; diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 778e23838..7db655b3a 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -69,7 +69,7 @@ class MULADDSUB : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index f26b9cba4..224c32f26 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -69,8 +69,8 @@ class NESTED_INIT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_array_length; diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 71327ce6b..399a2c172 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -72,7 +72,7 @@ class PI_ATOMIC : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; Real_type m_pi_init; diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 48a2fd519..bd906e17d 100644 
--- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -70,7 +70,7 @@ class PI_REDUCE : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_dx; Real_type m_pi; diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index e56a7bc20..0d9b81b3a 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -85,7 +85,7 @@ class REDUCE3_INT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 00c1400a4..0deaf4254 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -122,7 +122,7 @@ class REDUCE_STRUCT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; Real_type m_init_sum; Real_type m_init_min; diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 841d25a40..832e67fd6 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -82,7 +82,7 @@ class TRAP_INT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_type m_x0; Real_type m_xp; diff --git a/src/comm/HALO_EXCHANGE.hpp b/src/comm/HALO_EXCHANGE.hpp index d0eea9f86..8f3cf1cda 100644 --- a/src/comm/HALO_EXCHANGE.hpp +++ b/src/comm/HALO_EXCHANGE.hpp @@ -122,7 +122,7 @@ class HALO_EXCHANGE : public HALO_base private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; int m_mpi_size = -1; int m_my_mpi_rank = -1; diff --git a/src/comm/HALO_EXCHANGE_FUSED.hpp b/src/comm/HALO_EXCHANGE_FUSED.hpp index efc3a9501..a0962be3a 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.hpp +++ b/src/comm/HALO_EXCHANGE_FUSED.hpp @@ -184,7 +184,7 @@ class HALO_EXCHANGE_FUSED : public HALO_base private: static const size_t default_gpu_block_size = 1024; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; int m_mpi_size = -1; int m_my_mpi_rank = -1; diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp index 0cf329b92..7b4531e74 100644 --- a/src/comm/HALO_PACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -98,7 +98,7 @@ class HALO_PACKING : public HALO_base private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_num_vars; Index_type m_var_size; diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp index d89444104..065c0be3a 100644 --- a/src/comm/HALO_PACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -162,7 +162,7 @@ class HALO_PACKING_FUSED : public HALO_base private: static const size_t default_gpu_block_size = 1024; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = 
integer::make_gpu_block_size_list_type; Index_type m_num_vars; Index_type m_var_size; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 4ccb20495..87492efa0 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -19,7 +19,7 @@ namespace rajaperf { -namespace gpu_block_size +namespace integer { namespace detail @@ -130,7 +130,7 @@ struct ExactSqrt // otherwise it is a list containing just default_block_size. // Invalid entries are removed according to validity_checker in either case. template < size_t default_block_size, typename validity_checker = AllowAny > -using make_list_type = +using make_gpu_block_size_list_type = typename detail::remove_invalid::value > 0), rajaperf::configuration::gpu_block_sizes, @@ -138,7 +138,7 @@ using make_list_type = >::type >::type; -} // closing brace for gpu_block_size namespace +} // closing brace for integer namespace namespace gpu_algorithm { diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 79ff8f9ac..bfd17de49 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -104,7 +104,7 @@ class DIFF_PREDICT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_px; Real_ptr m_cx; diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 91a4c1f00..5f4c4c458 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -73,7 +73,7 @@ class EOS : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 850f23852..1ca26ac01 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -63,7 +63,7 @@ class FIRST_DIFF : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index f8d8192b9..c187b8b59 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -94,7 +94,7 @@ class FIRST_MIN : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_type m_xmin_init; diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index ddf5d9c33..046252862 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -66,7 +66,7 @@ class FIRST_SUM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 1bcc39cf6..4f2427b47 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -87,7 +87,7 @@ class GEN_LIN_RECUR : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_b5; Real_ptr m_sa; diff --git 
a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 980b92281..1836f0371 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -68,7 +68,7 @@ class HYDRO_1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index e65af1320..a622b543b 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -162,8 +162,8 @@ class HYDRO_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Real_ptr m_za; Real_ptr m_zb; diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index ff7a834d9..0dc974901 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -83,7 +83,7 @@ class INT_PREDICT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_array_length; Index_type m_offset; diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index dbbf9ceef..49803ff3c 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -68,7 +68,7 @@ class PLANCKIAN : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index b4e101103..714b390ab 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -68,7 +68,7 @@ class TRIDIAG_ELIM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_xout; Real_ptr m_xin; diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 944a88bda..836b05aee 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -137,8 +137,8 @@ class POLYBENCH_2MM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index c95c9b000..a4215289d 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -163,8 +163,8 @@ class POLYBENCH_3MM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index dcbd3573e..519a2de1f 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -205,7 +205,7 @@ class POLYBENCH_ADI : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = 
integer::make_gpu_block_size_list_type; Index_type m_n; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index ea948fbe1..5e64d125f 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -125,7 +125,7 @@ class POLYBENCH_ATAX : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_tmp; diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 19d0a3db2..2631b05c1 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -123,8 +123,8 @@ class POLYBENCH_FDTD_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_nx; Index_type m_ny; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index c2901a838..5a9d7f26e 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -86,8 +86,8 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index ef9e6121d..1d788154c 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -109,8 +109,8 @@ class POLYBENCH_GEMM : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index a30448c7b..048645a3f 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -162,8 +162,8 @@ class POLYBENCH_GEMVER : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_n; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 507e8baaa..75d4aa8c9 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -108,7 +108,7 @@ class POLYBENCH_GESUMMV : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 28e860a02..64c394630 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -132,8 +132,8 @@ class POLYBENCH_HEAT_3D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git 
a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 52ba1e3ca..c86280036 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -78,7 +78,7 @@ class POLYBENCH_JACOBI_1D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index aed073955..7e8819bf1 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -97,8 +97,8 @@ class POLYBENCH_JACOBI_2D : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type>; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 44d645648..809c3e624 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -122,7 +122,7 @@ class POLYBENCH_MVT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Index_type m_N; Real_ptr m_x1; diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 582482471..7c7350df2 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -53,13 +53,13 @@ inline void RAJAPERF_UNUSED_VAR(Ts&&...) { } namespace rajaperf { -namespace gpu_block_size { +namespace integer { // helper alias to convert comma separated integer literals into list template < size_t... Is > using list_type = camp::list< camp::integral_constant... 
>; -} // closing brace for gpu_block_size namespace +} // closing brace for integer namespace struct configuration { #if defined(RAJA_PERFSUITE_USE_CALIPER) @@ -110,7 +110,7 @@ const adiak::catstring adiak_machine_build = std::string("@RAJAPERF_BUILD_HOST@" #endif // List of GPU block sizes -using gpu_block_sizes = gpu_block_size::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; +using gpu_block_sizes = integer::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; // Name of user who ran code std::string user_run; diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 5b17d398a..f33f3c83f 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -63,7 +63,7 @@ class ADD : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 0e639e68d..b707d8014 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -62,7 +62,7 @@ class COPY : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_c; diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index ca3330d2b..650fefae3 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -66,7 +66,7 @@ class DOT : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index aea2cd08b..337d45943 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -63,7 +63,7 @@ class MUL : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_b; Real_ptr m_c; diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index efc2e8d78..a9aef7b7f 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -64,7 +64,7 @@ class TRIAD : public KernelBase private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_a; Real_ptr m_b; From dce1bb5a92937e25de49ac76ec3c2df2fe41f921 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 12 Feb 2024 10:41:27 -0800 Subject: [PATCH 249/454] Add ATOMIC kernel --- CMakeLists.txt | 9 ++ docs/sphinx/user_guide/build.rst | 20 ++- src/CMakeLists.txt | 3 + src/algorithm/ATOMIC-Cuda.cpp | 241 +++++++++++++++++++++++++++++ src/algorithm/ATOMIC-Hip.cpp | 241 +++++++++++++++++++++++++++++ src/algorithm/ATOMIC-OMP.cpp | 153 ++++++++++++++++++ src/algorithm/ATOMIC-OMPTarget.cpp | 127 +++++++++++++++ src/algorithm/ATOMIC-Seq.cpp | 146 +++++++++++++++++ src/algorithm/ATOMIC.cpp | 79 ++++++++++ src/algorithm/ATOMIC.hpp | 107 +++++++++++++ src/algorithm/CMakeLists.txt | 6 + src/common/Executor.cpp | 7 +- src/common/GPUUtils.hpp | 13 ++ src/common/KernelBase.hpp | 15 ++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + src/common/RunParams.cpp | 44 ++++++ src/common/RunParams.hpp | 11 ++ src/rajaperf_config.hpp.in | 6 +- 19 files changed, 1231 insertions(+), 4 
deletions(-) create mode 100644 src/algorithm/ATOMIC-Cuda.cpp create mode 100644 src/algorithm/ATOMIC-Hip.cpp create mode 100644 src/algorithm/ATOMIC-OMP.cpp create mode 100644 src/algorithm/ATOMIC-OMPTarget.cpp create mode 100644 src/algorithm/ATOMIC-Seq.cpp create mode 100644 src/algorithm/ATOMIC.cpp create mode 100644 src/algorithm/ATOMIC.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4145d0c37..677acb095 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,8 @@ set(RAJA_USE_CHRONO On CACHE BOOL "") set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") +set(RAJA_PERFSUITE_ATOMIC_REPLICATIONS "" CACHE STRING "Comma separated list of atomic replications, ex '1,256,4096'") + set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) @@ -86,6 +88,13 @@ else() message(STATUS "Using default gpu block size(s)") endif() +string(LENGTH "${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}" ATOMIC_REPLICATIONS_LENGTH) +if (ATOMIC_REPLICATIONS_LENGTH GREATER 0) + message(STATUS "Using atomic replication(s): ${RAJA_PERFSUITE_ATOMIC_REPLICATIONS}") +else() + message(STATUS "Using default atomic replication(s)") +endif() + # exclude RAJA make targets from top-level build... add_subdirectory(tpl/RAJA) diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst index db8f0e663..8601c555c 100644 --- a/docs/sphinx/user_guide/build.rst +++ b/docs/sphinx/user_guide/build.rst @@ -201,7 +201,7 @@ multiple versions of GPU kernels that will run with different GPU thread-block sizes. The CMake option for this is ``-DRAJA_PERFSUITE_GPU_BLOCKSIZES=``. For example:: - $ mkdir my-gnu-build + $ mkdir my-gpu-build $ cd my-gpu-build $ cmake \ -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ @@ -211,6 +211,24 @@ sizes. The CMake option for this is will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads per GPU thread-block. +Building with specific GPU atomic replication tunings +----------------------------------------------------- + +If desired, you can build a version of the RAJA Performance Suite code with +multiple versions of GPU kernels that will run with different GPU atomic +replication amounts. The CMake option for this is +``-DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=``. For example:: + + $ mkdir my-gpu-build + $ cd my-gpu-build + $ cmake \ + -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 \ + .. + $ make -j + +will build versions of GPU kernels that use 1, 256, and 4096 atomic +replications. + Building with Caliper --------------------- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8091a6df8..70b5b1d4f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -244,6 +244,9 @@ blt_add_executable( algorithm/MEMCPY.cpp algorithm/MEMCPY-Seq.cpp algorithm/MEMCPY-OMPTarget.cpp + algorithm/ATOMIC.cpp + algorithm/ATOMIC-Seq.cpp + algorithm/ATOMIC-OMPTarget.cpp comm/HALO_base.cpp comm/HALO_PACKING.cpp comm/HALO_PACKING-Seq.cpp diff --git a/src/algorithm/ATOMIC-Cuda.cpp b/src/algorithm/ATOMIC-Cuda.cpp new file mode 100644 index 000000000..b0d9d5198 --- /dev/null +++ b/src/algorithm/ATOMIC-Cuda.cpp @@ -0,0 +1,241 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, blockIdx.x); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + 
if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_CUDA ) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_block_"+std::to_string(block_size)); + + } + + }); + + } + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/ATOMIC-Hip.cpp b/src/algorithm/ATOMIC-Hip.cpp new file mode 100644 index 000000000..c04aa5928 --- /dev/null +++ b/src/algorithm/ATOMIC-Hip.cpp @@ -0,0 +1,241 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_thread(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_block(Real_ptr atomic, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, blockIdx.x); + } +} + + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_thread), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i); + }); + + } + stopTimer(); + + } else { + getCout() << "\n 
ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateBlock(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_block), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + +void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateBlock(vid); + + } + + t += 1; + + } + + }); + + } + + } + + }); + + } else { + + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void ATOMIC::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + if ( vid == Base_HIP ) { + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_block_"+std::to_string(block_size)); + + } + + }); + + } + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/ATOMIC-OMP.cpp b/src/algorithm/ATOMIC-OMP.cpp new file mode 100644 index 000000000..2f805d961 --- /dev/null +++ b/src/algorithm/ATOMIC-OMP.cpp @@ -0,0 +1,153 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runOpenMPVariantReplicate(VariantID vid)
+{
+#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          #pragma omp atomic
+          ATOMIC_BODY(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case Lambda_OpenMP : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+                               #pragma omp atomic
+                               ATOMIC_BODY(i);
+                             };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        #pragma omp parallel for
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_OpenMP : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::omp_parallel_for_exec>(
+          RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+            ATOMIC_RAJA_BODY(RAJA::omp_atomic, i);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+#else
+  RAJA_UNUSED_VAR(vid);
+#endif
+}
+
+
+void ATOMIC::runOpenMPVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPVariantReplicate<decltype(replication)::value>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OpenMP variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMP || vid == Lambda_OpenMP || vid == RAJA_OpenMP ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
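As an aside for readers following the macro machinery: a minimal standalone sketch of what the Base_OpenMP variant above reduces to once ATOMIC_DATA_SETUP, ATOMIC_BODY, and ATOMIC_DATA_TEARDOWN are expanded. The names are illustrative only, and a fixed replication of 64 stands in for the template parameter:

#include <cstddef>
#include <vector>

double atomic_throughput_sketch(std::size_t iend)
{
  constexpr std::size_t replication = 64;        // one slot per "lane"
  std::vector<double> atomic(replication, 0.0);  // ATOMIC_DATA_SETUP

  #pragma omp parallel for
  for (std::size_t i = 0; i < iend; ++i) {
    #pragma omp atomic
    atomic[i % replication] += 1.0;              // ATOMIC_BODY(i)
  }

  double final_sum = 0.0;                        // ATOMIC_DATA_TEARDOWN folds the
  for (std::size_t r = 0; r < replication; ++r) {// replicated slots back together
    final_sum += atomic[r];
  }
  return final_sum;                              // equals iend for any replication
}

Spreading the updates over `replication` distinct addresses reduces contention on any single location, which is exactly the effect the tuning sweep is designed to measure.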
diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp
new file mode 100644
index 000000000..a488a62ee
--- /dev/null
+++ b/src/algorithm/ATOMIC-OMPTarget.cpp
@@ -0,0 +1,127 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_TARGET_OPENMP)
+
+#include "common/OpenMPTargetDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+  //
+  // Define threads per team for target execution
+  //
+  const size_t threads_per_team = 256;
+
+template < size_t replication >
+void ATOMIC::runOpenMPTargetVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  if ( vid == Base_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      #pragma omp target is_device_ptr(atomic)
+      #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1)
+      for (Index_type i = ibegin; i < iend; ++i ) {
+        #pragma omp atomic
+        ATOMIC_BODY(i);
+      }
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_OpenMPTarget ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+          ATOMIC_RAJA_BODY(RAJA::omp_atomic, i);
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+void ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runOpenMPTargetVariantReplicate<decltype(replication)::value>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown OMP Target variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setOpenMPTargetTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) {
+
+    seq_for(atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_TARGET_OPENMP
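Because the replication factor is a template parameter, it cannot be chosen directly at run time; each run*Variant method instead walks the compile-time list with seq_for and matches a running counter against the runtime tune_idx. A self-contained sketch of that idiom, with plain stand-ins for camp's integral-constant list (Constant and seq_for_each are assumed names, not Suite API):

#include <cstddef>
#include <iostream>

template < std::size_t N >
struct Constant { static constexpr std::size_t value = N; };

// Stand-in for rajaperf's seq_for over a camp::list: call f on each element.
template < typename F, std::size_t... Ns >
void seq_for_each(F&& f, Constant<Ns>... cs) { (f(cs), ...); }  // C++17 fold

void run_tuning(std::size_t tune_idx)
{
  std::size_t t = 0;
  seq_for_each([&](auto replication) {
    if (t == tune_idx) {
      // decltype(replication)::value is a compile-time constant here, so it
      // can instantiate a template such as runSeqVariantReplicate<...>(vid).
      std::cout << "selected replication " << decltype(replication)::value << "\n";
    }
    t += 1;
  }, Constant<1>{}, Constant<256>{}, Constant<4096>{});
}

Note that the counter must advance in exactly the same order as in the matching set*TuningDefinitions method, which is why the two methods in each variant file mirror one another.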
diff --git a/src/algorithm/ATOMIC-Seq.cpp b/src/algorithm/ATOMIC-Seq.cpp
new file mode 100644
index 000000000..c8e9a68f0
--- /dev/null
+++ b/src/algorithm/ATOMIC-Seq.cpp
@@ -0,0 +1,146 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+template < size_t replication >
+void ATOMIC::runSeqVariantReplicate(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  ATOMIC_DATA_SETUP(replication);
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          ATOMIC_BODY(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto atomic_base_lam = [=](Index_type i) {
+                               ATOMIC_BODY(i);
+                             };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = ibegin; i < iend; ++i ) {
+          atomic_base_lam(i);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+          [=](Index_type i) {
+            ATOMIC_RAJA_BODY(RAJA::seq_atomic, i);
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif
+
+    default : {
+      getCout() << "\n  ATOMIC : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+  ATOMIC_DATA_TEARDOWN(replication);
+
+}
+
+
+void ATOMIC::runSeqVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        if (tune_idx == t) {
+
+          runSeqVariantReplicate<decltype(replication)::value>(vid);
+
+        }
+
+        t += 1;
+
+      }
+
+    });
+
+  } else {
+
+    getCout() << "\n  ATOMIC : Unknown Seq variant id = " << vid << std::endl;
+
+  }
+
+}
+
+void ATOMIC::setSeqTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_Seq || vid == Lambda_Seq || vid == RAJA_Seq ) {
+
+    seq_for(cpu_atomic_replications_type{}, [&](auto replication) {
+
+      if (run_params.numValidAtomicReplication() == 0u ||
+          run_params.validAtomicReplication(replication)) {
+
+        addVariantTuningName(vid, "replicate_"+std::to_string(replication));
+
+      }
+
+    });
+
+  }
+
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp
new file mode 100644
index 000000000..b2054c937
--- /dev/null
+++ b/src/algorithm/ATOMIC.cpp
@@ -0,0 +1,79 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "ATOMIC.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include "common/DataUtils.hpp"
+
+namespace rajaperf
+{
+namespace algorithm
+{
+
+
+ATOMIC::ATOMIC(const RunParams& params)
+  : KernelBase(rajaperf::Algorithm_ATOMIC, params)
+{
+  setDefaultProblemSize(1000000);
+  setDefaultReps(50);
+
+  setActualProblemSize( getTargetProblemSize() );
+
+  setItsPerRep( getActualProblemSize() );
+  setKernelsPerRep(1);
+  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) +
+                  (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() );
+  setFLOPsPerRep(getActualProblemSize());
+
+  setUsesFeature(Forall);
+  setUsesFeature(Atomic);
+
+  setVariantDefined( Base_Seq );
+  setVariantDefined( Lambda_Seq );
+  setVariantDefined( RAJA_Seq );
+
+  setVariantDefined( Base_OpenMP );
+  setVariantDefined( Lambda_OpenMP );
+  setVariantDefined( RAJA_OpenMP );
+
+  setVariantDefined( Base_OpenMPTarget );
+  setVariantDefined( RAJA_OpenMPTarget );
+
+  setVariantDefined( Base_CUDA );
+  setVariantDefined( Lambda_CUDA );
+  setVariantDefined( RAJA_CUDA );
+
+  setVariantDefined( Base_HIP );
+  setVariantDefined( Lambda_HIP );
+  setVariantDefined( RAJA_HIP );
+}
+
+ATOMIC::~ATOMIC()
+{
+}
+
+void ATOMIC::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  m_init = 0;
+  m_final = -static_cast<Real_type>(vid);
+}
+
+void ATOMIC::updateChecksum(VariantID vid, size_t tune_idx)
+{
+  checksum[vid][tune_idx] += static_cast<Checksum_type>(m_final);
+}
+
+void ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+{
+  (void) vid;
+}
+
+} // end namespace algorithm
+} // end namespace rajaperf
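The checksum logic above works because the teardown (defined in the header below) folds the replicated slots back into a single value: every iteration adds exactly 1.0 to some slot, so the folded result equals init + iend for any replication factor, keeping checksums comparable across tunings. A quick standalone check of that invariant (illustrative only, serial stand-in for the timed kernel):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  const std::size_t iend = 1000000;
  const std::size_t replications[] = {1, 64, 4096};
  for (std::size_t replication : replications) {
    std::vector<double> slots(replication, 0.0);
    for (std::size_t i = 0; i < iend; ++i) {
      slots[i % replication] += 1.0;   // what ATOMIC_BODY(i) does, serially
    }
    double final_sum = 0.0;            // the ATOMIC_DATA_TEARDOWN fold
    for (double s : slots) { final_sum += s; }
    assert(final_sum == static_cast<double>(iend));  // exact: counts below 2^53
  }
  return 0;
}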
diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp
new file mode 100644
index 000000000..5f1d8628b
--- /dev/null
+++ b/src/algorithm/ATOMIC.hpp
@@ -0,0 +1,107 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// ATOMIC kernel reference implementation:
+/// Test atomic throughput with an amount of replication known at compile time.
+///
+/// for (Index_type i = 0; i < N; ++i ) {
+///   atomic[i%replication] += 1;
+/// }
+///
+
+#ifndef RAJAPerf_Algorithm_ATOMIC_HPP
+#define RAJAPerf_Algorithm_ATOMIC_HPP
+
+#define ATOMIC_DATA_SETUP(replication) \
+  Real_type init = m_init; \
+  Real_ptr atomic; \
+  allocAndInitDataConst(atomic, replication, init, vid);
+
+#define ATOMIC_DATA_TEARDOWN(replication) \
+  { \
+    auto reset_atomic = scopedMoveData(atomic, replication, vid); \
+    m_final = init; \
+    for (size_t r = 0; r < replication; ++r ) { \
+      m_final += atomic[r]; \
+    } \
+  } \
+  deallocData(atomic, vid);
+
+#define ATOMIC_BODY(i) \
+  atomic[(i)%replication] += 1.0
+
+#define ATOMIC_RAJA_BODY(policy, i) \
+  RAJA::atomicAdd<policy>(&atomic[(i)%replication], 1.0)
+
+
+#include "common/KernelBase.hpp"
+
+namespace rajaperf
+{
+class RunParams;
+
+namespace algorithm
+{
+
+class ATOMIC : public KernelBase
+{
+public:
+
+  ATOMIC(const RunParams& params);
+
+  ~ATOMIC();
+
+  void setUp(VariantID vid, size_t tune_idx);
+  void updateChecksum(VariantID vid, size_t tune_idx);
+  void tearDown(VariantID vid, size_t tune_idx);
+
+  void runSeqVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPVariant(VariantID vid, size_t tune_idx);
+  void runCudaVariant(VariantID vid, size_t tune_idx);
+  void runHipVariant(VariantID vid, size_t tune_idx);
+  void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runKokkosVariant(VariantID vid, size_t tune_idx);
+
+  void setSeqTuningDefinitions(VariantID vid);
+  void setOpenMPTuningDefinitions(VariantID vid);
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  void setOpenMPTargetTuningDefinitions(VariantID vid);
+
+  template < size_t replication >
+  void runSeqVariantReplicate(VariantID vid);
+  template < size_t replication >
+  void runOpenMPVariantReplicate(VariantID vid);
+  template < size_t block_size, size_t replication >
+  void runCudaVariantReplicateGlobal(VariantID vid);
+  template < size_t block_size, size_t replication >
+  void runCudaVariantReplicateBlock(VariantID vid);
+  template < size_t block_size, size_t replication >
+  void runHipVariantReplicateGlobal(VariantID vid);
+  template < size_t block_size, size_t replication >
+  void runHipVariantReplicateBlock(VariantID vid);
+  template < size_t replication >
+  void runOpenMPTargetVariantReplicate(VariantID vid);
+
+private:
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
+  static const size_t default_cpu_atomic_replication = 64;
+  using cpu_atomic_replications_type = integer::make_atomic_replication_list_type<default_cpu_atomic_replication>;
+  static const size_t default_atomic_replication = 4096;
+  using atomic_replications_type = integer::make_atomic_replication_list_type<default_atomic_replication>;
+
+  Real_type m_init;
+  Real_type m_final;
+};
+
+} // end namespace algorithm
+} // end namespace rajaperf
+
+#endif  // closing endif for header file include guard
diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt
index 731bfdc76..43e3279e0 100644
--- a/src/algorithm/CMakeLists.txt
+++ b/src/algorithm/CMakeLists.txt
@@ -42,5 +42,11 @@ blt_add_library(
     MEMCPY-Cuda.cpp
     MEMCPY-OMP.cpp
     MEMCPY-OMPTarget.cpp
+    ATOMIC.cpp
+    ATOMIC-Seq.cpp
+    ATOMIC-Hip.cpp
+    ATOMIC-Cuda.cpp
+    ATOMIC-OMP.cpp
+    ATOMIC-OMPTarget.cpp
   DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS}
   )
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index dee369d98..bf6b7ddf2 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -168,8 +168,11 @@ Executor::Executor(int argc, char** argv)
     if (strlen(cc.adiak_cmake_hip_architectures) > 0) {
       adiak::value("cmake_hip_architectures", cc.adiak_cmake_hip_architectures);
     }
-    if (cc.adiak_gpu_targets_block_sizes.size() > 0) {
-      adiak::value("gpu_targets_block_sizes", cc.adiak_gpu_targets_block_sizes);
+    if (cc.adiak_gpu_block_sizes.size() > 0) {
+      adiak::value("gpu_block_sizes", cc.adiak_gpu_block_sizes);
+    }
+    if (cc.adiak_atomic_replications.size() > 0) {
+      adiak::value("atomic_replications", cc.adiak_atomic_replications);
     }
     if (cc.adiak_raja_hipcc_flags.size() > 0) {
       adiak::value("raja_hipcc_flags", cc.adiak_raja_hipcc_flags);
diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 87492efa0..d11b3e58e 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -138,6 +138,19 @@ using make_gpu_block_size_list_type =
       >::type
     >::type;
 
+// A camp::list of camp::integral_constant<size_t, I> types.
+// If atomic_replications from the configuration is not empty it is those atomic_replications,
+// otherwise it is a list containing just default_atomic_replication.
+// Invalid entries are removed according to validity_checker in either case.
+template < size_t default_atomic_replication, typename validity_checker = AllowAny >
+using make_atomic_replication_list_type =
+    typename detail::remove_invalid<validity_checker,
+      typename std::conditional< (camp::size<rajaperf::configuration::atomic_replications>::value > 0),
+        rajaperf::configuration::atomic_replications,
+        list_type<default_atomic_replication>
+      >::type
+    >::type;
+
 } // closing brace for integer namespace
 
 namespace gpu_algorithm {
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 32f32f64b..7cbec1c47 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -278,6 +278,21 @@ class KernelBase
                                     ptr, len, getDataAlignment());
   }
 
+  template <typename T>
+  void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, T val)
+  {
+    rajaperf::allocAndInitDataConst(dataSpace,
+                                    ptr, len, getDataAlignment(), val);
+  }
+
+  template <typename T>
+  rajaperf::AutoDataMover<T> scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len)
+  {
+    DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace);
+    rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment());
+    return {dataSpace, hds, ptr, len, getDataAlignment()};
+  }
+
   template <typename T>
   void copyData(DataSpace dst_dataSpace, T* dst_ptr,
                 DataSpace src_dataSpace, const T* src_ptr,
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index dc9d0e20b..17bd15bb7 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -104,6 +104,7 @@
 #include "algorithm/REDUCE_SUM.hpp"
 #include "algorithm/MEMSET.hpp"
 #include "algorithm/MEMCPY.hpp"
+#include "algorithm/ATOMIC.hpp"
 
 //
 // Comm kernels...
@@ -254,6 +255,7 @@ static const std::string KernelNames [] =
   std::string("Algorithm_REDUCE_SUM"),
   std::string("Algorithm_MEMSET"),
   std::string("Algorithm_MEMCPY"),
+  std::string("Algorithm_ATOMIC"),
 
 //
 // Comm kernels...
@@ -986,6 +988,10 @@ KernelBase* getKernelObject(KernelID kid,
       kernel = new algorithm::MEMCPY(run_params);
       break;
     }
+    case Algorithm_ATOMIC: {
+      kernel = new algorithm::ATOMIC(run_params);
+      break;
+    }
 
 //
 // Comm kernels...
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index a112a44d1..fdb2878c4 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -164,6 +164,7 @@ enum KernelID {
   Algorithm_REDUCE_SUM,
   Algorithm_MEMSET,
   Algorithm_MEMCPY,
+  Algorithm_ATOMIC,
 
 //
 // Comm kernels...
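How the CMake option becomes the type list used above: with -DRAJA_PERFSUITE_ATOMIC_REPLICATIONS=1,256,4096 the configured header defines atomic_replications as integer::list_type<1, 256, 4096>. A self-contained sketch of the empty-list fallback that make_atomic_replication_list_type implements, with plain stand-ins for the camp types and the validity filtering omitted:

#include <cstddef>
#include <type_traits>

template < std::size_t... Is > struct list_type {};   // stands in for camp::list

template < typename L > struct list_size;
template < std::size_t... Is >
struct list_size<list_type<Is...>> { static constexpr std::size_t value = sizeof...(Is); };

// Use the configured list when non-empty, otherwise a single default entry.
template < typename ConfiguredList, std::size_t Default >
using make_replication_list_t =
    std::conditional_t< (list_size<ConfiguredList>::value > 0),
                        ConfiguredList,
                        list_type<Default> >;

// Configured via CMake -> the configured list is used as-is:
static_assert(std::is_same_v<make_replication_list_t<list_type<1, 256, 4096>, 4096>,
                             list_type<1, 256, 4096>>);
// Empty CMake option -> the kernel's per-space default kicks in:
static_assert(std::is_same_v<make_replication_list_t<list_type<>, 4096>,
                             list_type<4096>>);

This is why ATOMIC can carry distinct defaults (64 for CPU, 4096 for GPU) while a single CMake list, when given, overrides both.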
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp
index 57dbab075..4d1e90df9 100644
--- a/src/common/RunParams.cpp
+++ b/src/common/RunParams.cpp
@@ -40,6 +40,7 @@ RunParams::RunParams(int argc, char** argv)
    data_alignment(RAJA::DATA_ALIGN),
    gpu_stream(1),
    gpu_block_sizes(),
+   atomic_replications(),
    mpi_size(1),
    mpi_rank(0),
    mpi_3d_division({-1, -1, -1}),
@@ -123,6 +124,10 @@ void RunParams::print(std::ostream& str) const
   for (size_t j = 0; j < gpu_block_sizes.size(); ++j) {
     str << "\n\t" << gpu_block_sizes[j];
   }
+  str << "\n atomic_replications = ";
+  for (size_t j = 0; j < atomic_replications.size(); ++j) {
+    str << "\n\t" << atomic_replications[j];
+  }
   str << "\n mpi_size = " << mpi_size;
   str << "\n mpi_3d_division = ";
   for (size_t j = 0; j < 3; ++j) {
@@ -465,6 +470,37 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
         input_state = BadInput;
       }
 
+    } else if ( opt == std::string("--atomic_replication") ) {
+
+      bool got_something = false;
+      bool done = false;
+      i++;
+      while ( i < argc && !done ) {
+        opt = std::string(argv[i]);
+        if ( opt.at(0) == '-' ) {
+          i--;
+          done = true;
+        } else {
+          got_something = true;
+          int atomic_replication = ::atoi( opt.c_str() );
+          if ( atomic_replication <= 0 ) {
+            getCout() << "\nBad input:"
+                      << " must give --atomic_replication POSITIVE values (int)"
+                      << std::endl;
+            input_state = BadInput;
+          } else {
+            atomic_replications.push_back(atomic_replication);
+          }
+          ++i;
+        }
+      }
+      if (!got_something) {
+        getCout() << "\nBad input:"
+                  << " must give --atomic_replication one or more values (int)"
+                  << std::endl;
+        input_state = BadInput;
+      }
+
     } else if ( opt == std::string("--mpi_3d_division") ) {
 
       int num_got = 0;
@@ -1077,6 +1113,14 @@ void RunParams::printHelpMessage(std::ostream& str) const
   str << "\t\t Example...\n"
       << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n";
 
+  str << "\t --atomic_replication [no default]\n"
+      << "\t      (atomic replications to run for all GPU kernels)\n"
+      << "\t      GPU kernels not supporting atomic_replication option will be skipped.\n"
+      << "\t      Behavior depends on kernel implementations and\n"
+      << "\t      values given via CMake variable RAJA_PERFSUITE_ATOMIC_REPLICATIONS.\n";
+  str << "\t\t Example...\n"
+      << "\t\t --atomic_replication 128 256 512 (runs kernels with atomic_replication 128, 256, and 512)\n\n";
+
   str << "\t --mpi_3d_division [no default]\n"
       << "\t      (number of mpi ranks in each dimension in a 3d grid)\n"
      << "\t      (3D MPI kernels will be skipped if the product of mpi_3d_division is not equal to the number of ranks)\n";
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp
index 798d64e0a..6d58a1302 100644
--- a/src/common/RunParams.hpp
+++ b/src/common/RunParams.hpp
@@ -134,6 +134,16 @@ class RunParams {
     }
     return false;
   }
+  size_t numValidAtomicReplication() const { return atomic_replications.size(); }
+  bool validAtomicReplication(size_t atomic_replication) const
+  {
+    for (size_t valid_atomic_replication : atomic_replications) {
+      if (valid_atomic_replication == atomic_replication) {
+        return true;
+      }
+    }
+    return false;
+  }
 
   int getMPISize() const { return mpi_size; }
   int getMPIRank() const { return mpi_rank; }
@@ -233,6 +243,7 @@ class RunParams {
   int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */
   std::vector<size_t> gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */
+  std::vector<size_t> atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */
 
   int mpi_size; /*!< Number of MPI
ranks */ int mpi_rank; /*!< Rank of this MPI process */ std::array mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */ diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 7c7350df2..679e8ffba 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -102,7 +102,8 @@ const adiak::version adiak_compiler_version = std::string("@CMAKE_CXX_COMPILER_V const adiak::version adiak_cuda_compiler_version = std::string("@CMAKE_CUDA_COMPILER_VERSION@"); constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@"; constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHIECTURES@"; -const std::vector adiak_gpu_targets_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@}; +const std::vector adiak_gpu_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@}; +const std::vector adiak_atomic_replications = {@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@}; const std::vector adiak_raja_hipcc_flags = str_to_list(std::string("@RAJA_HIPCC_FLAGS@")); const adiak::catstring adiak_mpi_cxx_compiler = std::string("@MPI_CXX_COMPILER@"); const adiak::catstring adiak_systype_build = std::string("@RAJAPERF_BUILD_SYSTYPE@"); @@ -112,6 +113,9 @@ const adiak::catstring adiak_machine_build = std::string("@RAJAPERF_BUILD_HOST@" // List of GPU block sizes using gpu_block_sizes = integer::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; +// List of GPU atomic replications +using atomic_replications = integer::list_type<@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@>; + // Name of user who ran code std::string user_run; From 1cc7a121cdf3874dd2eed9b0fc725c5c1d31fe2f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 15 Feb 2024 16:32:46 -0800 Subject: [PATCH 250/454] update BLT to v0.6.1 release --- blt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blt b/blt index a7f0a6ecc..148c53ecc 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81 +Subproject commit 148c53ecc8bcaad5eaa4c1e39cb8144b8f1388ae From 8913f3dd20934138544bf77694f47200c473665b Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 15 Feb 2024 16:33:32 -0800 Subject: [PATCH 251/454] update RAJA tpl and gitlab ci to match RAJA --- .gitlab-ci.yml | 6 ++++-- .gitlab/custom-jobs-and-variables.yml | 14 +++++++------- .gitlab/jobs/corona.yml | 16 +++++++--------- .gitlab/jobs/lassen.yml | 22 ++++++++++++++-------- .gitlab/jobs/poodle.yml | 16 +++++++++++----- .gitlab/jobs/ruby.yml | 16 +++++++++++----- .gitlab/jobs/tioga.yml | 14 ++++++++++---- .uberenv_config.json | 2 +- tpl/RAJA | 2 +- 9 files changed, 66 insertions(+), 42 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a9b83686..0d5f6bc6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,5 @@ ############################################################################### -# Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC # and RAJA Performance Suite project contributors. # See the RAJAPerf/LICENSE file for details. # @@ -49,7 +49,9 @@ variables: GITHUB_PROJECT_NAME: "RAJAPerf" GITHUB_PROJECT_ORG: "LLNL" # Set the build-and-test command. - JOB_CMD: "./scripts/gitlab/build_and_test.sh" + JOB_CMD: + value: "./scripts/gitlab/build_and_test.sh" + expand: false # Override the pattern describing branches that will skip the "draft PR filter # test". Add protected branches here. See default value in # preliminary-ignore-draft-pr.yml. 
diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index d4478afd7..78e6c28d3 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -22,7 +22,7 @@ variables: # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + PROJECT_RUBY_DEPS: "" # Poodle # Arguments for top level allocation @@ -35,7 +35,7 @@ variables: # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle - PROJECT_POODLE_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + PROJECT_POODLE_DEPS: "" # Corona # Arguments for top level allocation @@ -45,7 +45,7 @@ variables: # Project specific variants for corona PROJECT_CORONA_VARIANTS: "~shared ~openmp" # Project specific deps for corona - PROJECT_CORONA_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + PROJECT_CORONA_DEPS: "^blt@develop " # Tioga # Arguments for top level allocation @@ -53,18 +53,18 @@ variables: # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for tioga - PROJECT_TIOGA_VARIANTS: "~shared ~openmp" + PROJECT_TIOGA_VARIANTS: "~shared +openmp" # Project specific deps for tioga - PROJECT_TIOGA_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + PROJECT_TIOGA_DEPS: "^blt@develop " # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 16" + LASSEN_JOB_ALLOC: "1 -W 16 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen - PROJECT_LASSEN_DEPS: "^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + PROJECT_LASSEN_DEPS: "^blt@develop " # Configuration shared by build and test jobs specific to this project. # Not all configuration can be shared. Here projects can fine tune the diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index dc7de5077..bc9e0c7d1 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -6,6 +6,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define project specific variables. +.corona_reproducer_vars: &corona_reproducer_vars + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -13,10 +19,7 @@ # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -rocmcc_5_6_0_hip: - variables: - SPEC: "${PROJECT_CORONA_VARIANTS} +rocm amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ${PROJECT_CORONA_DEPS}" - extends: .job_on_corona +# No overridden jobs so far. ############ # Extra jobs @@ -25,11 +28,6 @@ rocmcc_5_6_0_hip: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -rocmcc_5_6_0_hip_mpi: - variables: - SPEC: "~shared ~openmp +rocm +mpi amdgpu_target=gfx906 %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" - extends: .job_on_corona - # With GitLab CI, included files cannot be empty. 
variables: INCLUDED_FILE_CANNOT_BE_EMPTY: "True" diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index 112972606..9d87bb2b0 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -6,6 +6,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## +# Override reproducer section to define project specific variables. +.lassen_reproducer_vars: &lassen_reproducer_vars + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -16,9 +22,9 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS} ^blt@develop" MODULE_LIST: "cuda/11.2.0" - LASSEN_JOB_ALLOC: "1 -W 120" + LASSEN_JOB_ALLOC: "1 -W 60 -q pci" extends: .job_on_lassen @@ -31,22 +37,22 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: gcc_8_3_1: variables: - SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@develop" extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@develop" MODULE_LIST: "cuda/11.5.0" - LASSEN_JOB_ALLOC: "1 --atsdisable -W 30" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ^blt@develop" MODULE_LIST: "cuda/11.5.0" - LASSEN_JOB_ALLOC: "1 --atsdisable -W 30" + LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" ########## # OTHERS @@ -54,7 +60,7 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@develop" extends: .job_on_lassen #clang_14_0_5_asan: diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 484b385a4..5d63efe5a 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -6,6 +6,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## +# Override reproducer section to define projet specific variables. 
+.poodle_reproducer_vars: &poodle_reproducer_vars + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -15,22 +21,22 @@ clang_14_0_6: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ^blt@develop" extends: .job_on_poodle gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ^blt@develop" extends: .job_on_poodle intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ^blt@develop" extends: .job_on_poodle intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ^blt@develop" allow_failure: true extends: .job_on_poodle @@ -43,6 +49,6 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@develop" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index e07f65dff..636ac96b9 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -6,6 +6,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## +# Override reproducer section to define project specific variables. +.ruby_reproducer_vars: &ruby_reproducer_vars + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -16,24 +22,24 @@ # Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ^blt@develop" extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ^blt@develop" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_ruby intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ^blt@develop" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_ruby intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ^blt@develop" extends: .job_on_ruby ############ @@ -45,5 +51,5 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@develop" extends: .job_on_ruby diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 9cd06caa1..0ed6ae169 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -6,6 +6,12 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################# +# Override reproducer section to define 
project specific variables. +.tioga_reproducer_vars: &tioga_reproducer_vars + - | + echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" + echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + ######################## # Overridden shared jobs ######################## @@ -22,12 +28,12 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -rocmcc_5_6_0_hip_openmp: +rocmcc_5_7_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ^blt@develop" extends: .job_on_tioga -rocmcc_5_6_0_hip_openmp_mpi: +rocmcc_5_7_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.6.0 ^hip@5.6.0 ^blt@git.a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81=develop" + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ^blt@develop" extends: .job_on_tioga diff --git a/.uberenv_config.json b/.uberenv_config.json index 79039830f..1568498cc 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "v0.20.1", +"spack_branch": "develop-2024-01-21", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", diff --git a/tpl/RAJA b/tpl/RAJA index f3e0fc5ed..82d1b926a 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit f3e0fc5ed3bb0e8dd7dcb4d822a00c0875ad0e7a +Subproject commit 82d1b926ada0fbb15a4a6e0adadc30c715cfda7b From 9e88d18d16fe47c21ba538b7b3afb9931d6b72de Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 16 Feb 2024 10:44:48 -0800 Subject: [PATCH 252/454] centralize BLT dependency --- .gitlab/custom-jobs-and-variables.yml | 4 ++-- .gitlab/jobs/poodle.yml | 10 +++++----- .gitlab/jobs/ruby.yml | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 78e6c28d3..4744a2052 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -22,7 +22,7 @@ variables: # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@develop " # Poodle # Arguments for top level allocation @@ -35,7 +35,7 @@ variables: # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle - PROJECT_POODLE_DEPS: "" + PROJECT_POODLE_DEPS: "^blt@develop " # Corona # Arguments for top level allocation diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 5d63efe5a..96fa9b8a4 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -21,22 +21,22 @@ clang_14_0_6: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ^blt@develop" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ^blt@develop" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ^blt@develop" + SPEC: 
"${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ^blt@develop" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle @@ -49,6 +49,6 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@develop" + SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 636ac96b9..6944bb010 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -22,24 +22,24 @@ # Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_ruby intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_ruby intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby ############ @@ -51,5 +51,5 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ^blt@develop" + SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby From 336c08129791d14d69492a7e8f0e0f336a138d3f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 16 Feb 2024 11:09:33 -0800 Subject: [PATCH 253/454] Consolidate remaining BLT dependencies --- .gitlab/jobs/lassen.yml | 14 +++++++------- .gitlab/jobs/tioga.yml | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index 9d87bb2b0..e895a38a8 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -22,7 +22,7 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS} ^blt@develop" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" LASSEN_JOB_ALLOC: "1 -W 60 -q pci" extends: .job_on_lassen @@ -37,20 +37,20 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: gcc_8_3_1: variables: - SPEC: " ~shared +openmp %gcc@8.3.1 ^blt@develop" + SPEC: " ~shared +openmp %gcc@8.3.1 ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda 
%gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^blt@develop" + SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ^blt@develop" + SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" @@ -60,12 +60,12 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ^blt@develop" + SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen #clang_14_0_5_asan: # variables: -# SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\"" +# SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\" ${PROJECT_LASSEN_DEPS}" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" # extends: .job_on_lassen @@ -73,5 +73,5 @@ clang_13_0_1_libcpp: # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: -# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers" +# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" # extends: .job_on_lassen diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 0ed6ae169..a3e03cdce 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -30,10 +30,10 @@ rocmcc_5_7_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga rocmcc_5_7_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga From ca42c5a9ab450b72a64b047c6003418869c4f9d3 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 16 Feb 2024 14:30:20 -0800 Subject: [PATCH 254/454] release candidate branch for v2024.02.0 release. --- CMakeLists.txt | 4 ++-- docs/conf.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4145d0c37..ddee3b0ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,8 +101,8 @@ if (ENABLE_OPENMP) add_definitions(-DRUN_OPENMP) endif () -set(RAJA_PERFSUITE_VERSION_MAJOR 2023) -set(RAJA_PERFSUITE_VERSION_MINOR 06) +set(RAJA_PERFSUITE_VERSION_MAJOR 2024) +set(RAJA_PERFSUITE_VERSION_MINOR 02) set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) diff --git a/docs/conf.py b/docs/conf.py index abbac1ada..072637997 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,9 +86,9 @@ # built documents. # # The short X.Y version. 
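# (For context: this release commit keeps two version sources in step. The
# CMakeLists.txt hunk above sets RAJA_PERFSUITE_VERSION_MAJOR to 2024 and
# RAJA_PERFSUITE_VERSION_MINOR to 02, and the Sphinx strings below mirror
# them as "2024.02" / "2024.02.0".)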
-version = u'2023.06' +version = u'2024.02' # The full version, including alpha/beta/rc tags. -release = u'2023.06.0' +release = u'2024.02.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 642874c7263a45ed2321c0d4eed28f3bfe60e250 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 15 Feb 2024 09:29:30 -0800 Subject: [PATCH 255/454] Aggregate to atomics per warp and block --- src/algorithm/ATOMIC-Cuda.cpp | 117 ++++++++++++++++++++++++++--- src/algorithm/ATOMIC-Hip.cpp | 117 ++++++++++++++++++++++++++--- src/algorithm/ATOMIC-OMP.cpp | 6 +- src/algorithm/ATOMIC-OMPTarget.cpp | 4 +- src/algorithm/ATOMIC-Seq.cpp | 6 +- src/algorithm/ATOMIC.hpp | 18 +++-- 6 files changed, 234 insertions(+), 34 deletions(-) diff --git a/src/algorithm/ATOMIC-Cuda.cpp b/src/algorithm/ATOMIC-Cuda.cpp index b0d9d5198..18e8a2492 100644 --- a/src/algorithm/ATOMIC-Cuda.cpp +++ b/src/algorithm/ATOMIC-Cuda.cpp @@ -14,6 +14,9 @@ #include "common/CudaDataUtils.hpp" +#include +#include + #include namespace rajaperf @@ -21,15 +24,37 @@ namespace rajaperf namespace algorithm { +const size_t warp_size = 32; + template < size_t block_size, size_t replication > __launch_bounds__(block_size) __global__ void atomic_replicate_thread(Real_ptr atomic, Index_type iend) { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i); - } + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage warp_reduce_storage; + val = WarpReduce(warp_reduce_storage).Sum(val); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i/warp_size, val); + } } template < size_t block_size, size_t replication > @@ -37,10 +62,19 @@ __launch_bounds__(block_size) __global__ void atomic_replicate_block(Real_ptr atomic, Index_type iend) { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - ATOMIC_RAJA_BODY(RAJA::cuda_atomic, blockIdx.x); - } + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage block_reduce_storage; + val = BlockReduce(block_reduce_storage).Sum(val); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, blockIdx.x, val); + } } @@ -79,7 +113,7 @@ void ATOMIC::runCudaVariantReplicateGlobal(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i); + ATOMIC_RAJA_BODY(RAJA::cuda_atomic, i, ATOMIC_VALUE); }); } @@ -92,11 +126,44 @@ void ATOMIC::runCudaVariantReplicateGlobal(VariantID vid) ATOMIC_DATA_TEARDOWN(replication); } +template < size_t block_size, size_t replication > +void ATOMIC::runCudaVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + template < size_t block_size, size_t replication > void ATOMIC::runCudaVariantReplicateBlock(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -158,6 +225,24 @@ void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) if ( vid == Base_CUDA ) { + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + seq_for(atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || @@ -213,6 +298,18 @@ void ATOMIC::setCudaTuningDefinitions(VariantID vid) if ( vid == Base_CUDA ) { + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + seq_for(atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || diff --git a/src/algorithm/ATOMIC-Hip.cpp b/src/algorithm/ATOMIC-Hip.cpp index c04aa5928..c7af9c946 100644 --- a/src/algorithm/ATOMIC-Hip.cpp +++ b/src/algorithm/ATOMIC-Hip.cpp @@ -14,6 +14,9 @@ #include "common/HipDataUtils.hpp" +#include +#include + #include namespace rajaperf @@ -21,6 +24,8 @@ namespace rajaperf namespace algorithm { +const size_t warp_size = 64; + template < size_t block_size, size_t replication > __launch_bounds__(block_size) __global__ void atomic_replicate_thread(Real_ptr atomic, @@ -28,7 +33,27 @@ __global__ void atomic_replicate_thread(Real_ptr atomic, { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - ATOMIC_RAJA_BODY(RAJA::hip_atomic, i); + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); + } +} + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void atomic_replicate_warp(Real_ptr atomic, + Index_type iend) +{ + Real_type val = 0; + + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + val = ATOMIC_VALUE; + } + + using WarpReduce = rocprim::warp_reduce; + __shared__ typename WarpReduce::storage_type warp_reduce_storage; + WarpReduce().reduce(val, val, warp_reduce_storage); + if ((threadIdx.x % warp_size) == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i/warp_size, val); } } @@ -37,9 +62,18 @@ __launch_bounds__(block_size) __global__ void atomic_replicate_block(Real_ptr atomic, Index_type iend) { + Real_type val = 0; + Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - ATOMIC_RAJA_BODY(RAJA::hip_atomic, blockIdx.x); + val = ATOMIC_VALUE; + } + + using BlockReduce = rocprim::block_reduce; + __shared__ typename BlockReduce::storage_type block_reduce_storage; + BlockReduce().reduce(val, val, block_reduce_storage); + if (threadIdx.x == 0) { + ATOMIC_RAJA_BODY(RAJA::hip_atomic, blockIdx.x, val); } } @@ -79,7 
+113,7 @@ void ATOMIC::runHipVariantReplicateGlobal(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - ATOMIC_RAJA_BODY(RAJA::hip_atomic, i); + ATOMIC_RAJA_BODY(RAJA::hip_atomic, i, ATOMIC_VALUE); }); } @@ -92,11 +126,44 @@ void ATOMIC::runHipVariantReplicateGlobal(VariantID vid) ATOMIC_DATA_TEARDOWN(replication); } +template < size_t block_size, size_t replication > +void ATOMIC::runHipVariantReplicateWarp(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + ATOMIC_DATA_SETUP(replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (atomic_replicate_warp), + grid_size, block_size, + shmem, res.get_stream(), + atomic, + iend ); + + } + stopTimer(); + + } else { + getCout() << "\n ATOMIC : Unknown Hip variant id = " << vid << std::endl; + } + + ATOMIC_DATA_TEARDOWN(replication); +} + template < size_t block_size, size_t replication > void ATOMIC::runHipVariantReplicateBlock(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; @@ -140,8 +207,8 @@ void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) seq_for(atomic_replications_type{}, [&](auto replication) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { if (tune_idx == t) { @@ -160,8 +227,26 @@ void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) seq_for(atomic_replications_type{}, [&](auto replication) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateWarp(vid); + + } + + t += 1; + + } + + }); + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { if (tune_idx == t) { @@ -215,8 +300,20 @@ void ATOMIC::setHipTuningDefinitions(VariantID vid) seq_for(atomic_replications_type{}, [&](auto replication) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_warp_"+std::to_string(block_size)); + + } + + }); + + seq_for(atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ "_block_"+std::to_string(block_size)); diff --git a/src/algorithm/ATOMIC-OMP.cpp b/src/algorithm/ATOMIC-OMP.cpp index 2f805d961..ae3863bb1 100644 --- a/src/algorithm/ATOMIC-OMP.cpp +++ b/src/algorithm/ATOMIC-OMP.cpp @@ -39,7 +39,7 @@ void ATOMIC::runOpenMPVariantReplicate(VariantID vid) #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { #pragma omp atomic - ATOMIC_BODY(i); + ATOMIC_BODY(i, 
ATOMIC_VALUE); } } @@ -52,7 +52,7 @@ void ATOMIC::runOpenMPVariantReplicate(VariantID vid) auto atomic_base_lam = [=](Index_type i) { #pragma omp atomic - ATOMIC_BODY(i); + ATOMIC_BODY(i, ATOMIC_VALUE); }; startTimer(); @@ -76,7 +76,7 @@ void ATOMIC::runOpenMPVariantReplicate(VariantID vid) RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - ATOMIC_RAJA_BODY(RAJA::omp_atomic, i); + ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE); }); } diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp index a488a62ee..e7a3bca1c 100644 --- a/src/algorithm/ATOMIC-OMPTarget.cpp +++ b/src/algorithm/ATOMIC-OMPTarget.cpp @@ -44,7 +44,7 @@ void ATOMIC::runOpenMPTargetReplicate(VariantID vid) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { #pragma omp atomic - ATOMIC_BODY(i); + ATOMIC_BODY(i, ATOMIC_VALUE); } } @@ -57,7 +57,7 @@ void ATOMIC::runOpenMPTargetReplicate(VariantID vid) RAJA::forall>( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - ATOMIC_RAJA_BODY(RAJA::omp_atomic, i); + ATOMIC_RAJA_BODY(RAJA::omp_atomic, i, ATOMIC_VALUE); }); } diff --git a/src/algorithm/ATOMIC-Seq.cpp b/src/algorithm/ATOMIC-Seq.cpp index c8e9a68f0..1cccb8a6b 100644 --- a/src/algorithm/ATOMIC-Seq.cpp +++ b/src/algorithm/ATOMIC-Seq.cpp @@ -35,7 +35,7 @@ void ATOMIC::runSeqVariantReplicate(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type i = ibegin; i < iend; ++i ) { - ATOMIC_BODY(i); + ATOMIC_BODY(i, ATOMIC_VALUE); } } @@ -48,7 +48,7 @@ void ATOMIC::runSeqVariantReplicate(VariantID vid) case Lambda_Seq : { auto atomic_base_lam = [=](Index_type i) { - ATOMIC_BODY(i); + ATOMIC_BODY(i, ATOMIC_VALUE); }; startTimer(); @@ -71,7 +71,7 @@ void ATOMIC::runSeqVariantReplicate(VariantID vid) RAJA::forall( RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - ATOMIC_RAJA_BODY(RAJA::seq_atomic, i); + ATOMIC_RAJA_BODY(RAJA::seq_atomic, i, ATOMIC_VALUE); }); } diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp index 5f1d8628b..72255b1ca 100644 --- a/src/algorithm/ATOMIC.hpp +++ b/src/algorithm/ATOMIC.hpp @@ -33,11 +33,13 @@ } \ deallocData(atomic, vid); -#define ATOMIC_BODY(i) \ - atomic[(i)%replication] += 1.0 +#define ATOMIC_VALUE 1.0 -#define ATOMIC_RAJA_BODY(policy, i) \ - RAJA::atomicAdd(&atomic[(i)%replication], 1.0) +#define ATOMIC_BODY(i, val) \ + atomic[(i)%replication] += (val) + +#define ATOMIC_RAJA_BODY(policy, i, val) \ + RAJA::atomicAdd(&atomic[(i)%replication], (val)) #include "common/KernelBase.hpp" @@ -81,10 +83,14 @@ class ATOMIC : public KernelBase template < size_t block_size, size_t replication > void runCudaVariantReplicateGlobal(VariantID vid); template < size_t block_size, size_t replication > - void runCudaVariantReplicateBlock(VariantID vid); - template < size_t block_size, size_t replication > void runHipVariantReplicateGlobal(VariantID vid); template < size_t block_size, size_t replication > + void runCudaVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateWarp(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateBlock(VariantID vid); + template < size_t block_size, size_t replication > void runHipVariantReplicateBlock(VariantID vid); template < size_t replication > void runOpenMPTargetVariantReplicate(VariantID vid); From 25ac9afbfecd3b717134e52109039de5f7118303 Mon Sep 17 00:00:00 2001 From: Jason 
Burmark Date: Fri, 16 Feb 2024 12:59:41 -0800 Subject: [PATCH 256/454] Fix naming --- src/algorithm/ATOMIC-Cuda.cpp | 12 ++++++------ src/algorithm/ATOMIC-Hip.cpp | 12 ++++++------ src/algorithm/ATOMIC-OMPTarget.cpp | 4 ++-- src/algorithm/ATOMIC.hpp | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/algorithm/ATOMIC-Cuda.cpp b/src/algorithm/ATOMIC-Cuda.cpp index 18e8a2492..a286c60d2 100644 --- a/src/algorithm/ATOMIC-Cuda.cpp +++ b/src/algorithm/ATOMIC-Cuda.cpp @@ -205,7 +205,7 @@ void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -225,7 +225,7 @@ void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) if ( vid == Base_CUDA ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -243,7 +243,7 @@ void ATOMIC::runCudaVariant(VariantID vid, size_t tune_idx) }); - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -284,7 +284,7 @@ void ATOMIC::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -298,7 +298,7 @@ void ATOMIC::setCudaTuningDefinitions(VariantID vid) if ( vid == Base_CUDA ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -310,7 +310,7 @@ void ATOMIC::setCudaTuningDefinitions(VariantID vid) }); - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { diff --git a/src/algorithm/ATOMIC-Hip.cpp b/src/algorithm/ATOMIC-Hip.cpp index c7af9c946..fbb103596 100644 --- a/src/algorithm/ATOMIC-Hip.cpp +++ b/src/algorithm/ATOMIC-Hip.cpp @@ -205,7 +205,7 @@ void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -225,7 +225,7 @@ void ATOMIC::runHipVariant(VariantID vid, size_t tune_idx) if ( vid == Base_HIP ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -243,7 +243,7 @@ void ATOMIC::runHipVariant(VariantID vid, size_t 
tune_idx) }); - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -284,7 +284,7 @@ void ATOMIC::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -298,7 +298,7 @@ void ATOMIC::setHipTuningDefinitions(VariantID vid) if ( vid == Base_HIP ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -310,7 +310,7 @@ void ATOMIC::setHipTuningDefinitions(VariantID vid) }); - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp index e7a3bca1c..bbd3d6b67 100644 --- a/src/algorithm/ATOMIC-OMPTarget.cpp +++ b/src/algorithm/ATOMIC-OMPTarget.cpp @@ -77,7 +77,7 @@ void ATOMIC::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { @@ -106,7 +106,7 @@ void ATOMIC::setOpenMPTargetTuningDefinitions(VariantID vid) { if ( vid == Base_OpenMPTarget || vid == RAJA_OpenMPTarget ) { - seq_for(atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(replication)) { diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp index 72255b1ca..55ab41ad8 100644 --- a/src/algorithm/ATOMIC.hpp +++ b/src/algorithm/ATOMIC.hpp @@ -101,7 +101,7 @@ class ATOMIC : public KernelBase static const size_t default_cpu_atomic_replication = 64; using cpu_atomic_replications_type = integer::make_atomic_replication_list_type; static const size_t default_atomic_replication = 4096; - using atomic_replications_type = integer::make_atomic_replication_list_type; + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; Real_type m_init; Real_type m_final; From 5e5bcb947a228193a161c0db3fdfa54d6b080a3a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 13 Feb 2024 10:23:42 -0800 Subject: [PATCH 257/454] Split gpu scan implementations into separate files --- src/basic/INDEXLIST-Cuda.cpp | 179 +------------------------------- src/basic/INDEXLIST-Hip.cpp | 178 +------------------------------- src/common/CudaGridScan.hpp | 193 ++++++++++++++++++++++++++++++++++ src/common/HipGridScan.hpp | 195 +++++++++++++++++++++++++++++++++++ 4 files changed, 397 insertions(+), 348 deletions(-) create mode 100644 src/common/CudaGridScan.hpp create mode 100644 src/common/HipGridScan.hpp diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 
131b74f7a..6cd0bce93 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" - -#include -#include -#include -#include +#include "common/CudaGridScan.hpp" #include @@ -26,172 +22,6 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for CUDA execution - // - const size_t warp_size = 32; - const size_t items_per_thread = 15; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == gridDim.x-1); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned warp_index_mask = (1u << warp_index); - const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); - - using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; - using BlockExchange = cub::BlockExchange; - using WarpReduce = cub::WarpReduce; - - union SharedStorage { - typename BlockScan::TempStorage block_scan_storage; - typename BlockExchange::TempStorage block_exchange_storage; - typename WarpReduce::TempStorage warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); - __syncthreads(); - - - BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); - __syncthreads(); - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned prev_blocks_ready_ballot = 0u; - unsigned prev_grids_ready_ballot = 0u; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure 
previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffu); - - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0u) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. - if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); - prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } - } -} - template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void indexlist(Real_ptr x, @@ -222,7 +52,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::cuda::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -240,6 +70,7 @@ __global__ void indexlist(Real_ptr x, } } + template < size_t block_size > void INDEXLIST::runCudaVariantImpl(VariantID vid) { @@ -253,7 +84,7 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::cuda::grid_scan_items_per_thread); const size_t shmem_size = 0; Index_type* len; @@ -270,7 +101,7 @@ void 
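// (Annotation on the look-back protocol being moved out of this file,
//  inferred from the code itself: block_readys[i] is a three-state flag per
//  block, where 0 = not ready, 1 = block_counts[i] is valid, and
//  2 = grid_counts[i] is also valid. atomicCAS(&block_readys[i], 11u, 11u)
//  can never match a stored state, so it never writes and serves purely as
//  an atomic load of the flag. The branch below takes the grid_counts
//  shortcut once some predecessor block has published a full grid total.)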
INDEXLIST::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchCudaKernel( (indexlist), + RPlaunchCudaKernel( (indexlist), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, list+ibegin, diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 9c4bff838..84c5d36ea 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -13,11 +13,7 @@ #if defined(RAJA_ENABLE_HIP) #include "common/HipDataUtils.hpp" - -#include -#include -#include -#include +#include "common/HipGridScan.hpp" #include @@ -26,172 +22,6 @@ namespace rajaperf namespace basic { - // - // Define magic numbers for HIP execution - // - const size_t warp_size = 64; - const size_t items_per_thread = 8; - - -// perform a grid scan on val and returns the result at each thread -// in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread > -__device__ void grid_scan(const int block_id, - Index_type (&val)[items_per_thread], - Index_type (&exclusive)[items_per_thread], - Index_type (&inclusive)[items_per_thread], - Index_type* block_counts, - Index_type* grid_counts, - unsigned* block_readys) -{ - const bool first_block = (block_id == 0); - const bool last_block = (block_id == static_cast(gridDim.x-1)); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned long long warp_index_mask = (1ull << warp_index); - const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); - - using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; - using BlockExchange = rocprim::block_exchange; - using WarpReduce = rocprim::warp_reduce; - - union SharedStorage { - typename BlockScan::storage_type block_scan_storage; - typename BlockExchange::storage_type block_exchange_storage; - typename WarpReduce::storage_type warp_reduce_storage; - volatile Index_type prev_grid_count; - }; - __shared__ SharedStorage s_temp_storage; - - - BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); - __syncthreads(); - - - BlockScan().exclusive_scan(val, exclusive, Index_type{0}, s_temp_storage.block_scan_storage); - __syncthreads(); - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } - - BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } - - } else { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } - - // get prev_grid_count using last warp in block - if (last_warp) { - - 
Index_type prev_grid_count = 0; - - // accumulate previous block counts into registers of warp - - int prev_block_base_id = block_id - warp_size; - - unsigned prev_block_ready = 0u; - unsigned long long prev_blocks_ready_ballot = 0ull; - unsigned long long prev_grids_ready_ballot = 0ull; - - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - - } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); - - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - - if (prev_grids_ready_ballot != 0ull) { - break; - } - - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - - prev_block_ready = 0u; - - prev_block_base_id -= warp_size; - } - - const int prev_block_id = prev_block_base_id + warp_index; - - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - } - - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. 
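// (The same warp-width primitives removed here also drive the ATOMIC
//  kernels added earlier in this series. A minimal standalone sketch of
//  that warp-aggregation idea in HIP/rocprim form; the kernel name, the
//  double* output, and the per-thread value 1.0 are illustrative
//  assumptions, and the rocprim usage mirrors atomic_replicate_warp above.)

#include <hip/hip_runtime.h>
#include <rocprim/rocprim.hpp>

template < unsigned block_size >
__launch_bounds__(block_size)
__global__ void warp_aggregated_add(double* total, long n)
{
  constexpr unsigned warp_size = 64;  // wavefront width on AMD GPUs
  long i = blockIdx.x * block_size + threadIdx.x;
  double val = (i < n) ? 1.0 : 0.0;   // per-thread contribution

  using WarpReduce = rocprim::warp_reduce<double, warp_size>;
  __shared__ typename WarpReduce::storage_type storage;
  // Sum within the wavefront first, so only one lane touches memory:
  WarpReduce().reduce(val, val, storage);
  if (threadIdx.x % warp_size == 0) {
    atomicAdd(total, val);            // one atomic per warp, not per thread
  }
}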
- if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } - - - WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); - prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - - if (last_thread) { - - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } - - s_temp_storage.prev_grid_count = prev_grid_count; - } - } - - __syncthreads(); - Index_type prev_grid_count = s_temp_storage.prev_grid_count; - - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; - } - } -} - template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void indexlist(Real_ptr x, @@ -222,7 +52,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - grid_scan( + detail::hip::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -253,7 +83,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::hip::grid_scan_items_per_thread); const size_t shmem_size = 0; Index_type* len; @@ -271,7 +101,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchHipKernel( (indexlist), + RPlaunchHipKernel( (indexlist), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, list+ibegin, diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp new file mode 100644 index 000000000..cd237d7ad --- /dev/null +++ b/src/common/CudaGridScan.hpp @@ -0,0 +1,193 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#if defined(RAJA_ENABLE_CUDA) + +#include +#include +#include +#include + +namespace rajaperf +{ +namespace detail +{ +namespace cuda +{ + +// +// Define magic numbers for CUDA execution +// +const size_t warp_size = 32; +const size_t grid_scan_items_per_thread = 15; + + +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread, typename DataType > +__device__ void grid_scan(const int block_id, + DataType (&val)[items_per_thread], + DataType (&exclusive)[items_per_thread], + DataType (&inclusive)[items_per_thread], + DataType* block_counts, + DataType* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id == gridDim.x-1); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned warp_index_mask = (1u << warp_index); + const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); + + using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; + using BlockExchange = cub::BlockExchange; + using WarpReduce = cub::WarpReduce; + + union SharedStorage { + typename BlockScan::TempStorage block_scan_storage; + typename BlockExchange::TempStorage block_exchange_storage; + typename WarpReduce::TempStorage warp_reduce_storage; + volatile DataType prev_grid_count; + }; + __shared__ SharedStorage s_temp_storage; + + + BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); + __syncthreads(); + + + BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); + __syncthreads(); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); + __syncthreads(); + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); + __syncthreads(); + if (first_block) { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + DataType prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned prev_blocks_ready_ballot = 0u; + unsigned prev_grids_ready_ballot = 0u; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + do { + 
prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + + } while (prev_blocks_ready_ballot != 0xffffffffu); + + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + + if (prev_grids_ready_ballot != 0u) { + break; + } + + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + + prev_block_ready = 0u; + + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; + + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } + + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) + + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. + if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } + + + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + + if (last_thread) { + + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } + + s_temp_storage.prev_grid_count = prev_grid_count; + } + } + + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } + } +} + +} // end namespace cuda +} // end namespace detail +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp new file mode 100644 index 000000000..9918776e1 --- /dev/null +++ b/src/common/HipGridScan.hpp @@ -0,0 +1,195 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#if defined(RAJA_ENABLE_HIP) + +#include +#include +#include +#include + +#include + +namespace rajaperf +{ +namespace detail +{ +namespace hip +{ + +// +// Define magic numbers for HIP execution +// +const size_t warp_size = 64; +const size_t grid_scan_items_per_thread = 8; + + +// perform a grid scan on val and returns the result at each thread +// in exclusive and inclusive, note that val is used as scratch space +template < size_t block_size, size_t items_per_thread, typename DataType > +__device__ void grid_scan(const int block_id, + DataType (&val)[items_per_thread], + DataType (&exclusive)[items_per_thread], + DataType (&inclusive)[items_per_thread], + DataType* block_counts, + DataType* grid_counts, + unsigned* block_readys) +{ + const bool first_block = (block_id == 0); + const bool last_block = (block_id == static_cast(gridDim.x-1)); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned long long warp_index_mask = (1ull << warp_index); + const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); + + using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; + using BlockExchange = rocprim::block_exchange; + using WarpReduce = rocprim::warp_reduce; + + union SharedStorage { + typename BlockScan::storage_type block_scan_storage; + typename BlockExchange::storage_type block_exchange_storage; + typename WarpReduce::storage_type warp_reduce_storage; + volatile DataType prev_grid_count; + }; + __shared__ SharedStorage s_temp_storage; + + + BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); + __syncthreads(); + + + BlockScan().exclusive_scan(val, exclusive, DataType{0}, s_temp_storage.block_scan_storage); + __syncthreads(); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } + + BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + if (first_block) { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } + + } else { + + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } + + // get prev_grid_count using last warp in block + if (last_warp) { + + DataType prev_grid_count = 0; + + // accumulate previous block counts into registers of warp + + int prev_block_base_id = block_id - warp_size; + + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; + + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is 
ready
+      while (prev_block_base_id >= 0) {
+
+        const int prev_block_id = prev_block_base_id + warp_index;
+
+        // ensure previous block_counts are ready
+        do {
+          prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+
+          prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u);
+
+        } while (prev_blocks_ready_ballot != 0xffffffffffffffffull);
+
+        prev_grids_ready_ballot = __ballot(prev_block_ready == 2u);
+
+        if (prev_grids_ready_ballot != 0ull) {
+          break;
+        }
+
+        __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+        // accumulate block_counts for prev_block_id
+        prev_grid_count += block_counts[prev_block_id];
+
+        prev_block_ready = 0u;
+
+        prev_block_base_id -= warp_size;
+      }
+
+      const int prev_block_id = prev_block_base_id + warp_index;
+
+      // ensure previous block_counts are ready
+      // this checks that block counts is ready for all blocks above
+      // the highest grid count that is ready
+      while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) {
+
+        if (prev_block_id >= 0) {
+          prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u);
+        }
+
+        prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u);
+        prev_grids_ready_ballot = __ballot(prev_block_ready == 2u);
+      }
+      __threadfence(); // ensure block_counts or grid_counts ready (acquire)
+
+      // read one grid_count from a block with id grid_count_ready_id
+      // and read the block_counts from blocks with higher ids.
+      if (warp_index_mask > prev_grids_ready_ballot) {
+        // accumulate block_counts for prev_block_id
+        prev_grid_count += block_counts[prev_block_id];
+      } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) {
+        // accumulate grid_count for grid_count_ready_id
+        prev_grid_count += grid_counts[prev_block_id];
+      }
+
+
+      WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage);
+      prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp
+
+      if (last_thread) {
+
+        if (!last_block) {
+          grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block
+          __threadfence(); // ensure grid_counts ready (release)
+          atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready
+        }
+
+        s_temp_storage.prev_grid_count = prev_grid_count;
+      }
+    }
+
+    __syncthreads();
+    DataType prev_grid_count = s_temp_storage.prev_grid_count;
+
+    for (size_t ti = 0; ti < items_per_thread; ++ti) {
+      exclusive[ti] = prev_grid_count + exclusive[ti];
+      inclusive[ti] = prev_grid_count + inclusive[ti];
+    }
+  }
+}
+
+} // end namespace hip
+} // end namespace detail
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_HIP

From 9102808981fe1b9b2de5e3d22ecbc02fc20d4b2c Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Tue, 13 Feb 2024 10:24:09 -0800
Subject: [PATCH 258/454] Use gpu scan implementations in SCAN

---
 src/algorithm/SCAN-Cuda.cpp | 137 ++++++++++++++++++++++++++++++++++-
 src/algorithm/SCAN-Hip.cpp  | 140 +++++++++++++++++++++++++++++++++++-
 src/algorithm/SCAN.hpp      |  12 +++-
 3 files changed, 285 insertions(+), 4 deletions(-)

diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp
index 674e25f5a..d3500d47e 100644
--- a/src/algorithm/SCAN-Cuda.cpp
+++ b/src/algorithm/SCAN-Cuda.cpp
@@ -16,6 +16,7 @@
 #include "cub/util_allocator.cuh"
 
 #include "common/CudaDataUtils.hpp"
+#include "common/CudaGridScan.hpp"
 
 #include
 
@@ -24,8 +25,46 @@ namespace rajaperf
 namespace algorithm
 {
 
+template < size_t block_size, size_t items_per_thread
> +__launch_bounds__(block_size) +__global__ void scan(Real_ptr x, + Real_ptr y, + Real_ptr block_counts, + Real_ptr grid_counts, + unsigned* block_readys, + Index_type iend) +{ + // blocks do start running in order in cuda, so a block with a higher + // index can wait on a block with a lower index without deadlocking + // (replace with an atomicInc if this changes) + const int block_id = blockIdx.x; + + Real_type vals[items_per_thread]; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + if (i < iend) { + vals[ti] = x[i]; + } else { + vals[ti] = 0; + } + } + + Real_type exclusives[items_per_thread]; + Real_type inclusives[items_per_thread]; + detail::cuda::grid_scan( + block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + if (i < iend) { + y[i] = exclusives[ti]; + } + } +} + -void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void SCAN::runCudaVariantCub(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -80,6 +119,53 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); + } else { + getCout() << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void SCAN::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + SCAN_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::cuda::grid_scan_items_per_thread); + const size_t shmem_size = 0; + + Real_ptr block_counts; + allocData(DataSpace::CudaDevice, block_counts, grid_size); + Real_ptr grid_counts; + allocData(DataSpace::CudaDevice, grid_counts, grid_size); + unsigned* block_readys; + allocData(DataSpace::CudaDevice, block_readys, grid_size); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + RPlaunchCudaKernel( (scan), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, y+ibegin, + block_counts, grid_counts, block_readys, + iend-ibegin ); + + } + stopTimer(); + + deallocData(DataSpace::CudaDevice, block_counts); + deallocData(DataSpace::CudaDevice, grid_counts); + deallocData(DataSpace::CudaDevice, block_readys); + } else if ( vid == RAJA_CUDA ) { startTimer(); @@ -95,6 +181,55 @@ void SCAN::runCudaVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } } + +void SCAN::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + runCudaVariantCub(vid); + + } + + t += 1; + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + runCudaVariantImpl(vid); + + } + + t += 1; + + } else { + + getCout() << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; + + } +} + +void SCAN::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + addVariantTuningName(vid, "cub"); + + } + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + addVariantTuningName(vid, "default"); + + } +} + } // end 
namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 6e7135188..9723d46a4 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -21,6 +21,7 @@ #endif #include "common/HipDataUtils.hpp" +#include "common/HipGridScan.hpp" #include @@ -29,8 +30,46 @@ namespace rajaperf namespace algorithm { +template < size_t block_size, size_t items_per_thread > +__launch_bounds__(block_size) +__global__ void scan(Real_ptr x, + Real_ptr y, + Real_ptr block_counts, + Real_ptr grid_counts, + unsigned* block_readys, + Index_type iend) +{ + // It looks like blocks do not start running in order in hip, so a block + // with a higher index can't wait on a block with a lower index without + // deadlocking (have to replace with an atomicInc) + const int block_id = blockIdx.x; + + Real_type vals[items_per_thread]; + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + if (i < iend) { + vals[ti] = x[i]; + } else { + vals[ti] = 0; + } + } + + Real_type exclusives[items_per_thread]; + Real_type inclusives[items_per_thread]; + detail::hip::grid_scan( + block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); + + for (size_t ti = 0; ti < items_per_thread; ++ti) { + Index_type i = block_id * block_size * items_per_thread + ti * block_size + threadIdx.x; + if (i < iend) { + y[i] = exclusives[ti]; + } + } +} -void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) + +void SCAN::runHipVariantRocprim(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -107,12 +146,60 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); + } else { + getCout() << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; + } +} + +template < size_t block_size > +void SCAN::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + SCAN_DATA_SETUP; + + if ( vid == Base_HIP ) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::hip::grid_scan_items_per_thread); + const size_t shmem_size = 0; + + Real_ptr block_counts; + allocData(DataSpace::HipDevice, block_counts, grid_size); + Real_ptr grid_counts; + allocData(DataSpace::HipDevice, grid_counts, grid_size); + unsigned* block_readys; + allocData(DataSpace::HipDevice, block_readys, grid_size); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, + res.get_stream()) ); + + RPlaunchHipKernel( (scan), + grid_size, block_size, + shmem_size, res.get_stream(), + x+ibegin, y+ibegin, + block_counts, grid_counts, block_readys, + iend-ibegin ); + + } + stopTimer(); + + deallocData(DataSpace::HipDevice, block_counts); + deallocData(DataSpace::HipDevice, grid_counts); + deallocData(DataSpace::HipDevice, block_readys); + } else if ( vid == RAJA_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::exclusive_scan< RAJA::hip_exec >(res, RAJA_SCAN_ARGS); + RAJA::exclusive_scan< RAJA::hip_exec >(res, RAJA_SCAN_ARGS); } stopTimer(); @@ -122,6 +209,55 @@ void SCAN::runHipVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } } + +void 
SCAN::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  if ( vid == Base_HIP ) {
+
+    if (tune_idx == t) {
+
+      runHipVariantRocprim(vid);
+
+    }
+
+    t += 1;
+
+  }
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    if (tune_idx == t) {
+
+      runHipVariantImpl<default_gpu_block_size>(vid);
+
+    }
+
+    t += 1;
+
+  } else {
+
+    getCout() << "\n SCAN : Unknown Hip variant id = " << vid << std::endl;
+
+  }
+}
+
+void SCAN::setHipTuningDefinitions(VariantID vid)
+{
+  if ( vid == Base_HIP ) {
+
+    addVariantTuningName(vid, "rocprim");
+
+  }
+
+  if ( vid == Base_HIP || vid == RAJA_HIP ) {
+
+    addVariantTuningName(vid, "default");
+
+  }
+}
+
 } // end namespace algorithm
 } // end namespace rajaperf

diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp
index 519789a55..8db3ef02e 100644
--- a/src/algorithm/SCAN.hpp
+++ b/src/algorithm/SCAN.hpp
@@ -62,8 +62,18 @@ class SCAN : public KernelBase
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
 
+  void setCudaTuningDefinitions(VariantID vid);
+  void setHipTuningDefinitions(VariantID vid);
+  void runCudaVariantCub(VariantID vid);
+  void runHipVariantRocprim(VariantID vid);
+  template < size_t block_size >
+  void runCudaVariantImpl(VariantID vid);
+  template < size_t block_size >
+  void runHipVariantImpl(VariantID vid);
+
 private:
-  static const size_t default_gpu_block_size = 0;
+  static const size_t default_gpu_block_size = 256;
+  using gpu_block_sizes_type = integer::list_type<default_gpu_block_size>;
 
   Real_ptr m_x;
   Real_ptr m_y;

From 6523e4aeba54f38d45c29df6551761b1d3d3a9f5 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Wed, 14 Feb 2024 09:11:07 -0800
Subject: [PATCH 259/454] Add items_per_thread tuning parameter

This lets you tune the GPU scan implementation. The candidate values
must be set at configure time by setting
RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD to a comma-separated list of
integers. If unset, a default value is used; this default should vary
based on device, but currently there is one value for CUDA and one for
HIP. Different items_per_thread tunings can then be selected to run via
the command line option --items_per_thread.
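
As a rough sketch of how the tuning expansion behaves (illustrative
only, not code from this patch; run_scan_tuning and run_all_tunings are
made-up stand-ins for runCudaVariantImpl and the seq_for loops in the
diff below), each configured value produces one compiled tuning:

    #include <cstddef>
    #include <iostream>
    #include <utility>

    template < size_t items_per_thread >
    void run_scan_tuning()  // stand-in for runCudaVariantImpl<block_size, items_per_thread>
    {
      std::cout << "itemsPerThread_" << items_per_thread << "\n";
    }

    template < size_t... ipts >
    void run_all_tunings(std::index_sequence<ipts...>)  // stand-in for seq_for over the list
    {
      (run_scan_tuning<ipts>(), ...);  // C++17 fold: one instantiation per value
    }

    int main()
    {
      // as if configured with RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=1,2,4,8
      run_all_tunings(std::index_sequence<1, 2, 4, 8>{});
      return 0;
    }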
---
 CMakeLists.txt                   |   9 ++
 docs/sphinx/user_guide/build.rst |  17 +++
 src/algorithm/SCAN-Cuda.cpp      |  98 +++++++++----
 src/algorithm/SCAN-Hip.cpp       |  96 ++++++++----
 src/algorithm/SCAN.hpp           |  10 +-
 src/basic/INDEXLIST-Cuda.cpp     |  82 ++++++++++-
 src/basic/INDEXLIST-Hip.cpp      |  82 ++++++++++-
 src/basic/INDEXLIST.hpp          |   6 +-
 src/common/CudaGridScan.hpp      | 245 ++++++++++++++++++-------------
 src/common/Executor.cpp          |   3 +
 src/common/GPUUtils.hpp          |  19 +++
 src/common/HipGridScan.hpp       | 245 ++++++++++++++++++-------------
 src/common/RunParams.cpp         |  44 ++++++
 src/common/RunParams.hpp         |  11 ++
 src/rajaperf_config.hpp.in       |   4 +
 15 files changed, 687 insertions(+), 284 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 677acb095..3378a0e80 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,8 @@ set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU b
 set(RAJA_PERFSUITE_ATOMIC_REPLICATIONS "" CACHE STRING "Comma separated list of atomic replications, ex '1,256,4096'")
 
+set(RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD "" CACHE STRING "Comma separated list of GPU items per thread, ex '1,2,4,8'")
+
 set(RAJA_RANGE_ALIGN 4)
 set(RAJA_RANGE_MIN_LENGTH 32)
 set(RAJA_DATA_ALIGN 64)
@@ -95,6 +97,13 @@ else()
   message(STATUS "Using default atomic replication(s)")
 endif()
 
+string(LENGTH "${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}" GPU_ITEMS_PER_THREAD_LENGTH)
+if (GPU_ITEMS_PER_THREAD_LENGTH GREATER 0)
+  message(STATUS "Using gpu items per thread(s): ${RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD}")
+else()
+  message(STATUS "Using default gpu items per thread(s)")
+endif()
+
 # exclude RAJA make targets from top-level build...
 add_subdirectory(tpl/RAJA)
 
diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst
index 8601c555c..372f495e9 100644
--- a/docs/sphinx/user_guide/build.rst
+++ b/docs/sphinx/user_guide/build.rst
@@ -229,6 +229,23 @@ replication amounts. The CMake option for this is
 will build versions of GPU kernels that use 1, 256, and 4096 atomic
 replications.
 
+Building with specific GPU items per thread tunings
+-----------------------------------------------------
+
+If desired, you can build a version of the RAJA Performance Suite code with
+multiple versions of GPU kernels that will run with different GPU items per
+thread amounts. The CMake option for this is
+``-DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=``. For example::
+
+  $ mkdir my-gpu-build
+  $ cd my-gpu-build
+  $ cmake \
+    -DRAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=1,2,4,8 \
+    ..
+  $ make -j
+
+will build versions of GPU kernels that use 1, 2, 4, and 8 items per thread.
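+
+Each tuning built this way appears in suite output under a name that encodes
+both values, for example ``block_256_itemsPerThread_4``. A particular tuning
+can also be selected at run time; assuming the option spelling added in this
+patch, an invocation might look like::
+
+  $ ./bin/raja-perf.exe --items_per_thread 4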
+ Building with Caliper --------------------- diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index d3500d47e..894b5212d 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -25,6 +25,12 @@ namespace rajaperf namespace algorithm { +template < size_t block_size > +using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::cuda::grid_scan_default_items_per_thread, + integer::LessEqual::value>>; + + template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void scan(Real_ptr x, @@ -52,7 +58,7 @@ __global__ void scan(Real_ptr x, Real_type exclusives[items_per_thread]; Real_type inclusives[items_per_thread]; - detail::cuda::grid_scan( + detail::cuda::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -64,7 +70,7 @@ __global__ void scan(Real_ptr x, } -void SCAN::runCudaVariantCub(VariantID vid) +void SCAN::runCudaVariantLibrary(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -119,12 +125,22 @@ void SCAN::runCudaVariantCub(VariantID vid) // Free temporary storage deallocData(DataSpace::CudaDevice, temp_storage); + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan< RAJA::cuda_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS); + + } + stopTimer(); + } else { getCout() << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; } } -template < size_t block_size > +template < size_t block_size, size_t items_per_thread > void SCAN::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -137,7 +153,7 @@ void SCAN::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::cuda::grid_scan_items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; Real_ptr block_counts; @@ -152,7 +168,7 @@ void SCAN::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchCudaKernel( (scan), + RPlaunchCudaKernel( (scan), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, y+ibegin, @@ -166,16 +182,6 @@ void SCAN::runCudaVariantImpl(VariantID vid) deallocData(DataSpace::CudaDevice, grid_counts); deallocData(DataSpace::CudaDevice, block_readys); - } else if ( vid == RAJA_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::exclusive_scan< RAJA::cuda_exec >(res, RAJA_SCAN_ARGS); - - } - stopTimer(); - } else { getCout() << "\n SCAN : Unknown Cuda variant id = " << vid << std::endl; } @@ -186,27 +192,46 @@ void SCAN::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - if ( vid == Base_CUDA ) { + + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { if (tune_idx == t) { - runCudaVariantCub(vid); + runCudaVariantLibrary(vid); } t += 1; - } + if ( vid == Base_CUDA ) { - if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (tune_idx == t) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { - runCudaVariantImpl(vid); + seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { - } + if (run_params.numValidItemsPerThread() == 0u || + 
run_params.validItemsPerThread(block_size)) { - t += 1; + if (tune_idx == t) { + + runCudaVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } } else { @@ -217,15 +242,34 @@ void SCAN::runCudaVariant(VariantID vid, size_t tune_idx) void SCAN::setCudaTuningDefinitions(VariantID vid) { - if ( vid == Base_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { addVariantTuningName(vid, "cub"); - } + if ( vid == Base_CUDA ) { - if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "default"); + addVariantTuningName(vid, "block_"+std::to_string(block_size)+ + "_itemsPerThread_"+std::to_string(items_per_thread)); + + } + + }); + + } + + }); + + } } } diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 9723d46a4..6a6d54b2e 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -30,6 +30,12 @@ namespace rajaperf namespace algorithm { +template < size_t block_size > +using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::hip::grid_scan_default_items_per_thread, + integer::LessEqual::value>>; + + template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void scan(Real_ptr x, @@ -57,7 +63,7 @@ __global__ void scan(Real_ptr x, Real_type exclusives[items_per_thread]; Real_type inclusives[items_per_thread]; - detail::hip::grid_scan( + detail::hip::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -69,7 +75,7 @@ __global__ void scan(Real_ptr x, } -void SCAN::runHipVariantRocprim(VariantID vid) +void SCAN::runHipVariantLibrary(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -146,12 +152,22 @@ void SCAN::runHipVariantRocprim(VariantID vid) // Free temporary storage deallocData(DataSpace::HipDevice, temp_storage); + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::exclusive_scan< RAJA::hip_exec<0, true /*async*/> >(res, RAJA_SCAN_ARGS); + + } + stopTimer(); + } else { getCout() << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; } } -template < size_t block_size > +template < size_t block_size, size_t items_per_thread > void SCAN::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -164,7 +180,7 @@ void SCAN::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::hip::grid_scan_items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; Real_ptr block_counts; @@ -180,7 +196,7 @@ void SCAN::runHipVariantImpl(VariantID vid) hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchHipKernel( (scan), + RPlaunchHipKernel( (scan), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, y+ibegin, @@ -194,16 +210,6 @@ void SCAN::runHipVariantImpl(VariantID vid) deallocData(DataSpace::HipDevice, grid_counts); deallocData(DataSpace::HipDevice, block_readys); - } else if ( 
vid == RAJA_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::exclusive_scan< RAJA::hip_exec >(res, RAJA_SCAN_ARGS); - - } - stopTimer(); - } else { getCout() << "\n SCAN : Unknown Hip variant id = " << vid << std::endl; } @@ -214,27 +220,44 @@ void SCAN::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - if ( vid == Base_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { if (tune_idx == t) { - runHipVariantRocprim(vid); + runHipVariantLibrary(vid); } t += 1; - } + if ( vid == Base_HIP ) { - if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - if (tune_idx == t) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { - runHipVariantImpl(vid); + seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { - } + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { - t += 1; + if (tune_idx == t) { + + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + } } else { @@ -245,15 +268,34 @@ void SCAN::runHipVariant(VariantID vid, size_t tune_idx) void SCAN::setHipTuningDefinitions(VariantID vid) { - if ( vid == Base_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { addVariantTuningName(vid, "rocprim"); - } + if ( vid == Base_HIP ) { - if ( vid == Base_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "default"); + addVariantTuningName(vid, "block_"+std::to_string(block_size)+ + "_itemsPerThread_"+std::to_string(items_per_thread)); + + } + + }); + + } + + }); + + } } } diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index 8db3ef02e..102e02981 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -64,16 +64,16 @@ class SCAN : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - void runCudaVariantCub(VariantID vid); - void runHipVariantRocprim(VariantID vid); - template < size_t block_size > + void runCudaVariantLibrary(VariantID vid); + void runHipVariantLibrary(VariantID vid); + template < size_t block_size, size_t items_per_thread > void runCudaVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t block_size, size_t items_per_thread > void runHipVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = integer::list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 6cd0bce93..71ad5d503 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -22,6 +22,12 @@ namespace rajaperf namespace basic { +template < size_t block_size > +using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::cuda::grid_scan_default_items_per_thread, + integer::LessEqual::value>>; + + template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void indexlist(Real_ptr x, @@ -52,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type 
inclusives[items_per_thread]; - detail::cuda::grid_scan( + detail::cuda::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -71,7 +77,7 @@ __global__ void indexlist(Real_ptr x, } -template < size_t block_size > +template < size_t block_size, size_t items_per_thread > void INDEXLIST::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -84,7 +90,7 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) if ( vid == Base_CUDA ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::cuda::grid_scan_items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; Index_type* len; @@ -101,7 +107,7 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) cudaErrchk( cudaMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchCudaKernel( (indexlist), + RPlaunchCudaKernel( (indexlist), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, list+ibegin, @@ -124,7 +130,73 @@ void INDEXLIST::runCudaVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Cuda) + +void INDEXLIST::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runCudaVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Cuda variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)+ + "_itemsPerThread_"+std::to_string(items_per_thread)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 84c5d36ea..5c5b4116f 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -22,6 +22,12 @@ namespace rajaperf namespace basic { +template < size_t block_size > +using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< + detail::hip::grid_scan_default_items_per_thread, + integer::LessEqual::value>>; + + template < size_t block_size, size_t items_per_thread > __launch_bounds__(block_size) __global__ void indexlist(Real_ptr x, @@ -52,7 +58,7 @@ __global__ void indexlist(Real_ptr x, Index_type exclusives[items_per_thread]; Index_type inclusives[items_per_thread]; - detail::hip::grid_scan( + detail::hip::GridScan::grid_scan( block_id, vals, exclusives, inclusives, block_counts, grid_counts, block_readys); for (size_t ti = 0; ti < items_per_thread; ++ti) { @@ -70,7 +76,7 @@ __global__ void indexlist(Real_ptr x, } } -template < size_t block_size > +template < size_t 
block_size, size_t items_per_thread > void INDEXLIST::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -83,7 +89,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) if ( vid == Base_HIP ) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*detail::hip::grid_scan_items_per_thread); + const size_t grid_size = RAJA_DIVIDE_CEILING_INT((iend-ibegin), block_size*items_per_thread); const size_t shmem_size = 0; Index_type* len; @@ -101,7 +107,7 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) hipErrchk( hipMemsetAsync(block_readys, 0, sizeof(unsigned)*grid_size, res.get_stream()) ); - RPlaunchHipKernel( (indexlist), + RPlaunchHipKernel( (indexlist), grid_size, block_size, shmem_size, res.get_stream(), x+ibegin, list+ibegin, @@ -124,7 +130,73 @@ void INDEXLIST::runHipVariantImpl(VariantID vid) } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(INDEXLIST, Hip) + +void INDEXLIST::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + if (tune_idx == t) { + + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n INDEXLIST : Unknown Hip variant id = " << vid << std::endl; + + } +} + +void INDEXLIST::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + + if (run_params.numValidItemsPerThread() == 0u || + run_params.validItemsPerThread(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)+ + "_itemsPerThread_"+std::to_string(items_per_thread)); + + } + + }); + + } + + }); + + } +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index efcf94aa8..0373da31b 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -63,14 +63,14 @@ class INDEXLIST : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > + template < size_t block_size, size_t items_per_thread > void runCudaVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t block_size, size_t items_per_thread > void runHipVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = integer::list_type; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; Real_ptr m_x; Int_ptr m_list; diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp index cd237d7ad..2b7d386ba 100644 --- a/src/common/CudaGridScan.hpp +++ b/src/common/CudaGridScan.hpp @@ -24,28 +24,19 @@ namespace cuda // Define magic numbers for CUDA execution // const size_t warp_size = 32; -const size_t grid_scan_items_per_thread = 15; +const size_t max_static_shmem = 49154; + +// grid scan tunings are in (block_size, items_per_thread) +// these tunings maximize throughput while minimizing items_per_thread +// sm_70: (64, 13), (128, 9), (256, 6), (512, 5), (1024, 5) +const size_t 
grid_scan_default_items_per_thread = 7; // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread, typename DataType > -__device__ void grid_scan(const int block_id, - DataType (&val)[items_per_thread], - DataType (&exclusive)[items_per_thread], - DataType (&inclusive)[items_per_thread], - DataType* block_counts, - DataType* grid_counts, - unsigned* block_readys) +template < typename DataType, size_t block_size, size_t items_per_thread > +struct GridScan { - const bool first_block = (block_id == 0); - const bool last_block = (block_id == gridDim.x-1); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned warp_index_mask = (1u << warp_index); - const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); - using BlockScan = cub::BlockScan; //, cub::BLOCK_SCAN_WARP_SCANS>; using BlockExchange = cub::BlockExchange; using WarpReduce = cub::WarpReduce; @@ -56,136 +47,178 @@ __device__ void grid_scan(const int block_id, typename WarpReduce::TempStorage warp_reduce_storage; volatile DataType prev_grid_count; }; - __shared__ SharedStorage s_temp_storage; + static constexpr size_t shmem_size = sizeof(SharedStorage); + + __device__ + static void grid_scan(const int block_id, + DataType (&val)[items_per_thread], + DataType (&exclusive)[items_per_thread], + DataType (&inclusive)[items_per_thread], + DataType* block_counts, + DataType* grid_counts, + unsigned* block_readys) + { + const bool first_block = (block_id == 0); + const bool last_block = (block_id == gridDim.x-1); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned warp_index_mask = (1u << warp_index); + const unsigned warp_index_mask_right = warp_index_mask | (warp_index_mask - 1u); + + __shared__ SharedStorage s_temp_storage; + + + BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); + __syncthreads(); - BlockExchange(s_temp_storage.block_exchange_storage).StripedToBlocked(val); - __syncthreads(); + BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); + __syncthreads(); - BlockScan(s_temp_storage.block_scan_storage).ExclusiveSum(val, exclusive); - __syncthreads(); + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); + __syncthreads(); + BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); + __syncthreads(); + if (first_block) { - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(exclusive); - __syncthreads(); - BlockExchange(s_temp_storage.block_exchange_storage).BlockedToStriped(inclusive); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write 
block_counts, grid_counts are ready - } + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } - } else { + } else { - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } - // get prev_grid_count using last warp in block - if (last_warp) { + // get prev_grid_count using last warp in block + if (last_warp) { - DataType prev_grid_count = 0; + DataType prev_grid_count = 0; - // accumulate previous block counts into registers of warp + // accumulate previous block counts into registers of warp - int prev_block_base_id = block_id - warp_size; + int prev_block_base_id = block_id - warp_size; - unsigned prev_block_ready = 0u; - unsigned prev_blocks_ready_ballot = 0u; - unsigned prev_grids_ready_ballot = 0u; + unsigned prev_block_ready = 0u; + unsigned prev_blocks_ready_ballot = 0u; + unsigned prev_grids_ready_ballot = 0u; - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { - const int prev_block_id = prev_block_base_id + warp_index; + const int prev_block_id = prev_block_base_id + warp_index; - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - } while (prev_blocks_ready_ballot != 0xffffffffu); + } while (prev_blocks_ready_ballot != 0xffffffffu); - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - if (prev_grids_ready_ballot != 0u) { - break; - } + if (prev_grids_ready_ballot != 0u) { + break; + } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) + __threadfence(); // ensure block_counts or grid_counts ready (acquire) - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; - prev_block_ready = 0u; + prev_block_ready = 0u; - prev_block_base_id -= warp_size; - } + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; - const int prev_block_id = prev_block_base_id + warp_index; + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks 
above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) - prev_blocks_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot_sync(0xffffffffu, prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. - if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. + if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } - prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); - prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + prev_grid_count = WarpReduce(s_temp_storage.warp_reduce_storage).Sum(prev_grid_count); + prev_grid_count = __shfl_sync(0xffffffffu, prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - if (last_thread) { + if (last_thread) { - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } - s_temp_storage.prev_grid_count = prev_grid_count; + s_temp_storage.prev_grid_count = prev_grid_count; + } } - } - __syncthreads(); - DataType prev_grid_count = s_temp_storage.prev_grid_count; + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + 
inclusive[ti]; + } } } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + } +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + } // end namespace cuda } // end namespace detail } // end namespace rajaperf diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index bf6b7ddf2..fb9af5046 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -174,6 +174,9 @@ Executor::Executor(int argc, char** argv) if (cc.adiak_atomic_replications.size() > 0) { adiak::value("atomic_replications", cc.adiak_atomic_replications); } + if (cc.adiak_gpu_items_per_thread.size() > 0) { + adiak::value("gpu_items_per_thread", cc.adiak_gpu_items_per_thread); + } if (cc.adiak_raja_hipcc_flags.size() > 0) { adiak::value("raja_hipcc_flags", cc.adiak_raja_hipcc_flags); } diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index d11b3e58e..f706a292b 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -125,6 +125,12 @@ struct ExactSqrt static constexpr bool valid(size_t i) { return sqrt(i)*sqrt(i) == i; } }; +template < size_t N > +struct LessEqual +{ + static constexpr bool valid(size_t i) { return i <= N; } +}; + // A camp::list of camp::integral_constant types. // If gpu_block_sizes from the configuration is not empty it is those gpu_block_sizes, // otherwise it is a list containing just default_block_size. @@ -151,6 +157,19 @@ using make_atomic_replication_list_type = >::type >::type; +// A camp::list of camp::integral_constant types. +// If gpu_items_per_thread from the configuration is not empty it is those gpu_items_per_thread, +// otherwise it is a list containing just default_gpu_items_per_thread. +// Invalid entries are removed according to validity_checker in either case. 
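+// For example, a build configured with RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD=1,2,4,8
+// combined with validity_checker = LessEqual<4> is intended to yield
+//   camp::list<camp::integral_constant<size_t,1>,
+//              camp::integral_constant<size_t,2>,
+//              camp::integral_constant<size_t,4>>.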
+template < size_t default_gpu_items_per_thread, typename validity_checker = AllowAny > +using make_gpu_items_per_thread_list_type = + typename detail::remove_invalid::value > 0), + rajaperf::configuration::gpu_items_per_thread, + list_type + >::type + >::type; + } // closing brace for integer namespace namespace gpu_algorithm { diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp index 9918776e1..7f94f66c3 100644 --- a/src/common/HipGridScan.hpp +++ b/src/common/HipGridScan.hpp @@ -26,28 +26,19 @@ namespace hip // Define magic numbers for HIP execution // const size_t warp_size = 64; -const size_t grid_scan_items_per_thread = 8; +const size_t max_static_shmem = 65536; + +// grid scan tunings are in (block_size, items_per_thread) +// these tunings maximize throughput while minimizing items_per_thread +// gfx90a: (64, 6), (128, 4), (256, 4), (512, 4), (1024, 2) +const size_t grid_scan_default_items_per_thread = 4; // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space -template < size_t block_size, size_t items_per_thread, typename DataType > -__device__ void grid_scan(const int block_id, - DataType (&val)[items_per_thread], - DataType (&exclusive)[items_per_thread], - DataType (&inclusive)[items_per_thread], - DataType* block_counts, - DataType* grid_counts, - unsigned* block_readys) +template < typename DataType, size_t block_size, size_t items_per_thread > +struct GridScan { - const bool first_block = (block_id == 0); - const bool last_block = (block_id == static_cast(gridDim.x-1)); - const bool last_thread = (threadIdx.x == block_size-1); - const bool last_warp = (threadIdx.x >= block_size - warp_size); - const int warp_index = (threadIdx.x % warp_size); - const unsigned long long warp_index_mask = (1ull << warp_index); - const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); - using BlockScan = rocprim::block_scan; //, rocprim::block_scan_algorithm::reduce_then_scan>; using BlockExchange = rocprim::block_exchange; using WarpReduce = rocprim::warp_reduce; @@ -58,136 +49,178 @@ __device__ void grid_scan(const int block_id, typename WarpReduce::storage_type warp_reduce_storage; volatile DataType prev_grid_count; }; - __shared__ SharedStorage s_temp_storage; + static constexpr size_t shmem_size = sizeof(SharedStorage); + + __device__ + static void grid_scan(const int block_id, + DataType (&val)[items_per_thread], + DataType (&exclusive)[items_per_thread], + DataType (&inclusive)[items_per_thread], + DataType* block_counts, + DataType* grid_counts, + unsigned* block_readys) + { + const bool first_block = (block_id == 0); + const bool last_block = (block_id == static_cast(gridDim.x-1)); + const bool last_thread = (threadIdx.x == block_size-1); + const bool last_warp = (threadIdx.x >= block_size - warp_size); + const int warp_index = (threadIdx.x % warp_size); + const unsigned long long warp_index_mask = (1ull << warp_index); + const unsigned long long warp_index_mask_right = warp_index_mask | (warp_index_mask - 1ull); + + __shared__ SharedStorage s_temp_storage; + + + BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); + __syncthreads(); - BlockExchange().striped_to_blocked(val, val, s_temp_storage.block_exchange_storage); - __syncthreads(); + BlockScan().exclusive_scan(val, exclusive, DataType{0}, s_temp_storage.block_scan_storage); + __syncthreads(); - BlockScan().exclusive_scan(val, exclusive, DataType{0}, 
s_temp_storage.block_scan_storage); - __syncthreads(); + for (size_t ti = 0; ti < items_per_thread; ++ti) { + inclusive[ti] = exclusive[ti] + val[ti]; + } - for (size_t ti = 0; ti < items_per_thread; ++ti) { - inclusive[ti] = exclusive[ti] + val[ti]; - } + BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); + __syncthreads(); + if (first_block) { - BlockExchange().blocked_to_striped(exclusive, exclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - BlockExchange().blocked_to_striped(inclusive, inclusive, s_temp_storage.block_exchange_storage); - __syncthreads(); - if (first_block) { - - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure block_counts, grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready - } + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + grid_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure block_counts, grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write block_counts, grid_counts are ready + } - } else { + } else { - if (!last_block && last_thread) { - block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block - __threadfence(); // ensure block_counts ready (release) - atomicExch(&block_readys[block_id], 1u); // write block_counts is ready - } + if (!last_block && last_thread) { + block_counts[block_id] = inclusive[items_per_thread-1]; // write inclusive scan result for block + __threadfence(); // ensure block_counts ready (release) + atomicExch(&block_readys[block_id], 1u); // write block_counts is ready + } - // get prev_grid_count using last warp in block - if (last_warp) { + // get prev_grid_count using last warp in block + if (last_warp) { - DataType prev_grid_count = 0; + DataType prev_grid_count = 0; - // accumulate previous block counts into registers of warp + // accumulate previous block counts into registers of warp - int prev_block_base_id = block_id - warp_size; + int prev_block_base_id = block_id - warp_size; - unsigned prev_block_ready = 0u; - unsigned long long prev_blocks_ready_ballot = 0ull; - unsigned long long prev_grids_ready_ballot = 0ull; + unsigned prev_block_ready = 0u; + unsigned long long prev_blocks_ready_ballot = 0ull; + unsigned long long prev_grids_ready_ballot = 0ull; - // accumulate full warp worths of block counts - // stop if run out of full warps of a grid count is ready - while (prev_block_base_id >= 0) { + // accumulate full warp worths of block counts + // stop if run out of full warps of a grid count is ready + while (prev_block_base_id >= 0) { - const int prev_block_id = prev_block_base_id + warp_index; + const int prev_block_id = prev_block_base_id + warp_index; - // ensure previous block_counts are ready - do { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + // ensure previous block_counts are ready + do { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); - prev_blocks_ready_ballot = 
__ballot(prev_block_ready >= 1u); + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); + } while (prev_blocks_ready_ballot != 0xffffffffffffffffull); - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - if (prev_grids_ready_ballot != 0ull) { - break; - } + if (prev_grids_ready_ballot != 0ull) { + break; + } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) + __threadfence(); // ensure block_counts or grid_counts ready (acquire) - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; - prev_block_ready = 0u; + prev_block_ready = 0u; - prev_block_base_id -= warp_size; - } + prev_block_base_id -= warp_size; + } + + const int prev_block_id = prev_block_base_id + warp_index; - const int prev_block_id = prev_block_base_id + warp_index; + // ensure previous block_counts are ready + // this checks that block counts is ready for all blocks above + // the highest grid count that is ready + while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { - // ensure previous block_counts are ready - // this checks that block counts is ready for all blocks above - // the highest grid count that is ready - while (~prev_blocks_ready_ballot >= prev_grids_ready_ballot) { + if (prev_block_id >= 0) { + prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + } - if (prev_block_id >= 0) { - prev_block_ready = atomicCAS(&block_readys[prev_block_id], 11u, 11u); + prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); + prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); } + __threadfence(); // ensure block_counts or grid_counts ready (acquire) - prev_blocks_ready_ballot = __ballot(prev_block_ready >= 1u); - prev_grids_ready_ballot = __ballot(prev_block_ready == 2u); - } - __threadfence(); // ensure block_counts or grid_counts ready (acquire) - - // read one grid_count from a block with id grid_count_ready_id - // and read the block_counts from blocks with higher ids. - if (warp_index_mask > prev_grids_ready_ballot) { - // accumulate block_counts for prev_block_id - prev_grid_count += block_counts[prev_block_id]; - } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { - // accumulate grid_count for grid_count_ready_id - prev_grid_count += grid_counts[prev_block_id]; - } + // read one grid_count from a block with id grid_count_ready_id + // and read the block_counts from blocks with higher ids. 
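+      // Selection logic: prev_grids_ready_ballot has one bit set per warp
+      // lane whose block has published a grid_count. The lane holding the
+      // highest set bit adds grid_counts for its block (the running total
+      // through that block); lanes above it add their block_counts; lanes
+      // below it add nothing, so each earlier block is counted exactly once.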
+ if (warp_index_mask > prev_grids_ready_ballot) { + // accumulate block_counts for prev_block_id + prev_grid_count += block_counts[prev_block_id]; + } else if (prev_grids_ready_ballot == (prev_grids_ready_ballot & warp_index_mask_right)) { + // accumulate grid_count for grid_count_ready_id + prev_grid_count += grid_counts[prev_block_id]; + } - WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); - prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp + WarpReduce().reduce(prev_grid_count, prev_grid_count, s_temp_storage.warp_reduce_storage); + prev_grid_count = __shfl(prev_grid_count, 0, warp_size); // broadcast output to all threads in warp - if (last_thread) { + if (last_thread) { - if (!last_block) { - grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block - __threadfence(); // ensure grid_counts ready (release) - atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready - } + if (!last_block) { + grid_counts[block_id] = prev_grid_count + inclusive[items_per_thread-1]; // write inclusive scan result for grid through block + __threadfence(); // ensure grid_counts ready (release) + atomicExch(&block_readys[block_id], 2u); // write grid_counts is ready + } - s_temp_storage.prev_grid_count = prev_grid_count; + s_temp_storage.prev_grid_count = prev_grid_count; + } } - } - __syncthreads(); - DataType prev_grid_count = s_temp_storage.prev_grid_count; + __syncthreads(); + DataType prev_grid_count = s_temp_storage.prev_grid_count; - for (size_t ti = 0; ti < items_per_thread; ++ti) { - exclusive[ti] = prev_grid_count + exclusive[ti]; - inclusive[ti] = prev_grid_count + inclusive[ti]; + for (size_t ti = 0; ti < items_per_thread; ++ti) { + exclusive[ti] = prev_grid_count + exclusive[ti]; + inclusive[ti] = prev_grid_count + inclusive[ti]; + } } } + +}; + + +namespace detail +{ + +template < typename T, size_t block_size, size_t max_items_per_thread > +struct grid_scan_max_items_per_thread + : std::conditional_t< (GridScan::shmem_size <= max_static_shmem), + grid_scan_max_items_per_thread, + std::integral_constant > +{ +}; + +} +template < typename T, size_t block_size > +struct grid_scan_max_items_per_thread + : detail::grid_scan_max_items_per_thread +{ +}; + } // end namespace hip } // end namespace detail } // end namespace rajaperf diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 4d1e90df9..734ad9e55 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -41,6 +41,7 @@ RunParams::RunParams(int argc, char** argv) gpu_stream(1), gpu_block_sizes(), atomic_replications(), + items_per_threads(), mpi_size(1), mpi_rank(0), mpi_3d_division({-1, -1, -1}), @@ -128,6 +129,10 @@ void RunParams::print(std::ostream& str) const for (size_t j = 0; j < atomic_replications.size(); ++j) { str << "\n\t" << atomic_replications[j]; } + str << "\n items_per_threads = "; + for (size_t j = 0; j < items_per_threads.size(); ++j) { + str << "\n\t" << items_per_threads[j]; + } str << "\n mpi_size = " << mpi_size; str << "\n mpi_3d_division = "; for (size_t j = 0; j < 3; ++j) { @@ -501,6 +506,37 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--items_per_thread") ) { + + bool got_something = false; + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else {
+ got_something = true; + int items_per_thread = ::atoi( opt.c_str() ); + if ( items_per_thread <= 0 ) { + getCout() << "\nBad input:" + << " must give --items_per_thread POSITIVE values (int)" + << std::endl; + input_state = BadInput; + } else { + items_per_threads.push_back(items_per_thread); + } + ++i; + } + } + if (!got_something) { + getCout() << "\nBad input:" + << " must give --items_per_thread one or more values (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--mpi_3d_division") ) { int num_got = 0; @@ -1121,6 +1157,14 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --atomic_replication 128 256 512 (runs kernels with atomic_replication 128, 256, and 512)\n\n"; + str << "\t --items_per_thread [no default]\n" + << "\t (items per thread to run for all GPU kernels)\n" + << "\t GPU kernels not supporting items_per_thread option will be skipped.\n" + << "\t Behavior depends on kernel implementations and \n" + << "\t values given via CMake variable RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD.\n"; str << "\t\t Example...\n" << "\t\t --items_per_thread 128 256 512 (runs kernels with items_per_thread 128, 256, and 512)\n\n"; str << "\t --mpi_3d_division [no default]\n" << "\t (number of mpi ranks in each dimension in a 3d grid)\n" << "\t (3D MPI kernels will be skipped if the product of mpi_3d_division is not equal to the number of ranks)\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 6d58a1302..65e5c542b 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -144,6 +144,16 @@ class RunParams { } return false; } + size_t numValidItemsPerThread() const { return items_per_threads.size(); } + bool validItemsPerThread(size_t items_per_thread) const + { + for (size_t valid_items_per_thread : items_per_threads) { + if (valid_items_per_thread == items_per_thread) { + return true; + } + } + return false; + } int getMPISize() const { return mpi_size; } int getMPIRank() const { return mpi_rank; } @@ -244,6 +254,7 @@ class RunParams { int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ std::vector atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ + std::vector items_per_threads; /*!< Items per thread for gpu tunings to run (input option) */ int mpi_size; /*!< Number of MPI ranks */ int mpi_rank; /*!< Rank of this MPI process */ std::array mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */ diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 679e8ffba..655e9bd55 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -104,6 +104,7 @@ constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@"; constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHIECTURES@"; const std::vector adiak_gpu_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@}; const std::vector adiak_atomic_replications = {@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@}; +const std::vector adiak_gpu_items_per_thread = {@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@}; const std::vector adiak_raja_hipcc_flags = str_to_list(std::string("@RAJA_HIPCC_FLAGS@")); const adiak::catstring adiak_mpi_cxx_compiler = std::string("@MPI_CXX_COMPILER@"); const adiak::catstring adiak_systype_build = std::string("@RAJAPERF_BUILD_SYSTYPE@"); @@ -116,6 +117,9 @@ using gpu_block_sizes =
integer::list_type<@RAJA_PERFSUITE_GPU_BLOCKSIZES@>; // List of GPU atomic replications using atomic_replications = integer::list_type<@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@>; +// List of GPU items per thread +using gpu_items_per_thread = integer::list_type<@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@>; + // Name of user who ran code std::string user_run; From 9db8bc7710843c7bdd7719c8476d3bd5861545a2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 14 Feb 2024 09:27:28 -0800 Subject: [PATCH 260/454] add gfx942 tuning comment --- src/common/HipGridScan.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp index 7f94f66c3..4ad100437 100644 --- a/src/common/HipGridScan.hpp +++ b/src/common/HipGridScan.hpp @@ -31,6 +31,7 @@ const size_t max_static_shmem = 65536; // grid scan tunings are in (block_size, items_per_thread) // these tunings maximize throughput while minimizing items_per_thread // gfx90a: (64, 6), (128, 4), (256, 4), (512, 4), (1024, 2) +// gfx942: (64, 22), (128, 22), (256, 19), (512, 13), (1024, 7) const size_t grid_scan_default_items_per_thread = 4; From e4f13f200104d6ac2d1065221e35fb46fc1bf5cf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 16 Feb 2024 09:31:04 -0800 Subject: [PATCH 261/454] Encode tunings for block_size and arch grid_scan_default_items_per_thread is now a class that is specialized for different block_size and architectures. --- src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/basic/INDEXLIST-Cuda.cpp | 2 +- src/basic/INDEXLIST-Hip.cpp | 2 +- src/common/CudaGridScan.hpp | 24 +++++++++++++++++++---- src/common/HipGridScan.hpp | 37 +++++++++++++++++++++++++++++++----- 6 files changed, 56 insertions(+), 13 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 894b5212d..6dd1db0ec 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -27,7 +27,7 @@ namespace algorithm template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread, + detail::cuda::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 6a6d54b2e..9e4b56eb3 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -32,7 +32,7 @@ namespace algorithm template < size_t block_size > using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread, + detail::hip::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 71ad5d503..3b35403cd 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread, + detail::cuda::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index 5c5b4116f..ca51fb58e 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread, + 
detail::hip::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp index 2b7d386ba..79f52cdd9 100644 --- a/src/common/CudaGridScan.hpp +++ b/src/common/CudaGridScan.hpp @@ -26,10 +26,26 @@ namespace cuda const size_t warp_size = 32; const size_t max_static_shmem = 49154; -// grid scan tunings are in (block_size, items_per_thread) -// these tunings maximize throughput while minimizing items_per_thread -// sm_70: (64, 13), (128, 9), (256, 6), (512, 5), (1024, 5) -const size_t grid_scan_default_items_per_thread = 7; +const size_t default_arch = 700; + +// grid scan tunings that maximize throughput while minimizing items_per_thread +template < size_t block_size, size_t cuda_arch > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = 1; +}; + +// tuning for sm_70 +template < size_t block_size > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + (block_size <= 64) ? 13 : + (block_size <= 128) ? 9 : + (block_size <= 256) ? 6 : + (block_size <= 512) ? 5 : + (block_size <= 1024) ? 5 : 1; +}; // perform a grid scan on val and returns the result at each thread diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp index 4ad100437..d2294b7bb 100644 --- a/src/common/HipGridScan.hpp +++ b/src/common/HipGridScan.hpp @@ -28,11 +28,38 @@ namespace hip const size_t warp_size = 64; const size_t max_static_shmem = 65536; -// grid scan tunings are in (block_size, items_per_thread) -// these tunings maximize throughput while minimizing items_per_thread -// gfx90a: (64, 6), (128, 4), (256, 4), (512, 4), (1024, 2) -// gfx942: (64, 22), (128, 22), (256, 19), (512, 13), (1024, 7) -const size_t grid_scan_default_items_per_thread = 4; +const size_t default_arch = 910; + +// grid scan tunings that maximize throughput while minimizing items_per_thread +template < size_t block_size, size_t hip_arch > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = 1; +}; + +// tuning for gfx90a +template < size_t block_size > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + (block_size <= 64) ? 6 : + (block_size <= 128) ? 4 : + (block_size <= 256) ? 4 : + (block_size <= 512) ? 4 : + (block_size <= 1024) ? 2 : 1; +}; + +// tuning for gfx942 +template < size_t block_size > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + (block_size <= 64) ? 22 : + (block_size <= 128) ? 22 : + (block_size <= 256) ? 19 : + (block_size <= 512) ? 13 : + (block_size <= 1024) ? 7 : 1; +}; // perform a grid scan on val and returns the result at each thread From cc9bb8275f42a109ff152f89acc81da86f1f6440 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 16 Feb 2024 12:59:13 -0800 Subject: [PATCH 262/454] Add tuning gpu arch cmake option There is a separate option for cuda and hip RAJA_PERFSUITE_TUNING_CUDA_ARCH, and RAJA_PERFSUITE_TUNING_HIP_ARCH respectively. These are used internally to pick default tuning parameters at compile time based on an architecture. 
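In outline, the selection boils down to the standalone sketch below (default_items_per_thread and the arch value 700 for sm_70 are illustrative stand-ins patterned on grid_scan_default_items_per_thread and the define emitted through rajaperf_config.hpp.in, with the sm_70 numbers abbreviated from the tuning table above; this is not verbatim RAJAPerf code):

#include <cstddef>
#include <iostream>

// Normally injected by CMake; defaulted here so the sketch compiles standalone.
#ifndef RAJA_PERFSUITE_TUNING_CUDA_ARCH
#define RAJA_PERFSUITE_TUNING_CUDA_ARCH 700
#endif

// fallback tuning used when no specialization matches the requested arch
template < std::size_t block_size, std::size_t gpu_arch >
struct default_items_per_thread { static constexpr std::size_t value = 1; };

// arch-specific tuning picked at compile time when gpu_arch encodes sm_70
template < std::size_t block_size >
struct default_items_per_thread< block_size, 700 >
{
  static constexpr std::size_t value = (block_size <= 64)  ? 13 :
                                       (block_size <= 128) ?  9 :
                                       (block_size <= 256) ?  6 : 5;
};

int main()
{
  constexpr std::size_t ipt =
      default_items_per_thread< 128, RAJA_PERFSUITE_TUNING_CUDA_ARCH >::value;
  std::cout << "items_per_thread tuning: " << ipt << "\n";  // prints 9
  return 0;
}

Configuring with -DRAJA_PERFSUITE_TUNING_CUDA_ARCH=700 (or the matching HIP option) then bakes the corresponding tunings into every scan kernel, while an unrecognized arch falls back to the generic default.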
--- CMakeLists.txt | 17 +++++++++ src/algorithm/SCAN-Cuda.cpp | 2 +- src/algorithm/SCAN-Hip.cpp | 2 +- src/basic/INDEXLIST-Cuda.cpp | 2 +- src/basic/INDEXLIST-Hip.cpp | 2 +- src/common/CudaGridScan.hpp | 45 ++++++++++++----------- src/common/Executor.cpp | 6 ++++ src/common/HipGridScan.hpp | 70 +++++++++++++++++++----------------- src/rajaperf_config.hpp.in | 7 ++++ 9 files changed, 95 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3378a0e80..5e19a3736 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,9 @@ set(ENABLE_TBB Off CACHE BOOL "") set(RAJA_USE_CHRONO On CACHE BOOL "") +set(RAJA_PERFSUITE_TUNING_CUDA_ARCH "0" CACHE STRING "CUDA arch to tune the execution for, ex '700' for sm_70") +set(RAJA_PERFSUITE_TUNING_HIP_ARCH "0" CACHE STRING "HIP arch to tune the execution for, ex '910' for gfx90a, '942' for gfx942") + set(RAJA_PERFSUITE_GPU_BLOCKSIZES "" CACHE STRING "Comma separated list of GPU block sizes, ex '256,1024'") set(RAJA_PERFSUITE_ATOMIC_REPLICATIONS "" CACHE STRING "Comma separated list of atomic replications, ex '1,256,4096'") @@ -83,6 +86,20 @@ set(RAJA_RANGE_ALIGN 4) set(RAJA_RANGE_MIN_LENGTH 32) set(RAJA_DATA_ALIGN 64) +string(LENGTH "${RAJA_PERFSUITE_TUNING_CUDA_ARCH}" CUDA_ARCH_LENGTH) +if (CUDA_ARCH_LENGTH GREATER 1) + message(STATUS "Using cuda tunings for arch: ${RAJA_PERFSUITE_TUNING_CUDA_ARCH}") +else() + message(STATUS "Using default cuda arch tunings") +endif() + +string(LENGTH "${RAJA_PERFSUITE_TUNING_HIP_ARCH}" HIP_ARCH_LENGTH) +if (HIP_ARCH_LENGTH GREATER 1) + message(STATUS "Using hip tunings for arch: ${RAJA_PERFSUITE_TUNING_HIP_ARCH}") +else() + message(STATUS "Using default hip arch tunings") +endif() + string(LENGTH "${RAJA_PERFSUITE_GPU_BLOCKSIZES}" BLOCKSIZES_LENGTH) if (BLOCKSIZES_LENGTH GREATER 0) message(STATUS "Using gpu block size(s): ${RAJA_PERFSUITE_GPU_BLOCKSIZES}") diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 6dd1db0ec..29715478e 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -27,7 +27,7 @@ namespace algorithm template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread::value, + detail::cuda::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 9e4b56eb3..9262b2be8 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -32,7 +32,7 @@ namespace algorithm template < size_t block_size > using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread::value, + detail::hip::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index 3b35403cd..b331d5a7c 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread::value, + detail::cuda::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index ca51fb58e..baa164e3c 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using 
hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread::value, + detail::hip::grid_scan_default_items_per_thread::value, integer::LessEqual::value>>; diff --git a/src/common/CudaGridScan.hpp b/src/common/CudaGridScan.hpp index 79f52cdd9..f2c8f2cd1 100644 --- a/src/common/CudaGridScan.hpp +++ b/src/common/CudaGridScan.hpp @@ -26,27 +26,6 @@ namespace cuda const size_t warp_size = 32; const size_t max_static_shmem = 49154; -const size_t default_arch = 700; - -// grid scan tunings that maximize throughput while minimizing items_per_thread -template < size_t block_size, size_t cuda_arch > -struct grid_scan_default_items_per_thread -{ - static constexpr size_t value = 1; -}; - -// tuning for sm_70 -template < size_t block_size > -struct grid_scan_default_items_per_thread -{ - static constexpr size_t value = - (block_size <= 64) ? 13 : - (block_size <= 128) ? 9 : - (block_size <= 256) ? 6 : - (block_size <= 512) ? 5 : - (block_size <= 1024) ? 5 : 1; -}; - // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space @@ -235,6 +214,30 @@ struct grid_scan_max_items_per_thread { }; + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or cuda_arch +template < typename DataType, size_t block_size, size_t cuda_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for sm_70 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 700, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 13 : + (block_size <= 128) ? 9 : + (block_size <= 256) ? 6 : + (block_size <= 512) ? 5 : + (block_size <= 1024) ? 5 : 1; +}; + } // end namespace cuda } // end namespace detail } // end namespace rajaperf diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index fb9af5046..174c56f80 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -168,6 +168,12 @@ Executor::Executor(int argc, char** argv) if (strlen(cc.adiak_cmake_hip_architectures) > 0) { adiak::value("cmake_hip_architectures", cc.adiak_cmake_hip_architectures); } + if (strlen(cc.adiak_tuning_cuda_arch) > 0) { + adiak::value("tuning_cuda_arch", cc.adiak_tuning_cuda_arch); + } + if (strlen(cc.adiak_tuning_hip_arch) > 0) { + adiak::value("tuning_hip_arch", cc.adiak_tuning_hip_arch); + } if (cc.adiak_gpu_block_sizes.size() > 0) { adiak::value("gpu_block_sizes", cc.adiak_gpu_block_sizes); } diff --git a/src/common/HipGridScan.hpp b/src/common/HipGridScan.hpp index d2294b7bb..c8c0c6e8b 100644 --- a/src/common/HipGridScan.hpp +++ b/src/common/HipGridScan.hpp @@ -28,39 +28,6 @@ namespace hip const size_t warp_size = 64; const size_t max_static_shmem = 65536; -const size_t default_arch = 910; - -// grid scan tunings that maximize throughput while minimizing items_per_thread -template < size_t block_size, size_t hip_arch > -struct grid_scan_default_items_per_thread -{ - static constexpr size_t value = 1; -}; - -// tuning for gfx90a -template < size_t block_size > -struct grid_scan_default_items_per_thread -{ - static constexpr size_t value = - (block_size <= 64) ? 6 : - (block_size <= 128) ? 4 : - (block_size <= 256) ? 4 : - (block_size <= 512) ? 4 : - (block_size <= 1024) ? 
2 : 1; -}; - -// tuning for gfx942 -template < size_t block_size > -struct grid_scan_default_items_per_thread -{ - static constexpr size_t value = - (block_size <= 64) ? 22 : - (block_size <= 128) ? 22 : - (block_size <= 256) ? 19 : - (block_size <= 512) ? 13 : - (block_size <= 1024) ? 7 : 1; -}; - // perform a grid scan on val and returns the result at each thread // in exclusive and inclusive, note that val is used as scratch space @@ -249,6 +216,43 @@ struct grid_scan_max_items_per_thread { }; + +// tune grid scan to maximize throughput while minimizing items_per_thread + +// default tuning for unknown DataType or hip_arch +template < typename DataType, size_t block_size, size_t hip_arch, typename enable = void > +struct grid_scan_default_items_per_thread +{ + static constexpr size_t value = + grid_scan_max_items_per_thread::value / 2; +}; + +// tuning for gfx90a +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 910, std::enable_if_t > +{ + static constexpr size_t value = + (block_size <= 64) ? 6 : + (block_size <= 128) ? 4 : + (block_size <= 256) ? 4 : + (block_size <= 512) ? 4 : + (block_size <= 1024) ? 2 : 1; +}; + +// tuning for gfx942 +template < typename DataType, size_t block_size > +struct grid_scan_default_items_per_thread< + DataType, block_size, 942, std::enable_if_t> +{ + static constexpr size_t value = + (block_size <= 64) ? 22 : + (block_size <= 128) ? 22 : + (block_size <= 256) ? 19 : + (block_size <= 512) ? 13 : + (block_size <= 1024) ? 7 : 1; +}; + } // end namespace hip } // end namespace detail } // end namespace rajaperf diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 655e9bd55..45cafeff1 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -30,7 +30,12 @@ #cmakedefine RAJA_PERFSUITE_ENABLE_MPI #cmakedefine RAJA_PERFSUITE_ENABLE_OPENMP5_SCAN +#if defined(RAJA_ENABLE_CUDA) +#define RAJA_PERFSUITE_TUNING_CUDA_ARCH @RAJA_PERFSUITE_TUNING_CUDA_ARCH@ +#endif + #if defined(RAJA_ENABLE_HIP) +#define RAJA_PERFSUITE_TUNING_HIP_ARCH @RAJA_PERFSUITE_TUNING_HIP_ARCH@ #include #if (HIP_VERSION_MAJOR > 5) || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) @@ -102,6 +107,8 @@ const adiak::version adiak_compiler_version = std::string("@CMAKE_CXX_COMPILER_V const adiak::version adiak_cuda_compiler_version = std::string("@CMAKE_CUDA_COMPILER_VERSION@"); constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@"; constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHIECTURES@"; +constexpr static const char* adiak_tuning_cuda_arch = "@RAJA_PERFSUITE_TUNING_CUDA_ARCH@"; +constexpr static const char* adiak_tuning_hip_arch = "@RAJA_PERFSUITE_TUNING_HIP_ARCH@"; const std::vector adiak_gpu_block_sizes = {@RAJA_PERFSUITE_GPU_BLOCKSIZES@}; const std::vector adiak_atomic_replications = {@RAJA_PERFSUITE_ATOMIC_REPLICATIONS@}; const std::vector adiak_gpu_items_per_thread = {@RAJA_PERFSUITE_GPU_ITEMS_PER_THREAD@}; From 6e930813e6222a83a19b3e0064e55490a2eb5320 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 20 Feb 2024 08:53:06 -0800 Subject: [PATCH 263/454] Add unsigned long long calcChecksum --- src/common/DataUtils.cpp | 63 +++++++++++++++++----------------------- src/common/DataUtils.hpp | 3 ++ 2 files changed, 29 insertions(+), 37 deletions(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index c9c4d73bd..57ea977e5 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -528,13 +528,14 @@ void 
initData(Real_type& d) /* * Calculate and return checksum for data arrays. */ -long double calcChecksum(Int_ptr ptr, Size_type len, - Real_type scale_factor) +template < typename Data_getter > +long double calcChecksumImpl(Data_getter data, Size_type len, + Real_type scale_factor) { long double tchk = 0.0; long double ckahan = 0.0; for (Size_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; + long double x = (std::abs(std::sin(j+1.0))+0.5) * data(j); long double y = x - ckahan; volatile long double t = tchk + y; volatile long double z = t - tchk; @@ -550,48 +551,36 @@ long double calcChecksum(Int_ptr ptr, Size_type len, return tchk; } +long double calcChecksum(Int_ptr ptr, Size_type len, + Real_type scale_factor) +{ + return calcChecksumImpl([=](Size_type j) { + return static_cast<long double>(ptr[j]); + }, len, scale_factor); +} + +long double calcChecksum(unsigned long long* ptr, Size_type len, + Real_type scale_factor) +{ + return calcChecksumImpl([=](Size_type j) { + return static_cast<long double>(ptr[j]); + }, len, scale_factor); +} + long double calcChecksum(Real_ptr ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Size_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * ptr[j]; - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= scale_factor; - return tchk; + return calcChecksumImpl([=](Size_type j) { + return static_cast<long double>(ptr[j]); + }, len, scale_factor); } long double calcChecksum(Complex_ptr ptr, Size_type len, Real_type scale_factor) { - long double tchk = 0.0; - long double ckahan = 0.0; - for (Size_type j = 0; j < len; ++j) { - long double x = (std::abs(std::sin(j+1.0))+0.5) * (real(ptr[j])+imag(ptr[j])); - long double y = x - ckahan; - volatile long double t = tchk + y; - volatile long double z = t - tchk; - ckahan = z - y; - tchk = t; -#if 0 // RDH DEBUG - if ( (j % 100) == 0 ) { - getCout() << "j : tchk = " << j << " : " << tchk << std::endl; - } -#endif - } - tchk *= scale_factor; - return tchk; + return calcChecksumImpl([=](Size_type j) { + return static_cast<long double>(real(ptr[j])+imag(ptr[j])); + }, len, scale_factor); } } // closing brace for detail namespace diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 225d8233f..36608d951 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -150,6 +150,9 @@ void initData(Real_type& d); long double calcChecksum(Int_ptr d, Size_type len, Real_type scale_factor); /// +long double calcChecksum(unsigned long long* d, Size_type len, + Real_type scale_factor); +/// long double calcChecksum(Real_ptr d, Size_type len, Real_type scale_factor); /// From 50530da6e1fa222f1950522ea0eb2b99a4b6ac17 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 17 Feb 2024 16:02:27 -0800 Subject: [PATCH 264/454] Add replication support to GPU reducer macros This makes it easier to implement replication with reducers --- src/algorithm/REDUCE_SUM-Cuda.cpp | 18 +++---- src/algorithm/REDUCE_SUM-Hip.cpp | 18 +++---- src/basic/PI_ATOMIC-Cuda.cpp | 23 ++++----- src/basic/PI_ATOMIC-Hip.cpp | 23 ++++----- src/basic/PI_REDUCE-Cuda.cpp | 9 ++-- src/basic/PI_REDUCE-Hip.cpp | 9 ++-- src/basic/REDUCE3_INT-Cuda.cpp | 13 +++-- src/basic/REDUCE3_INT-Hip.cpp | 13 +++-- src/basic/REDUCE_STRUCT-Cuda.cpp | 17 +++---
src/basic/REDUCE_STRUCT-Hip.cpp | 17 +++---- src/basic/TRAP_INT-Cuda.cpp | 9 ++-- src/basic/TRAP_INT-Hip.cpp | 9 ++-- src/common/GPUUtils.hpp | 82 +++++++++++++++---------------- src/lcals/FIRST_MIN-Cuda.cpp | 6 +-- src/lcals/FIRST_MIN-Hip.cpp | 6 +-- src/stream/DOT-Cuda.cpp | 9 ++-- src/stream/DOT-Hip.cpp | 9 ++-- 17 files changed, 131 insertions(+), 159 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 6f79928e2..6704f682d 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -72,7 +72,7 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) int len = iend - ibegin; - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -105,13 +105,8 @@ void REDUCE_SUM::runCudaVariantCub(VariantID vid) m_sum_init, stream)); - if (sum != hsum) { - cudaErrchk( cudaMemcpyAsync( hsum, sum, sizeof(Real_type), - cudaMemcpyDeviceToHost, stream ) ); - } - - cudaErrchk(cudaStreamSynchronize(stream)); - m_sum = *hsum; + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); @@ -140,7 +135,7 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -149,7 +144,7 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -159,7 +154,8 @@ void REDUCE_SUM::runCudaVariantBase(VariantID vid) shmem, res.get_stream(), x, sum, m_sum_init, iend ); - RAJAPERF_CUDA_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); + RAJAPERF_CUDA_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 9999ea674..478fb1f5d 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -77,7 +77,7 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) int len = iend - ibegin; - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); // Determine temporary device storage requirements void* d_temp_storage = nullptr; @@ -132,13 +132,8 @@ void REDUCE_SUM::runHipVariantRocprim(VariantID vid) stream)); #endif - if (sum != hsum) { - hipErrchk( hipMemcpyAsync( hsum, sum, sizeof(Real_type), - hipMemcpyDeviceToHost, stream ) ); - } - - hipErrchk(hipStreamSynchronize(stream)); - m_sum = *hsum; + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); @@ -167,7 +162,7 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sum, hsum, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -176,7 +171,7 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sum_init, sum, hsum, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -186,7 +181,8 @@ void REDUCE_SUM::runHipVariantBase(VariantID vid) shmem, res.get_stream(), x, sum, m_sum_init, iend ); - RAJAPERF_HIP_REDUCER_COPY_BACK(&m_sum, sum, hsum, 1); + RAJAPERF_HIP_REDUCER_COPY_BACK(sum, hsum, 1, 1); + m_sum = hsum[0]; } stopTimer(); diff --git a/src/basic/PI_ATOMIC-Cuda.cpp b/src/basic/PI_ATOMIC-Cuda.cpp index cb1c2a0bc..f49c53518 100644 --- a/src/basic/PI_ATOMIC-Cuda.cpp +++ b/src/basic/PI_ATOMIC-Cuda.cpp @@ -47,14 +47,14 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) PI_ATOMIC_GPU_DATA_SETUP; - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; @@ -66,9 +66,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) dx, iend ); - Real_type rpi; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -78,7 +77,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); auto pi_atomic_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; @@ -94,9 +93,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) shmem, res.get_stream(), ibegin, iend, pi_atomic_lambda ); - Real_type rpi; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -106,7 +104,7 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -114,9 +112,8 @@ void PI_ATOMIC::runCudaVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - Real_type rpi; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index f65f36fdd..637c10156 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -47,14 +47,14 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) PI_ATOMIC_GPU_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); if ( vid == Base_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, 
block_size); constexpr size_t shmem = 0; @@ -66,9 +66,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) dx, iend ); - Real_type rpi; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -78,7 +77,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); auto pi_atomic_lambda = [=] __device__ (Index_type i) { double x = (double(i) + 0.5) * dx; @@ -94,9 +93,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) shmem, res.get_stream(), ibegin, iend, pi_atomic_lambda ); - Real_type rpi; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); @@ -106,7 +104,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { @@ -114,9 +112,8 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) RAJA::atomicAdd(pi, dx / (1.0 + x * x)); }); - Real_type rpi; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi_final = rpi * static_cast(4); + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi_final = hpi[0] * static_cast(4); } stopTimer(); diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index c9c165321..90a7e2c5a 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -67,7 +67,7 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -76,7 +76,7 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -88,9 +88,8 @@ void PI_REDUCE::runCudaVariantBase(VariantID vid) pi, m_pi_init, iend ); - Real_type rpi; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi = rpi * static_cast(4); + RAJAPERF_CUDA_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 7df62b8c8..494391f36 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -67,7 +67,7 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, pi, hpi, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -76,7 +76,7 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_pi_init, pi, hpi, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -88,9 +88,8 @@ void PI_REDUCE::runHipVariantBase(VariantID vid) pi, m_pi_init, iend ); - Real_type rpi; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rpi, pi, hpi, 1); - m_pi = rpi * static_cast(4); + RAJAPERF_HIP_REDUCER_COPY_BACK(pi, hpi, 1, 1); + m_pi = hpi[0] * static_cast(4); } stopTimer(); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 015693eee..4c1db4b5b 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -80,7 +80,7 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); + RAJAPERF_CUDA_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -90,7 +90,7 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; - RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); + RAJAPERF_CUDA_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -104,11 +104,10 @@ void REDUCE3_INT::runCudaVariantBase(VariantID vid) vmem + 2, m_vmax_init, iend ); - Int_type rvmem[3]; - RAJAPERF_CUDA_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); - m_vsum += rvmem[0]; - m_vmin = RAJA_MIN(m_vmin, rvmem[1]); - m_vmax = RAJA_MAX(m_vmax, rvmem[2]); + RAJAPERF_CUDA_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 460b68a2d..70e0f4fb4 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -80,7 +80,7 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3); + RAJAPERF_HIP_REDUCER_SETUP(Int_ptr, vmem, hvmem, 3, 1); constexpr size_t shmem = 3*sizeof(Int_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -90,7 +90,7 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Int_type ivmem[3] {m_vsum_init, m_vmin_init, m_vmax_init}; - RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3); + RAJAPERF_HIP_REDUCER_INITIALIZE(ivmem, vmem, hvmem, 3, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -104,11 +104,10 @@ void REDUCE3_INT::runHipVariantBase(VariantID vid) vmem + 2, m_vmax_init, iend ); - Int_type rvmem[3]; - RAJAPERF_HIP_REDUCER_COPY_BACK(rvmem, vmem, hvmem, 3); - m_vsum += rvmem[0]; - m_vmin = RAJA_MIN(m_vmin, rvmem[1]); - m_vmax = RAJA_MAX(m_vmax, rvmem[2]); + RAJAPERF_HIP_REDUCER_COPY_BACK(vmem, hvmem, 3, 1); + m_vsum += hvmem[0]; + m_vmin = RAJA_MIN(m_vmin, hvmem[1]); + m_vmax = RAJA_MAX(m_vmax, hvmem[2]); } stopTimer(); diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 2c20b2488..852c745e6 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp 
@@ -110,7 +110,7 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -120,7 +120,7 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; - RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6); + RAJAPERF_CUDA_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -134,13 +134,12 @@ void REDUCE_STRUCT::runCudaVariantBase(VariantID vid) m_init_sum, m_init_min, m_init_max, points.N ); - Real_type rmem[6]; - RAJAPERF_CUDA_REDUCER_COPY_BACK(rmem, mem, hmem, 6); - points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); - points.SetXMin(rmem[1]); - points.SetXMax(rmem[2]); - points.SetYMin(rmem[4]); - points.SetYMax(rmem[5]); + RAJAPERF_CUDA_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index db31819f5..78508723e 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -110,7 +110,7 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, mem, hmem, 6, 1); constexpr size_t shmem = 6*sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -120,7 +120,7 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type imem[6] {m_init_sum, m_init_min, m_init_max, m_init_sum, m_init_min, m_init_max}; - RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6); + RAJAPERF_HIP_REDUCER_INITIALIZE(imem, mem, hmem, 6, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -134,13 +134,12 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) m_init_sum, m_init_min, m_init_max, points.N ); - Real_type rmem[6]; - RAJAPERF_HIP_REDUCER_COPY_BACK(rmem, mem, hmem, 6); - points.SetCenter(rmem[0]/points.N, rmem[3]/points.N); - points.SetXMin(rmem[1]); - points.SetXMax(rmem[2]); - points.SetYMin(rmem[4]); - points.SetYMax(rmem[5]); + RAJAPERF_HIP_REDUCER_COPY_BACK(mem, hmem, 6, 1); + points.SetCenter(hmem[0]/points.N, hmem[3]/points.N); + points.SetXMin(hmem[1]); + points.SetXMax(hmem[2]); + points.SetYMin(hmem[4]); + points.SetYMax(hmem[5]); m_points=points; } diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 233563269..3be0a8e77 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -86,7 +86,7 @@ void TRAP_INT::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -95,7 +95,7 @@ void TRAP_INT::runCudaVariantBase(VariantID 
vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -109,9 +109,8 @@ void TRAP_INT::runCudaVariantBase(VariantID vid) sumx, iend); - Real_type rsumx; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); - m_sumx += rsumx * h; + RAJAPERF_CUDA_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 51b6f0da3..04437b738 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -86,7 +86,7 @@ void TRAP_INT::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) { - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, sumx, hsumx, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -95,7 +95,7 @@ void TRAP_INT::runHipVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_sumx_init, sumx, hsumx, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -109,9 +109,8 @@ void TRAP_INT::runHipVariantBase(VariantID vid) sumx, iend); - Real_type rsumx; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rsumx, sumx, hsumx, 1); - m_sumx += rsumx * h; + RAJAPERF_HIP_REDUCER_COPY_BACK(sumx, hsumx, 1, 1); + m_sumx += hsumx[0] * h; } stopTimer(); diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index f706a292b..a3e49d6f2 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -239,15 +239,15 @@ using reducer_helpers = camp::list< // device_ptr_name gets memory in the reduction data space for the current variant // host_ptr_name is set to either device_ptr_name if the reduction data space is // host accessible or a new allocation in a host accessible data space otherwise -#define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) \ +#define RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ DataSpace reduction_data_space = getReductionDataSpace(vid); \ DataSpace host_data_space = hostAccessibleDataSpace(reduction_data_space); \ \ pointer_type device_ptr_name; \ - allocData(reduction_data_space, device_ptr_name, (length)); \ + allocData(reduction_data_space, device_ptr_name, (length)*(replication)); \ pointer_type host_ptr_name = device_ptr_name; \ if (reduction_data_space != host_data_space) { \ - allocData(host_data_space, host_ptr_name, (length)); \ + allocData(host_data_space, host_ptr_name, (length)*(replication)); \ } // deallocate device_ptr_name and host_ptr_name @@ -261,81 +261,77 @@ using reducer_helpers = camp::list< // Initialize device_ptr_name with length copies of init_value // host_ptr_name will be used as an intermediary with an explicit copy // if the reduction data space is not host accessible -#define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length) \ +#define RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(gpu_type, init_value, device_ptr_name, host_ptr_name, length, replication) \ 
if (device_ptr_name != host_ptr_name) { \ for (size_t i = 0; i < static_cast(length); ++i) { \ - host_ptr_name[i] = (init_value); \ + for (size_t r = 0; r < static_cast(replication); ++r) { \ + host_ptr_name[i*(replication) + r] = (init_value); \ + } \ } \ gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \ - (length)*sizeof(device_ptr_name[0]), \ + (length)*(replication)*sizeof(device_ptr_name[0]), \ gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \ } else { \ for (size_t i = 0; i < static_cast(length); ++i) { \ - device_ptr_name[i] = (init_value); \ + for (size_t r = 0; r < static_cast(replication); ++r) { \ + device_ptr_name[i*(replication) + r] = (init_value); \ + } \ } \ } // Initialize device_ptr_name with values in init_ptr // host_ptr_name will be used as an intermediary with an explicit copy // if the reduction data space is not host accessible -#define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length) \ +#define RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(gpu_type, init_ptr, device_ptr_name, host_ptr_name, length, replication) \ if (device_ptr_name != host_ptr_name) { \ for (size_t i = 0; i < static_cast(length); ++i) { \ - host_ptr_name[i] = (init_ptr)[i]; \ + for (size_t r = 0; r < static_cast(replication); ++r) { \ + host_ptr_name[i*(replication) + r] = (init_ptr)[i]; \ + } \ } \ gpu_type##Errchk( gpu_type##MemcpyAsync( device_ptr_name, host_ptr_name, \ - (length)*sizeof(device_ptr_name[0]), \ + (length)*(replication)*sizeof(device_ptr_name[0]), \ gpu_type##MemcpyHostToDevice, res.get_stream() ) ); \ } else { \ for (size_t i = 0; i < static_cast(length); ++i) { \ - device_ptr_name[i] = (init_ptr)[i]; \ + for (size_t r = 0; r < static_cast(replication); ++r) { \ + device_ptr_name[i*(replication) + r] = (init_ptr)[i]; \ + } \ } \ } // Copy back data from device_ptr_name into host_ptr_name // if the reduction data space is not host accessible -#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length) \ +#define RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(gpu_type, device_ptr_name, host_ptr_name, length, replication) \ if (device_ptr_name != host_ptr_name) { \ gpu_type##Errchk( gpu_type##MemcpyAsync( host_ptr_name, device_ptr_name, \ - (length)*sizeof(device_ptr_name[0]), \ + (length)*(replication)*sizeof(device_ptr_name[0]), \ gpu_type##MemcpyDeviceToHost, res.get_stream() ) ); \ } \ gpu_type##Errchk( gpu_type##StreamSynchronize( res.get_stream() ) ); -// Copy data into final_ptr from host_ptr_name -#define RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) \ - for (size_t i = 0; i < static_cast(length); ++i) { \ - (final_ptr)[i] = host_ptr_name[i]; \ - } - - -#define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_CUDA_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) #define RAJAPERF_CUDA_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) -#define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, 
length) \ - RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_CUDA_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) - -#define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(cuda, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(cuda, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_CUDA_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(cuda, device_ptr_name, host_ptr_name, length, replication) + +#define RAJAPERF_HIP_REDUCER_SETUP(pointer_type, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_SETUP_IMPL(pointer_type, device_ptr_name, host_ptr_name, length, replication) #define RAJAPERF_HIP_REDUCER_TEARDOWN(device_ptr_name, host_ptr_name) \ RAJAPERF_GPU_REDUCER_TEARDOWN_IMPL(device_ptr_name, host_ptr_name) -#define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length) -#define RAJAPERF_HIP_REDUCER_COPY_BACK(final_ptr, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length) \ - RAJAPERF_GPU_REDUCER_COPY_FINAL_IMPL(final_ptr, host_ptr_name, length) +#define RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(init_value, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_VALUE_IMPL(hip, init_value, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_INITIALIZE(init_ptr, device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_INITIALIZE_IMPL(hip, init_ptr, device_ptr_name, host_ptr_name, length, replication) +#define RAJAPERF_HIP_REDUCER_COPY_BACK(device_ptr_name, host_ptr_name, length, replication) \ + RAJAPERF_GPU_REDUCER_COPY_BACK_IMPL(hip, device_ptr_name, host_ptr_name, length, replication) + // #define RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(kernel, variant) \ diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index f2433183d..f10972238 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -79,13 +79,13 @@ void FIRST_MIN::runCudaVariantBase(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); 
const size_t grid_size = std::min(normal_grid_size, max_grid_size); - RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); + RAJAPERF_CUDA_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; - RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); + RAJAPERF_CUDA_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); RPlaunchCudaKernel( (first_min), grid_size, block_size, @@ -93,7 +93,7 @@ void FIRST_MIN::runCudaVariantBase(VariantID vid) x, dminloc, mymin, iend ); - RAJAPERF_CUDA_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); + RAJAPERF_CUDA_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index fde3100e1..ced43a46d 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -79,13 +79,13 @@ void FIRST_MIN::runHipVariantBase(VariantID vid) const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); - RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size); + RAJAPERF_HIP_REDUCER_SETUP(MyMinLoc*, dminloc, mymin_block, grid_size, 1); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { FIRST_MIN_MINLOC_INIT; - RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size); + RAJAPERF_HIP_REDUCER_INITIALIZE_VALUE(mymin, dminloc, mymin_block, grid_size, 1); RPlaunchHipKernel( (first_min), grid_size, block_size, @@ -93,7 +93,7 @@ void FIRST_MIN::runHipVariantBase(VariantID vid) x, dminloc, mymin, iend ); - RAJAPERF_HIP_REDUCER_COPY_BACK_NOFINAL(dminloc, mymin_block, grid_size); + RAJAPERF_HIP_REDUCER_COPY_BACK(dminloc, mymin_block, grid_size, 1); for (Index_type i = 0; i < static_cast(grid_size); i++) { if ( mymin_block[i].val < mymin.val ) { mymin = mymin_block[i]; diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 45a2a5a8d..db89eb23f 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -66,7 +66,7 @@ void DOT::runCudaVariantBase(VariantID vid) if ( vid == Base_CUDA ) { - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( @@ -75,7 +75,7 @@ void DOT::runCudaVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); + RAJAPERF_CUDA_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -85,9 +85,8 @@ void DOT::runCudaVariantBase(VariantID vid) shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - Real_type rdprod; - RAJAPERF_CUDA_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); - m_dot += rdprod; + RAJAPERF_CUDA_REDUCER_COPY_BACK(dprod, hdprod, 1, 1); + m_dot += hdprod[0]; } stopTimer(); diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 0badd32fb..ce2896dee 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -66,7 +66,7 @@ void DOT::runHipVariantBase(VariantID vid) if ( vid == Base_HIP ) 
{ - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1); + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, dprod, hdprod, 1, 1); constexpr size_t shmem = sizeof(Real_type)*block_size; const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( @@ -75,7 +75,7 @@ void DOT::runHipVariantBase(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1); + RAJAPERF_HIP_REDUCER_INITIALIZE(&m_dot_init, dprod, hdprod, 1, 1); const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); const size_t grid_size = std::min(normal_grid_size, max_grid_size); @@ -85,9 +85,8 @@ void DOT::runHipVariantBase(VariantID vid) shmem, res.get_stream(), a, b, dprod, m_dot_init, iend ); - Real_type rdprod; - RAJAPERF_HIP_REDUCER_COPY_BACK(&rdprod, dprod, hdprod, 1); - m_dot += rdprod; + RAJAPERF_HIP_REDUCER_COPY_BACK(dprod, hdprod, 1, 1); + m_dot += hdprod[0]; } stopTimer(); From 1d7d959d7b47e02bea590564ab63c0f5454249c4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 16 Feb 2024 16:12:15 -0800 Subject: [PATCH 265/454] Add MULTI_REDUCE kernel Add the MULTI_REDUCE kernel which looks at performance for reducing into a runtime number of bins. In this case each iterate contributes to a single bin. --- src/CMakeLists.txt | 3 + src/basic/CMakeLists.txt | 6 + src/basic/MULTI_REDUCE-Cuda.cpp | 201 +++++++++++++++++++++++++++ src/basic/MULTI_REDUCE-Hip.cpp | 201 +++++++++++++++++++++++++++ src/basic/MULTI_REDUCE-OMP.cpp | 113 +++++++++++++++ src/basic/MULTI_REDUCE-OMPTarget.cpp | 85 +++++++++++ src/basic/MULTI_REDUCE-Seq.cpp | 106 ++++++++++++++ src/basic/MULTI_REDUCE.cpp | 105 ++++++++++++++ src/basic/MULTI_REDUCE.hpp | 120 ++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 947 insertions(+) create mode 100644 src/basic/MULTI_REDUCE-Cuda.cpp create mode 100644 src/basic/MULTI_REDUCE-Hip.cpp create mode 100644 src/basic/MULTI_REDUCE-OMP.cpp create mode 100644 src/basic/MULTI_REDUCE-OMPTarget.cpp create mode 100644 src/basic/MULTI_REDUCE-Seq.cpp create mode 100644 src/basic/MULTI_REDUCE.cpp create mode 100644 src/basic/MULTI_REDUCE.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 70b5b1d4f..13584d7d2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -134,6 +134,9 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp + basic/MULTI_REDUCE.cpp + basic/MULTI_REDUCE-Seq.cpp + basic/MULTI_REDUCE-OMPTarget.cpp lcals/DIFF_PREDICT.cpp lcals/DIFF_PREDICT-Seq.cpp lcals/DIFF_PREDICT-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index bab953cc3..cefc7c94c 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -116,5 +116,11 @@ blt_add_library( TRAP_INT-Cuda.cpp TRAP_INT-OMPTarget.cpp TRAP_INT-OMP.cpp + MULTI_REDUCE.cpp + MULTI_REDUCE-Seq.cpp + MULTI_REDUCE-Hip.cpp + MULTI_REDUCE-Cuda.cpp + MULTI_REDUCE-OMP.cpp + MULTI_REDUCE-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp new file mode 100644 index 000000000..b18e38c12 --- /dev/null +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -0,0 +1,201 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
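A note on the reducer hunks above: the RAJAPERF_CUDA_REDUCER_* and RAJAPERF_HIP_REDUCER_* macros now take an explicit replication count as their final argument. Existing kernels such as FIRST_MIN and DOT pass 1 (and the COPY_BACK_NOFINAL spelling is folded into COPY_BACK), so their behavior is unchanged, while the MULTI_REDUCE kernel introduced below uses replication > 1 to spread concurrent atomics on the same bin across distinct addresses. The following is a minimal host-side sketch of how a replicated buffer of num_bins * R slots is folded back into per-bin values; the helper name is hypothetical, and the suite does this work in its MULTI_REDUCE_GPU_FINALIZE_VALUES macro:

    #include <cstddef>
    #include <vector>

    // Fold a replicated reducer buffer back into one value per bin.
    // Slot layout matches MULTI_REDUCE_GPU_RAJA_BODY: bin * R + (i % R).
    std::vector<double> combine_replicated(const double* hvalues,
                                           std::size_t num_bins,
                                           std::size_t R)
    {
      std::vector<double> out(num_bins, 0.0);
      for (std::size_t b = 0; b < num_bins; ++b) {
        for (std::size_t r = 0; r < R; ++r) {
          out[b] += hvalues[b * R + r];
        }
      }
      return out;
    }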
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void multi_reduce(Real_ptr values, + Index_ptr bins, + Real_ptr data, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); + } +} + + + +template < size_t block_size, size_t replication > +void MULTI_REDUCE::runCudaVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + MULTI_REDUCE_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, values, hvalues, num_bins, replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (multi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + values, + bins, + data, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + auto multi_reduce_lambda = [=] __device__ (Index_type i) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, multi_reduce_lambda ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); + }); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(values, hvalues); + +} + +void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if 
(tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp new file mode 100644 index 000000000..41766ad0e --- /dev/null +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -0,0 +1,201 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void multi_reduce(Real_ptr values, + Index_ptr bins, + Real_ptr data, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); + } +} + + + +template < size_t block_size, size_t replication > +void MULTI_REDUCE::runHipVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + MULTI_REDUCE_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, values, hvalues, num_bins, replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (multi_reduce), + grid_size, block_size, + shmem, res.get_stream(), + values, + bins, + data, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + auto multi_reduce_lambda = [=] __device__ (Index_type i) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, 
multi_reduce_lambda ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); + }); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); + MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(values, hvalues); + +} + +void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MULTI_REDUCE-OMP.cpp b/src/basic/MULTI_REDUCE-OMP.cpp new file mode 100644 index 000000000..18777940e --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMP.cpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
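Both GPU variants of MULTI_REDUCE enumerate their tuning space the same way: nested seq_for loops over compile-time block-size and atomic-replication lists, filtered by the run parameters. Because runCudaVariant/runHipVariant and setCudaTuningDefinitions/setHipTuningDefinitions walk the lists in identical order, a flat tune_idx always selects the same (block size, replication) pair that produced the tuning name. A rough runtime analogue of that enumeration, with assumed list contents (the suite's lists are build-time configurable):

    #include <cstddef>
    #include <string>
    #include <vector>

    struct Tuning { std::size_t block_size, replication; std::string name; };

    std::vector<Tuning> enumerate_tunings()
    {
      const std::size_t block_sizes[]  = {128, 256, 512};  // assumed values
      const std::size_t replications[] = {1, 64, 4096};    // assumed values
      std::vector<Tuning> out;
      for (std::size_t b : block_sizes) {      // outer seq_for
        for (std::size_t r : replications) {   // inner seq_for
          out.push_back({b, r, "replicate_" + std::to_string(r) +
                               "_global_" + std::to_string(b)});
        }
      }
      return out;  // the position in this vector plays the role of tune_idx
    }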
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto multi_reduce_base_lam = [=](Index_type i) { + #pragma omp atomic + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULTI_REDUCE_RAJA_BODY(RAJA::omp_atomic); + }); + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE-OMPTarget.cpp b/src/basic/MULTI_REDUCE-OMPTarget.cpp new file mode 100644 index 000000000..30e2a5c2d --- /dev/null +++ b/src/basic/MULTI_REDUCE-OMPTarget.cpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
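With the macros from MULTI_REDUCE.hpp (defined below) expanded, the Base_OpenMP variant above reduces to an atomic scatter-add. A self-contained equivalent, using plain types in place of the suite's typedefs and zero-initialization in place of values_init (which setUp fills with zeros):

    // Equivalent of the Base_OpenMP variant with MULTI_REDUCE_BODY
    // expanded to values[bins[i]] += data[i].
    void multi_reduce_omp(double* values, const long* bins,
                          const double* data, long num_bins, long len)
    {
      for (long b = 0; b < num_bins; ++b) {  // MULTI_REDUCE_INIT_VALUES
        values[b] = 0.0;
      }
      #pragma omp parallel for
      for (long i = 0; i < len; ++i) {
        #pragma omp atomic
        values[bins[i]] += data[i];          // one atomic update per iterate
      }
    }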
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MULTI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(values, values_init, num_bins); + + #pragma omp target is_device_ptr(values, bins, data) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + MULTI_REDUCE_BODY; + } + + getOpenMPDeviceData(values_final, values, num_bins); + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(values, values_init, num_bins); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULTI_REDUCE_RAJA_BODY(RAJA::omp_atomic); + }); + + getOpenMPDeviceData(values_final, values, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n MULTI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MULTI_REDUCE-Seq.cpp b/src/basic/MULTI_REDUCE-Seq.cpp new file mode 100644 index 000000000..ac3bd41bb --- /dev/null +++ b/src/basic/MULTI_REDUCE-Seq.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + MULTI_REDUCE_BODY; + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto multi_reduce_base_lam = [=](Index_type i) { + MULTI_REDUCE_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + multi_reduce_base_lam(i); + } + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + MULTI_REDUCE_INIT_VALUES; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + MULTI_REDUCE_RAJA_BODY(RAJA::seq_atomic); + }); + + MULTI_REDUCE_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n MULTI_REDUCE : Unknown variant id = " << vid << std::endl; + } + + } + + MULTI_REDUCE_DATA_TEARDOWN; + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp new file mode 100644 index 000000000..9bb26201b --- /dev/null +++ b/src/basic/MULTI_REDUCE.cpp @@ -0,0 +1,105 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULTI_REDUCE.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace basic +{ + + +MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) + : KernelBase(rajaperf::Basic_MULTI_REDUCE, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = 10; + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type))*m_num_bins + + (1*sizeof(Real_type) + 0*sizeof(Real_type) + + 1*sizeof(Index_type) + 0*sizeof(Index_type)) * getActualProblemSize() ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +MULTI_REDUCE::~MULTI_REDUCE() +{ +} + +void MULTI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + allocAndInitDataRandValue(m_data, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + auto reset_data = scopedMoveData(m_data, getActualProblemSize(), vid); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(m_data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + } + + m_values_init.resize(m_num_bins, 0.0); + m_values_final.resize(m_num_bins, 0.0); +} + +void MULTI_REDUCE::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_values_final.data(), m_num_bins, vid); +} + +void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + deallocData(m_data, vid); + m_values_init.clear(); m_values_init.shrink_to_fit(); + m_values_final.clear(); m_values_final.shrink_to_fit(); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp new file mode 100644 index 000000000..ae3190c52 --- /dev/null +++ b/src/basic/MULTI_REDUCE.hpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
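The bin assignment in MULTI_REDUCE::setUp above maps each random value to one of num_bins bins and clamps the edges. Assuming allocAndInitDataRandValue produces values in [0, 1), the resulting distribution is roughly uniform; with the default 10 bins, for example, a value of 0.37 lands in bin 3. A standalone restatement of that mapping:

    // Standalone restatement of the setUp bin assignment, assuming the
    // random values lie in [0, 1).
    long assign_bin(double value, long num_bins)
    {
      long bin = static_cast<long>(value * num_bins);  // e.g. 0.37 -> bin 3 of 10
      if (bin >= num_bins) { bin = num_bins - 1; }     // clamp top edge
      if (bin < 0)         { bin = 0; }                // clamp bottom edge
      return bin;
    }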
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MULTI_REDUCE kernel reference implementation: +/// +/// double* values = calloc(num_bins, sizeof(double)); +/// for (Index_type i = 0; i < N; ++i ) { +/// values[bins[i]] += data[i]; +/// } +/// + +#ifndef RAJAPerf_Basic_MULTI_REDUCE_HPP +#define RAJAPerf_Basic_MULTI_REDUCE_HPP + +#define MULTI_REDUCE_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Real_ptr data = m_data; \ + Real_ptr values_init = m_values_init.data(); \ + Real_ptr values_final = m_values_final.data(); \ + Real_ptr values; \ + allocData(getReductionDataSpace(vid), values, num_bins); + +#define MULTI_REDUCE_DATA_TEARDOWN \ + deallocData(values, vid); + +#define MULTI_REDUCE_GPU_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Real_ptr data = m_data; \ + Real_ptr values_init = m_values_init.data(); \ + Real_ptr values_final = m_values_final.data(); + +#define MULTI_REDUCE_BODY \ + values[bins[i]] += data[i]; + +#define MULTI_REDUCE_RAJA_BODY(policy) \ + RAJA::atomicAdd(&values[bins[i]], data[i]); + +#define MULTI_REDUCE_GPU_RAJA_BODY(policy) \ + RAJA::atomicAdd(&values[bins[i]*replication + (i%replication)], data[i]); + +#define MULTI_REDUCE_INIT_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values[b] = values_init[b]; \ + } + +#define MULTI_REDUCE_FINALIZE_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + values_final[b] = values[b]; \ + } + +#define MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Real_type val_final = 0.0; \ + for (size_t r = 0; r < (replication); ++r) { \ + val_final += (hvalues)[b*(replication) + r]; \ + } \ + values_final[b] = val_final; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class MULTI_REDUCE : public KernelBase +{ +public: + + MULTI_REDUCE(const RunParams& params); + + ~MULTI_REDUCE(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateGlobal(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + static const size_t default_atomic_replication = 4096; + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + + Index_type m_num_bins; + Index_ptr m_bins; + Real_ptr m_data; + std::vector m_values_init; + std::vector m_values_final; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 17bd15bb7..e72e2ee88 100644 --- a/src/common/RAJAPerfSuite.cpp +++ 
b/src/common/RAJAPerfSuite.cpp @@ -35,6 +35,7 @@ #include "basic/REDUCE3_INT.hpp" #include "basic/REDUCE_STRUCT.hpp" #include "basic/TRAP_INT.hpp" +#include "basic/MULTI_REDUCE.hpp" // // Lcals kernels... @@ -186,6 +187,7 @@ static const std::string KernelNames [] = std::string("Basic_REDUCE3_INT"), std::string("Basic_REDUCE_STRUCT"), std::string("Basic_TRAP_INT"), + std::string("Basic_MULTI_REDUCE"), // // Lcals kernels... @@ -771,6 +773,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::TRAP_INT(run_params); break; } + case Basic_MULTI_REDUCE : { + kernel = new basic::MULTI_REDUCE(run_params); + break; + } // // Lcals kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fdb2878c4..e9525c5ac 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -95,6 +95,7 @@ enum KernelID { Basic_REDUCE3_INT, Basic_REDUCE_STRUCT, Basic_TRAP_INT, + Basic_MULTI_REDUCE, // // Lcals kernels... From 7ac439d0a3e33a2cd91d80ed50a09631d8ffa01d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 18 Feb 2024 21:47:47 -0800 Subject: [PATCH 266/454] Add HISTOGRAM kernel This is meant to be in some ways a simplification of the MULTI_REDUCE kernel and a way to compare with the performance of histogram implementations in performance libraries like cub and rocprim --- src/CMakeLists.txt | 3 + src/algorithm/CMakeLists.txt | 6 + src/algorithm/HISTOGRAM-Cuda.cpp | 288 +++++++++++++++++++++++ src/algorithm/HISTOGRAM-Hip.cpp | 317 ++++++++++++++++++++++++++ src/algorithm/HISTOGRAM-OMP.cpp | 113 +++++++++ src/algorithm/HISTOGRAM-OMPTarget.cpp | 85 +++++++ src/algorithm/HISTOGRAM-Seq.cpp | 106 +++++++++ src/algorithm/HISTOGRAM.cpp | 105 +++++++++ src/algorithm/HISTOGRAM.hpp | 121 ++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 1151 insertions(+) create mode 100644 src/algorithm/HISTOGRAM-Cuda.cpp create mode 100644 src/algorithm/HISTOGRAM-Hip.cpp create mode 100644 src/algorithm/HISTOGRAM-OMP.cpp create mode 100644 src/algorithm/HISTOGRAM-OMPTarget.cpp create mode 100644 src/algorithm/HISTOGRAM-Seq.cpp create mode 100644 src/algorithm/HISTOGRAM.cpp create mode 100644 src/algorithm/HISTOGRAM.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13584d7d2..b7a16296f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -250,6 +250,9 @@ blt_add_executable( algorithm/ATOMIC.cpp algorithm/ATOMIC-Seq.cpp algorithm/ATOMIC-OMPTarget.cpp + algorithm/HISTOGRAM.cpp + algorithm/HISTOGRAM-Seq.cpp + algorithm/HISTOGRAM-OMPTarget.cpp comm/HALO_base.cpp comm/HALO_PACKING.cpp comm/HALO_PACKING-Seq.cpp diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 43e3279e0..d7bd0be56 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -48,5 +48,11 @@ blt_add_library( ATOMIC-Cuda.cpp ATOMIC-OMP.cpp ATOMIC-OMPTarget.cpp + HISTOGRAM.cpp + HISTOGRAM-Seq.cpp + HISTOGRAM-Hip.cpp + HISTOGRAM-Cuda.cpp + HISTOGRAM-OMP.cpp + HISTOGRAM-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp new file mode 100644 index 000000000..9dc0c25d1 --- /dev/null +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -0,0 +1,288 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
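In reference form, the two new kernels differ only in what each iterate accumulates: MULTI_REDUCE adds a floating-point data value into its bin, while HISTOGRAM adds a constant 1 into an integer counter, which is what makes it directly comparable to the histogram routines in cub and rocprim:

    // Side-by-side reference forms (see each kernel's .hpp header):
    // MULTI_REDUCE:  values[bins[i]] += data[i];  // Real_type accumulators
    // HISTOGRAM:     counts[bins[i]] += 1;        // integer counters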
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void histogram(HISTOGRAM::Data_ptr counts, + Index_ptr bins, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + } +} + + +void HISTOGRAM::runCudaVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_CUDA ) { + + cudaStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::CudaDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::CudaDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + +template < size_t block_size, size_t replication > +void HISTOGRAM::runCudaVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (histogram), + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == Lambda_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + auto histogram_lambda = [=] __device__ (Index_type i) { + 
HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (lambda_cuda_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, histogram_lambda ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + RAJA::forall< RAJA::cuda_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + }); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + +void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + runCudaVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) +{ + if ( vid == Base_CUDA ) { + + addVariantTuningName(vid, "cub"); + + } + + if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp new file mode 100644 index 000000000..63475e79b --- /dev/null +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -0,0 +1,317 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
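The cub tuning above follows cub's two-phase calling convention: the first HistogramEven call is made with d_temp_storage == nullptr and only writes the required scratch size into temp_storage_bytes; the second, identical call does the work. Passing num_bins+1 evenly spaced levels over [0, num_bins] gives unit-width bins, so the routine counts how many entries of bins equal each integer value. A condensed sketch of the same protocol, with raw cudaMalloc/cudaFree standing in for the suite's allocData helpers:

    #include <cub/device/device_histogram.cuh>
    #include <cuda_runtime.h>

    void histogram_with_cub(const long* d_samples,
                            unsigned long long* d_counts,
                            long num_bins, int num_samples,
                            cudaStream_t stream)
    {
      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      // Pass 1: null temp pointer, so cub only reports temp_bytes.
      cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes,
                                          d_samples, d_counts,
                                          num_bins + 1,   // num_levels
                                          0L, num_bins,   // level range
                                          num_samples, stream);
      cudaMalloc(&d_temp, temp_bytes);
      // Pass 2: the identical call actually builds the histogram.
      cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes,
                                          d_samples, d_counts,
                                          num_bins + 1, 0L, num_bins,
                                          num_samples, stream);
      cudaFree(d_temp);
    }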
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#if defined(__HIPCC__) +#define ROCPRIM_HIP_API 1 +#include "rocprim/device/device_histogram.hpp" +#elif defined(__CUDACC__) +#include "cub/device/device_histogram.cuh" +#include "cub/util_allocator.cuh" +#endif + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + +template < size_t block_size, size_t replication > +__launch_bounds__(block_size) +__global__ void histogram(HISTOGRAM::Data_ptr counts, + Index_ptr bins, + Index_type iend) +{ + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + } +} + + +void HISTOGRAM::runHipVariantLibrary(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1); + + RAJAPERF_UNUSED_VAR(counts_init); + + if ( vid == Base_HIP ) { + + hipStream_t stream = res.get_stream(); + + int len = iend - ibegin; + + // Determine temporary device storage requirements + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + // Allocate temporary storage + unsigned char* temp_storage; + allocData(DataSpace::HipDevice, temp_storage, temp_storage_bytes); + d_temp_storage = temp_storage; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Run +#if defined(__HIPCC__) + hipErrchk(::rocprim::histogram_even(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + len, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + stream)); +#elif defined(__CUDACC__) + cudaErrchk(::cub::DeviceHistogram::HistogramEven(d_temp_storage, + temp_storage_bytes, + bins+ibegin, + counts, + static_cast(num_bins+1), + static_cast(0), + num_bins, + len, + stream)); +#endif + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, 1); + + } + stopTimer(); + + // Free temporary storage + deallocData(DataSpace::HipDevice, temp_storage); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + +template < size_t block_size, size_t replication > +void HISTOGRAM::runHipVariantReplicateGlobal(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + 
constexpr size_t shmem = 0; + + RPlaunchHipKernel( (histogram), + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == Lambda_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + auto histogram_lambda = [=] __device__ (Index_type i) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + }; + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (lambda_hip_forall), + grid_size, block_size, + shmem, res.get_stream(), + ibegin, iend, histogram_lambda ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + + RAJA::forall< RAJA::hip_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + }); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + +void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) +{ + size_t t = 0; + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + runHipVariantLibrary(vid); + + } + + t += 1; + + } + + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantReplicateGlobal(vid); + + } + + t += 1; + + } + + }); + + } + + }); + + } else { + + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void HISTOGRAM::setHipTuningDefinitions(VariantID vid) +{ + if ( vid == Base_HIP ) { + + addVariantTuningName(vid, "rocprim"); + + } + + if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(replication)) { + + addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + "_global_"+std::to_string(block_size)); + + } + + }); + + } + + }); + + } + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/algorithm/HISTOGRAM-OMP.cpp b/src/algorithm/HISTOGRAM-OMP.cpp new file mode 100644 index 000000000..2ab07a367 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMP.cpp @@ -0,0 +1,113 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto histogram_base_lam = [=](Index_type i) { + #pragma omp atomic + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + HISTOGRAM_RAJA_BODY(RAJA::omp_atomic); + }); + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM-OMPTarget.cpp b/src/algorithm/HISTOGRAM-OMPTarget.cpp new file mode 100644 index 000000000..93217c194 --- /dev/null +++ b/src/algorithm/HISTOGRAM-OMPTarget.cpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
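One detail of the Hip library path above: on AMD builds (__HIPCC__) it calls rocprim::histogram_even, and on NVIDIA builds (__CUDACC__) it falls back to cub. The two libraries share the query-then-run protocol but order their arguments differently, which is why the diff carries both spellings:

    // Argument-order difference between the two library calls above:
    //   rocprim::histogram_even(temp, temp_bytes,
    //                           samples, size, counts,
    //                           levels, lower, upper, stream);
    //   cub::DeviceHistogram::HistogramEven(temp, temp_bytes,
    //                           samples, counts,
    //                           levels, lower, upper, size, stream);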
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void HISTOGRAM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(counts, counts_init, num_bins); + + #pragma omp target is_device_ptr(counts, bins) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + #pragma omp atomic + HISTOGRAM_BODY; + } + + getOpenMPDeviceData(counts_final, counts, num_bins); + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(counts, counts_init, num_bins); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + HISTOGRAM_RAJA_BODY(RAJA::omp_atomic); + }); + + getOpenMPDeviceData(counts_final, counts, num_bins); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown OMP Target variant id = " << vid << std::endl; + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/algorithm/HISTOGRAM-Seq.cpp b/src/algorithm/HISTOGRAM-Seq.cpp new file mode 100644 index 000000000..c75463ed4 --- /dev/null +++ b/src/algorithm/HISTOGRAM-Seq.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + HISTOGRAM_BODY; + } + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto histogram_base_lam = [=](Index_type i) { + HISTOGRAM_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + for (Index_type i = ibegin; i < iend; ++i ) { + histogram_base_lam(i); + } + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_VALUES; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + HISTOGRAM_RAJA_BODY(RAJA::seq_atomic); + }); + + HISTOGRAM_FINALIZE_VALUES; + + } + stopTimer(); + + break; + } +#endif + + default : { + getCout() << "\n HISTOGRAM : Unknown variant id = " << vid << std::endl; + } + + } + + HISTOGRAM_DATA_TEARDOWN; + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp new file mode 100644 index 000000000..8de9690e3 --- /dev/null +++ b/src/algorithm/HISTOGRAM.cpp @@ -0,0 +1,105 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HISTOGRAM.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace algorithm +{ + + +HISTOGRAM::HISTOGRAM(const RunParams& params) + : KernelBase(rajaperf::Algorithm_HISTOGRAM, params) +{ + setDefaultProblemSize(1000000); + setDefaultReps(50); + + setActualProblemSize( getTargetProblemSize() ); + + m_num_bins = 10; + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + setBytesPerRep( (1*sizeof(Data_type) + 1*sizeof(Data_type))*m_num_bins + + (1*sizeof(Index_type) + 0*sizeof(Index_type)) * getActualProblemSize() ); + setFLOPsPerRep(1 * getActualProblemSize()); + + setUsesFeature(Forall); + setUsesFeature(Atomic); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( Lambda_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( Lambda_HIP ); + setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); +} + +HISTOGRAM::~HISTOGRAM() +{ +} + +void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocData(m_bins, getActualProblemSize(), vid); + { + auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } + + m_counts_init.resize(m_num_bins, 0); + m_counts_final.resize(m_num_bins, 0); +} + +void HISTOGRAM::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid][tune_idx] += calcChecksum(m_counts_final.data(), m_num_bins, vid); +} + +void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + deallocData(m_bins, vid); + m_counts_init.clear(); m_counts_init.shrink_to_fit(); + m_counts_final.clear(); m_counts_final.shrink_to_fit(); +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp new file mode 100644 index 000000000..bed41658c --- /dev/null +++ b/src/algorithm/HISTOGRAM.hpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
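The setBytesPerRep call in the HISTOGRAM constructor above charges one read and one write per bin for counts, plus one read per iterate for bins. Worked through with the defaults (1,000,000 iterates, 10 bins) and assuming 8-byte Data_type and Index_type:

    // counts: (1 read + 1 write) * 10 bins * 8 B   =        160 B
    // bins:    1 read * 1,000,000 iterates * 8 B   =  8,000,000 B
    // total per rep                                =  8,000,160 B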
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// HISTOGRAM kernel reference implementation: +/// +/// Index_type* counts = calloc(num_bins, sizeof(Index_type)); +/// for (Index_type i = 0; i < N; ++i ) { +/// counts[bins[i]] += 1; +/// } +/// + +#ifndef RAJAPerf_Algorithm_HISTOGRAM_HPP +#define RAJAPerf_Algorithm_HISTOGRAM_HPP + +#define HISTOGRAM_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Data_ptr counts_init = m_counts_init.data(); \ + Data_ptr counts_final = m_counts_final.data(); \ + Data_ptr counts; \ + allocData(getReductionDataSpace(vid), counts, num_bins); + +#define HISTOGRAM_DATA_TEARDOWN \ + deallocData(counts, vid); + +#define HISTOGRAM_GPU_DATA_SETUP \ + Index_type num_bins = m_num_bins; \ + Index_ptr bins = m_bins; \ + Data_ptr counts_init = m_counts_init.data(); \ + Data_ptr counts_final = m_counts_final.data(); + +#define HISTOGRAM_BODY \ + counts[bins[i]] += static_cast(1); + +#define HISTOGRAM_RAJA_BODY(policy) \ + RAJA::atomicAdd(&counts[bins[i]], static_cast(1)); + +#define HISTOGRAM_GPU_RAJA_BODY(policy) \ + RAJA::atomicAdd(&counts[bins[i]*replication + (i%replication)], static_cast(1)); + +#define HISTOGRAM_INIT_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts[b] = counts_init[b]; \ + } + +#define HISTOGRAM_FINALIZE_VALUES \ + for (Index_type b = 0; b < num_bins; ++b ) { \ + counts_final[b] = counts[b]; \ + } + +#define HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication) \ + for (Index_type b = 0; b < (num_bins); ++b) { \ + Data_type count_final = 0; \ + for (size_t r = 0; r < (replication); ++r) { \ + count_final += (hcounts)[b*(replication) + r]; \ + } \ + counts_final[b] = count_final; \ + } + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace algorithm +{ + +class HISTOGRAM : public KernelBase +{ +public: + using Data_type = unsigned long long; + using Data_ptr = Data_type*; + + HISTOGRAM(const RunParams& params); + + ~HISTOGRAM(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + void runCudaVariantLibrary(VariantID vid); + void runHipVariantLibrary(VariantID vid); + template < size_t block_size, size_t replication > + void runCudaVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t replication > + void runHipVariantReplicateGlobal(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + static const size_t default_atomic_replication = 4096; + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + + Index_type m_num_bins; + Index_ptr m_bins; + std::vector m_counts_init; + std::vector m_counts_final; +}; + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp 
b/src/common/RAJAPerfSuite.cpp index e72e2ee88..837b1eccb 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -106,6 +106,7 @@ #include "algorithm/MEMSET.hpp" #include "algorithm/MEMCPY.hpp" #include "algorithm/ATOMIC.hpp" +#include "algorithm/HISTOGRAM.hpp" // // Comm kernels... @@ -258,6 +259,7 @@ static const std::string KernelNames [] = std::string("Algorithm_MEMSET"), std::string("Algorithm_MEMCPY"), std::string("Algorithm_ATOMIC"), + std::string("Algorithm_HISTOGRAM"), // // Comm kernels... @@ -998,6 +1000,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new algorithm::ATOMIC(run_params); break; } + case Algorithm_HISTOGRAM: { + kernel = new algorithm::HISTOGRAM(run_params); + break; + } // // Comm kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e9525c5ac..de631f4bc 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -166,6 +166,7 @@ enum KernelID { Algorithm_MEMSET, Algorithm_MEMCPY, Algorithm_ATOMIC, + Algorithm_HISTOGRAM, // // Comm kernels... From d6acda662579fc1b4797834528ec2e803d513335 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 13 Mar 2024 12:23:27 -0700 Subject: [PATCH 267/454] Update BLT and RAJA submodules to match develop --- blt | 2 +- tpl/RAJA | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/blt b/blt index a7f0a6ecc..148c53ecc 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit a7f0a6ecc4fdfa1724399b1454c3909b9ee02e81 +Subproject commit 148c53ecc8bcaad5eaa4c1e39cb8144b8f1388ae diff --git a/tpl/RAJA b/tpl/RAJA index ac4d5e5cd..82d1b926a 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit ac4d5e5cd00b18cd2b827055b25a904532ba25c0 +Subproject commit 82d1b926ada0fbb15a4a6e0adadc30c715cfda7b From 5575c23f6f00ccc50a416ae120f75d867fbc6fb0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 13 Mar 2024 12:46:19 -0700 Subject: [PATCH 268/454] Remove some merge conflict cruft missed earlier --- src/common/RunParams.cpp | 3 +-- src/common/RunParams.hpp | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b7046822f..d9accbcac 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -585,7 +585,7 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) opt == std::string("--hip-data-space") || opt == std::string("-hds") || opt == std::string("--sycl-data-space") || - opt == std::string("-syds")) + opt == std::string("-syds") || opt == std::string("--kokkos-data-space") || opt == std::string("-kds") || opt == std::string("--seq-reduction-data-space") || @@ -1244,7 +1244,6 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t (name of data space to use with MPI and kokkos execution)\n"; str << "\t\t Examples...\n" << "\t\t --kokkos-mpi-data-space Copy (run KOKKOS variants and copy to Host memory for MPI buffers)\n\n"; ->>>>>>> develop #if defined(RAJA_PERFSUITE_USE_CALIPER) str << "\t --add-to-spot-config, -atsc [Default is none]\n" diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 785a3d37c..8b5aff224 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -153,6 +153,7 @@ class RunParams { DataSpace getOmpTargetReductionDataSpace() const { return ompTargetReductionDataSpace; } DataSpace getCudaReductionDataSpace() const { return cudaReductionDataSpace; } DataSpace getHipReductionDataSpace() const { return hipReductionDataSpace; } + DataSpace getSyclReductionDataSpace() const { 
return syclReductionDataSpace; } DataSpace getKokkosReductionDataSpace() const { return kokkosReductionDataSpace; } DataSpace getSeqMPIDataSpace() const { return seqMPIDataSpace; } @@ -160,6 +161,7 @@ class RunParams { DataSpace getOmpTargetMPIDataSpace() const { return ompTargetMPIDataSpace; } DataSpace getCudaMPIDataSpace() const { return cudaMPIDataSpace; } DataSpace getHipMPIDataSpace() const { return hipMPIDataSpace; } + DataSpace getSyclMPIDataSpace() const { return syclMPIDataSpace; } DataSpace getKokkosMPIDataSpace() const { return kokkosMPIDataSpace; } double getPFTolerance() const { return pf_tol; } @@ -260,6 +262,7 @@ class RunParams { DataSpace ompTargetReductionDataSpace = DataSpace::OmpTarget; DataSpace cudaReductionDataSpace = DataSpace::CudaManagedDevicePreferredHostAccessed; DataSpace hipReductionDataSpace = DataSpace::HipDevice; + DataSpace syclReductionDataSpace = DataSpace::SyclDevice; DataSpace kokkosReductionDataSpace = DataSpace::Host; DataSpace seqMPIDataSpace = DataSpace::Host; @@ -267,6 +270,7 @@ class RunParams { DataSpace ompTargetMPIDataSpace = DataSpace::Copy; DataSpace cudaMPIDataSpace = DataSpace::CudaPinned; DataSpace hipMPIDataSpace = DataSpace::HipPinned; + DataSpace syclMPIDataSpace = DataSpace::SyclPinned; DataSpace kokkosMPIDataSpace = DataSpace::Copy; // From c65162e0f339ac4864d7f2bc660bccf1235bfbfd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Mar 2024 13:53:26 -0700 Subject: [PATCH 269/454] Fix dataspace enum issue and add build script for SYCL compiler on corona --- scripts/lc-builds/corona_sycl.sh | 30 ++++++++++++++++++++---------- src/common/DataUtils.cpp | 4 ++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index ee0bbd23d..2f978f0d7 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -1,19 +1,19 @@ #!/usr/bin/env bash ############################################################################### -# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### if [[ $# -lt 1 ]]; then echo - echo "You must pass 1 argument to the script (in this order): " + echo "You must pass 1 argument to the script: " echo " 1) SYCL compiler installation path" echo echo "For example: " - echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_hip_gcc10.2.1_rocm5.1.0/install" + echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_a0117ab8692a_hip_gcc10.2.1_rocm5.6.0" exit fi @@ -36,6 +36,7 @@ mkdir build_${BUILD_SUFFIX}_${USER} && cd build_${BUILD_SUFFIX}_${USER} DATE=$(printf '%(%Y-%m-%d)T\n' -1) export PATH=${SYCL_PATH}/bin:$PATH +export LD_LIBRARY_PATH=${SYCL_PATH}/lib:${SYCL_PATH}/lib64:$LD_LIBRARY_PATH ## NOTE: RAJA tests are turned off due to compilation issues. @@ -47,21 +48,30 @@ cmake \ -DENABLE_CUDA=Off \ -DRAJA_ENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DENABLE_SYCL=On \ + -DRAJA_ENABLE_SYCL=On \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_LINKER=clang++ \ - -DCMAKE_CXX_STANDARD=17 \ - -DENABLE_TESTS=Off \ + -DBLT_CXX_STD=c++17 \ + -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ "$@" \ .. 
echo echo "***********************************************************************" -echo -echo "Remember to export PATH=${SYCL_PATH}/bin:\$PATH to obtain the correct compiler paths." -echo +echo echo "cd into directory build_${BUILD_SUFFIX}_${USER} and run make to build RAJA" +echo +echo "To run RAJA tests, exercises, etc. with the build, please do the following:" +echo +echo " 1) Load the ROCm module version matching the version in the compiler path" +echo " you passed to this script." +echo +echo " 2) Set the "LD_LIBRARY_PATH environment variable to " +echo " ${SYCL_PATH}/lib:${SYCL_PATH}/lib64:${LD_LIBRARY_PATH}" +echo +echo " where SYCL_PATH is set to the compiler installation path you passed" +echo " to this script." echo echo "***********************************************************************" diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 1bb0d7e84..ebf007e83 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -704,6 +704,10 @@ DataSpace hostCopyDataSpace(DataSpace dataSpace) case DataSpace::HipDeviceFine: return DataSpace::HipPinned; + case DataSpace::SyclManaged: + case DataSpace::SyclDevice: + return DataSpace::SyclPinned; + default: { throw std::invalid_argument("hostCopyDataSpace : Unknown data space"); From bdf16382db59367e893163ae30f0d275c6ff8ea2 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 14 Mar 2024 13:58:27 -0700 Subject: [PATCH 270/454] update RAJA to develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 82d1b926a..7b1e5248e 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 82d1b926ada0fbb15a4a6e0adadc30c715cfda7b +Subproject commit 7b1e5248e113ee2df40a8878aac2f54b6ee2b74e From 70bea43bf58a309e2b03b5533fa62c9864971f53 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 14 Mar 2024 14:56:05 -0700 Subject: [PATCH 271/454] Fix indexing bug --- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index 53cc292f0..2f014e4b5 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -32,8 +32,8 @@ template void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; INIT_VIEW1D_OFFSET_DATA_SETUP; @@ -44,13 +44,13 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend-ibegin, work_group_size); qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item ) { - Index_type i = item.get_global_id(0); + Index_type i = ibegin + item.get_global_id(0); if (i < iend) { INIT_VIEW1D_OFFSET_BODY } @@ -68,10 +68,10 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), + h.parallel_for(sycl::range<1>(iend-ibegin), [=] (sycl::item<1> item ) { - Index_type i = item.get_id(0); + Index_type i = ibegin + item.get_id(0); INIT_VIEW1D_OFFSET_BODY }); From 19368d6b53a4ebda9d7128801615aa89f1aa1f37 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 15 Mar 2024 09:10:31 -0700 Subject: [PATCH 272/454] Fix bug in script and clarify usage steps --- scripts/lc-builds/corona_sycl.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 2f978f0d7..aae3a177d 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -68,10 +68,10 @@ echo echo " 1) Load the ROCm module version matching the version in the compiler path" echo " you passed to this script." echo -echo " 2) Set the "LD_LIBRARY_PATH environment variable to " -echo " ${SYCL_PATH}/lib:${SYCL_PATH}/lib64:${LD_LIBRARY_PATH}" +echo " 2) Prefix the LD_LIBRARY_PATH environment variable with " +echo " SYCL_PATH/lib:SYCL_PATH/lib64" echo echo " where SYCL_PATH is set to the compiler installation path you passed" -echo " to this script." +echo " to this script (using the proper command for your shell)." 
echo -echo "***********************************************************************" +echo "***********************************************************************" From 9a6c6733b4c2d335bb542da719b0a9c95b5dbdcd Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 18 Mar 2024 10:16:16 -0700 Subject: [PATCH 273/454] Fix up all file headers --- src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 15 ++++----------- src/apps/ENERGY-Sycl.cpp | 15 ++++----------- src/apps/FIR-Sycl.cpp | 15 ++++----------- src/apps/LTIMES-Sycl.cpp | 15 ++++----------- src/apps/LTIMES_NOVIEW-Sycl.cpp | 15 ++++----------- src/apps/PRESSURE-Sycl.cpp | 15 ++++----------- src/apps/VOL3D-Sycl.cpp | 15 ++++----------- src/basic/DAXPY-Sycl.cpp | 15 ++++----------- src/basic/IF_QUAD-Sycl.cpp | 15 ++++----------- src/basic/INDEXLIST.hpp | 4 ++-- src/basic/INDEXLIST_3LOOP.hpp | 4 ++-- src/basic/INIT3-Sycl.cpp | 15 ++++----------- src/basic/INIT_VIEW1D-Sycl.cpp | 15 ++++----------- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 15 ++++----------- src/basic/MAT_MAT_SHARED.hpp | 2 +- src/basic/MULADDSUB-Sycl.cpp | 15 ++++----------- src/basic/NESTED_INIT-Sycl.cpp | 15 ++++----------- src/basic/REDUCE3_INT-Sycl.cpp | 15 ++++----------- src/basic/TRAP_INT-Sycl.cpp | 15 ++++----------- src/common/SyclDataUtils.hpp | 4 ++-- src/lcals/DIFF_PREDICT-Sycl.cpp | 15 ++++----------- src/lcals/EOS-Sycl.cpp | 15 ++++----------- src/lcals/FIRST_DIFF-Sycl.cpp | 15 ++++----------- src/lcals/GEN_LIN_RECUR-Sycl.cpp | 4 ++-- src/lcals/HYDRO_1D-Sycl.cpp | 15 ++++----------- src/lcals/HYDRO_2D-Sycl.cpp | 15 ++++----------- src/lcals/INT_PREDICT-Sycl.cpp | 15 ++++----------- src/lcals/PLANCKIAN-Sycl.cpp | 15 ++++----------- src/lcals/TRIDIAG_ELIM-Sycl.cpp | 4 ++-- src/polybench/POLYBENCH_2MM-Sycl.cpp | 16 ++++------------ src/stream/ADD-Sycl.cpp | 15 ++++----------- src/stream/COPY-Sycl.cpp | 15 ++++----------- src/stream/DOT-Sycl.cpp | 2 +- src/stream/MUL-Sycl.cpp | 15 ++++----------- src/stream/TRIAD-Sycl.cpp | 15 ++++----------- 35 files changed, 124 insertions(+), 321 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp index 23d70ba78..83a20ed3e 100644 --- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "DEL_DOT_VEC_2D.hpp" diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index 713b80256..d94907963 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. 
-// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "ENERGY.hpp" diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index 6ced4d5cf..eabd9e78e 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "FIR.hpp" diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 799e247b1..d88082bf3 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "LTIMES.hpp" diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index 310d7cd60..4d50bfc58 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "LTIMES_NOVIEW.hpp" diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp index 269709fc7..c27054e04 100644 --- a/src/apps/PRESSURE-Sycl.cpp +++ b/src/apps/PRESSURE-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. 
-// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "PRESSURE.hpp" diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index a61cef626..6940f345e 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "VOL3D.hpp" diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index dc7802d3b..950455102 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "DAXPY.hpp" diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index bb7f8c010..3d2aae511 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "IF_QUAD.hpp" diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 0836d8197..3aea44950 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index e19ee5508..113908093 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-22, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 9761fc639..1125d546d 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "INIT3.hpp" diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index 47de6058a..afb0abbd4 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "INIT_VIEW1D.hpp" diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index 2f014e4b5..21b2a6fd4 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "INIT_VIEW1D_OFFSET.hpp" diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index 095721c27..74ec47dff 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. // diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index f4f13c681..27e0113c9 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "MULADDSUB.hpp" diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index 94abf0f7e..714bdd99c 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "NESTED_INIT.hpp" diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index e4827d24b..3574bffd3 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "REDUCE3_INT.hpp" diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index 8671c10b2..7c472884c 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "TRAP_INT.hpp" diff --git a/src/common/SyclDataUtils.hpp b/src/common/SyclDataUtils.hpp index 8301f5006..e426476c4 100644 --- a/src/common/SyclDataUtils.hpp +++ b/src/common/SyclDataUtils.hpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index 28d01cd98..161fc174b 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "DIFF_PREDICT.hpp" diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index 6b8beaacc..796d39ead 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "EOS.hpp" diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index 323a0313b..656006e5c 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "FIRST_DIFF.hpp" diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index 6c98c8908..44ff78037 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index 12d29b335..959c4d805 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "HYDRO_1D.hpp" diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index a737c97f7..516084ec4 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "HYDRO_2D.hpp" diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index 5d09278e2..b13603f42 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "INT_PREDICT.hpp" diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index 9a2d29d6c..ae17b10d0 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "PLANCKIAN.hpp" diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index e8237882f..4c59c29ef 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -1,7 +1,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. +// See the RAJAPerf/LICENSE file for details. 
// // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index bd051daf0..8bade3b7d 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -1,17 +1,9 @@ - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "POLYBENCH_2MM.hpp" diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp index 884860b01..1fd1ee166 100644 --- a/src/stream/ADD-Sycl.cpp +++ b/src/stream/ADD-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "ADD.hpp" diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp index c03ea9d72..978cff141 100644 --- a/src/stream/COPY-Sycl.cpp +++ b/src/stream/COPY-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "COPY.hpp" diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp index 13277bdb0..113a605e9 100644 --- a/src/stream/DOT-Sycl.cpp +++ b/src/stream/DOT-Sycl.cpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. // See the RAJAPerf/LICENSE file for details. 
// diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index a9c946d63..8e2a77d01 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "MUL.hpp" diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index eeb04d94f..3a5e40e31 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -1,16 +1,9 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-738930 -// -// All rights reserved. -// -// This file is part of the RAJA Performance Suite. -// -// For details about use and distribution, please read RAJAPerf/LICENSE. +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "TRIAD.hpp" From 7b574a6f2626379fbb4573633976c38d97d249b3 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 18 Mar 2024 10:24:21 -0700 Subject: [PATCH 274/454] Fix file header in build script --- scripts/alcf-builds/sycl.sh | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index c4421b08f..cb7f67d49 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -1,18 +1,11 @@ #!/usr/bin/env bash -## -## Copyright (c) 2017-19, Lawrence Livermore National Security, LLC. -## -## Produced at the Lawrence Livermore National Laboratory. -## -## LLNL-CODE-738930 -## -## All rights reserved. -## -## This file is part of the RAJA Performance Suite. -## -## For details about use and distribution, please read RAJAPerf/LICENSE. -## +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### BUILD_SUFFIX=sycl From 93fe8fe60442d0d7f8678ecc991eba18ac1b807a Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Wed, 20 Mar 2024 09:40:40 -0700 Subject: [PATCH 275/454] Fix Sycl kernel variants and make more consistent with other GPU variants --- src/basic/NESTED_INIT-Sycl.cpp | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index 94abf0f7e..8d1215fd1 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -28,6 +28,13 @@ namespace rajaperf namespace basic { + // + // Define work-group shape for SYCL execution + // +#define i_block_sz (32) +#define j_block_sz (work_group_size / i_block_sz) +#define k_block_sz (1) + template void NESTED_INIT::runSyclVariantImpl(VariantID vid) { @@ -38,23 +45,24 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { if (work_group_size > 0) { + + sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_block_sz), + RAJA_DIVIDE_CEILING_INT(nj, j_block_sz), + RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)); + sycl::range<3> wkgroup_dim(k_block_sz, j_block_sz, i_block_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(ni, work_group_size); - + qu->submit([&] (cl::sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( - sycl::range<3> (nk, nj, global_size), - sycl::range<3> (1, 1, work_group_size)), + h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), [=] (sycl::nd_item<3> item) { Index_type i = item.get_global_id(2); Index_type j = item.get_global_id(1); Index_type k = item.get_global_id(0); - if (i < ni) { + if (i < ni && j < nj && k < nk) { NESTED_INIT_BODY } }); @@ -98,9 +106,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< - RAJA::statement::For<0, RAJA::sycl_global_2, // i - RAJA::statement::For<1, RAJA::sycl_global_1<1>, // j - RAJA::statement::For<2, RAJA::sycl_global_0<1>, // i + RAJA::statement::For<2, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > @@ -111,9 +119,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, nk), + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, ni)), + RAJA::RangeSegment(0, nk)), [=] (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); From efce54c21ccca47ba6b5ff77c8cd3010df61d532 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Wed, 20 Mar 2024 10:09:30 -0700 Subject: [PATCH 276/454] Clean up some memory issues in Sycl variants --- src/basic/DAXPY-Sycl.cpp | 2 +- src/basic/IF_QUAD-Sycl.cpp | 2 +- src/basic/REDUCE3_INT-Sycl.cpp | 7 +------ src/basic/TRAP_INT-Sycl.cpp | 14 +++++++++----- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index 950455102..b0464f2ee 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -87,7 +87,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DAXPY_BODY; }); diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 3d2aae511..2fd6ae10b 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -84,7 +84,7 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { IF_QUAD_BODY; }); diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index 3574bffd3..eabe80120 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -107,7 +107,6 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) initSyclDeviceData(hmin, &m_vmin_init, 1, qu); initSyclDeviceData(hmax, &m_vmax_init, 1, qu); - qu->submit([&] (sycl::handler& h) { auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); @@ -156,8 +155,6 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) return; } - REDUCE3_INT_DATA_SETUP_SYCL; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -165,7 +162,7 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { REDUCE3_INT_BODY_RAJA; }); @@ -178,8 +175,6 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) qu->wait(); stopTimer(); - REDUCE3_INT_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n REDUCE3_INT : Unknown Sycl variant id = " << vid << std::endl; } diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index 7c472884c..3db1d183d 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -47,15 +47,17 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { + Real_ptr sumx; + allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); + if (work_group_size > 0) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_ptr sumx; - allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sumx, &m_sumx_init, 1, qu); qu->submit([&] (sycl::handler& hdl) { @@ -87,8 +89,7 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_ptr sumx; - allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); + initSyclDeviceData(sumx, &m_sumx_init, 1, qu); qu->submit([&] (sycl::handler& hdl) { @@ -114,6 +115,9 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) stopTimer(); } + + deallocSyclDeviceData(sumx, qu); \ + } else if ( vid == RAJA_SYCL ) { if ( work_group_size == 0 ) { From 
640b429db23e72418a7976ce474bc42fe4a98885 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 20 Mar 2024 13:53:22 -0700 Subject: [PATCH 277/454] Update a few things in the GitLab CI for consistency with RAJA --- .gitlab-ci.yml | 2 +- .gitlab/custom-jobs-and-variables.yml | 9 +++------ .gitlab/jobs/lassen.yml | 14 +++++++------- .gitlab/jobs/poodle.yml | 10 +++++----- .gitlab/jobs/ruby.yml | 13 ++++++------- .gitlab/jobs/tioga.yml | 4 ++-- .uberenv_config.json | 2 +- 7 files changed, 25 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0d5f6bc6c..342c05c36 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -72,7 +72,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.0' + ref: 'v2023.12.3' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 4744a2052..90651210f 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -18,7 +18,7 @@ variables: RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=14 --nodes=2" # Arguments for job level allocation # Note: We repeat the reservation, necessary when jobs are manually re-triggered. - RUBY_JOB_ALLOC: "--overlap --reservation=ci --nodes=1" + RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp" # Project specific deps for ruby @@ -26,12 +6,9 @@ variables: # Poodle # Arguments for top level allocation -# Optimization notes: We have 4 jobs lasting at max 5 minutes and using 28 -# cores out of 112 available (see -j in scripts/gitlab/build_and_test.sh). -# We allow allocation overlapping. 
- POODLE_SHARED_ALLOC: "--exclusive --partition=pdebug --time=14 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=14 --nodes=1" # Arguments for job level allocation - POODLE_JOB_ALLOC: "--overlap --nodes=1" + POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp" # Project specific deps for poodle diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index e895a38a8..7fdf16794 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -22,7 +22,7 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl=@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" LASSEN_JOB_ALLOC: "1 -W 60 -q pci" extends: .job_on_lassen @@ -37,20 +37,20 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: gcc_8_3_1: variables: - SPEC: " ~shared +openmp %gcc@8.3.1 ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp %gcc@=8.3.1 ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp +cuda %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: extends: .job_on_lassen variables: - SPEC: " ~shared +openmp +cuda +mpi %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp +cuda +mpi %gcc@=8.3.1 cuda_arch=70 ^cuda@11.5.0+allow-unsupported-compilers ^spectrum-mpi ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.5.0" LASSEN_JOB_ALLOC: "1 --atsdisable -W 30 -q pci" @@ -60,12 +60,12 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled_mpi: clang_13_0_1_libcpp: variables: - SPEC: " ~shared +openmp %clang@13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" + SPEC: " ~shared +openmp %clang@=13.0.1 cflags==\"-DGTEST_HAS_CXXABI_H_=0\" cxxflags==\"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0\" ${PROJECT_LASSEN_DEPS}" extends: .job_on_lassen #clang_14_0_5_asan: # variables: -# SPEC: " ~shared +openmp %clang@14.0.5 cxxflags==\"-fsanitize=address\" ${PROJECT_LASSEN_DEPS}" +# SPEC: " ~shared +openmp %clang@=14.0.5 cxxflags==\"-fsanitize=address\" ${PROJECT_LASSEN_DEPS}" # ASAN_OPTIONS: "detect_leaks=1" # LSAN_OPTIONS: "suppressions=${CI_PROJECT_DIR}/tpl/RAJA/suppressions.asan" # extends: .job_on_lassen @@ -73,5 +73,5 @@ clang_13_0_1_libcpp: # Activated in RAJA, but we don't use desul atomics here #gcc_8_3_1_cuda_10_1_168_desul_atomics: # variables: -# SPEC: "+openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" +# SPEC: "+openmp +cuda +desul %gcc@=8.3.1 cuda_arch=70 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers 
${PROJECT_LASSEN_DEPS}" # extends: .job_on_lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 96fa9b8a4..9ba4d3f3d 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -21,22 +21,22 @@ clang_14_0_6: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@2022.1.0 ${PROJECT_POODLE_DEPS}" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle @@ -49,6 +49,6 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ${PROJECT_POODLE_DEPS}" + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index 6944bb010..c1d4e043c 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -12,34 +12,33 @@ echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" -######################## +####################### # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for RAJA. # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. 
-# Overriding shared config for longer run and algorithm variants clang_14_0_6: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@14.0.6 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %clang@=14.0.6 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@10.3.1 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} +omptask %gcc@=10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" extends: .job_on_ruby intel_19_1_2_gcc_10_3_1: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" extends: .job_on_ruby intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@2022.1.0 ${PROJECT_RUBY_DEPS}" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby ############ @@ -51,5 +50,5 @@ intel_2022_1_0: intel_2022_1_0_mpi: variables: - SPEC: "~shared +openmp +mpi %intel@2022.1.0 ^mvapich2 ${PROJECT_RUBY_DEPS}" + SPEC: "~shared +openmp +mpi %intel@=2022.1.0 ^mvapich2 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index a3e03cdce..968d6c268 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -30,10 +30,10 @@ rocmcc_5_7_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga rocmcc_5_7_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga diff --git a/.uberenv_config.json b/.uberenv_config.json index 1568498cc..cb9f9a5dd 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "develop-2024-01-21", +"spack_branch": "develop-2024-02-18", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", From 76a41b8a9068e711d469fbd79bf7bd8375835860 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:58:08 +0100 Subject: [PATCH 278/454] Fix typo + Update syntax --- .gitlab/jobs/corona.yml | 3 ++- .gitlab/jobs/lassen.yml | 5 +++-- .gitlab/jobs/poodle.yml | 3 ++- .gitlab/jobs/ruby.yml | 3 ++- .gitlab/jobs/tioga.yml | 3 ++- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index bc9e0c7d1..8fec233c5 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -7,7 +7,8 @@ ############################################################################# # Override reproducer section to define project specific variables. 
-.corona_reproducer_vars: &corona_reproducer_vars +.corona_reproducer_vars: + script: - | echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index 7fdf16794..c6eacf864 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -7,7 +7,8 @@ ############################################################################## # Override reproducer section to define project specific variables. -.lassen_reproducer_vars: &lassen_reproducer_vars +.lassen_reproducer_vars: + script: - | echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" @@ -22,7 +23,7 @@ # Overriding shared spec: Longer allocation + extra flags xl_2022_08_19_gcc_8_3_1_cuda_11_2_0: variables: - SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl=@16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda cxxflags==\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}" MODULE_LIST: "cuda/11.2.0" LASSEN_JOB_ALLOC: "1 -W 60 -q pci" extends: .job_on_lassen diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index 9ba4d3f3d..ed18f60f5 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -7,7 +7,8 @@ ############################################################################## # Override reproducer section to define projet specific variables. -.poodle_reproducer_vars: &poodle_reproducer_vars +.poodle_reproducer_vars: + script: - | echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index c1d4e043c..3502ed3fb 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -7,7 +7,8 @@ ############################################################################## # Override reproducer section to define project specific variables. -.ruby_reproducer_vars: &ruby_reproducer_vars +.ruby_reproducer_vars: + script: - | echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 968d6c268..688d428c0 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -7,7 +7,8 @@ ############################################################################# # Override reproducer section to define project specific variables. -.tioga_reproducer_vars: &tioga_reproducer_vars +.tioga_reproducer_vars: + script: - | echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" From 1b0c7fca3df2d02715d6864ad0187073bf923e9f Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 16:33:39 +0100 Subject: [PATCH 279/454] Increase poodle ruby and lassen shared allocation --- .gitlab/custom-jobs-and-variables.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 90651210f..c5a207f91 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -15,7 +15,7 @@ variables: # Ruby # Arguments for top level allocation - RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=14 --nodes=2" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=20 --nodes=2" # Arguments for job level allocation # Note: We repeat the reservation, necessary when jobs are manually re-triggered. RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" @@ -26,7 +26,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=14 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=20 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle @@ -57,7 +57,7 @@ variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 16 -q pci" + LASSEN_JOB_ALLOC: "1 -W 20 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp cuda_arch=70" # Project specific deps for lassen From eb2dd8b9757250eaee129ab130bf967521e26f1c Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 21 Mar 2024 14:04:27 -0700 Subject: [PATCH 280/454] More code changes for consistency --- src/basic/DAXPY-Sycl.cpp | 52 +++------- src/basic/IF_QUAD-Sycl.cpp | 60 ++++-------- src/basic/INIT3-Sycl.cpp | 63 ++++--------- src/basic/INIT_VIEW1D-Sycl.cpp | 50 +++------- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 63 ++++--------- src/basic/MULADDSUB-Sycl.cpp | 63 ++++--------- src/basic/NESTED_INIT-Sycl.cpp | 69 ++++---------- src/basic/REDUCE3_INT-Sycl.cpp | 131 ++++++++------------------ src/basic/TRAP_INT-Sycl.cpp | 85 +++++------------ src/common/KernelBase.hpp | 8 +- src/stream/ADD-Sycl.cpp | 5 +- src/stream/COPY-Sycl.cpp | 1 - src/stream/DOT-Sycl.cpp | 46 ++++----- src/stream/MUL-Sycl.cpp | 3 +- src/stream/TRIAD-Sycl.cpp | 3 +- 15 files changed, 207 insertions(+), 495 deletions(-) diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index b0464f2ee..4a38ece00 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -33,57 +33,29 @@ void DAXPY::runSyclVariantImpl(VariantID vid) DAXPY_DATA_SETUP; if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>{global_size, work_group_size}, - [=] (sycl::nd_item<1> item ) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - DAXPY_BODY - } - - }); - }); - } - qu->wait(); // Wait for computation to finish before stopping timer - - stopTimer(); - } else { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - qu->submit([&] (sycl::handler& h) { - 
h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item) { + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { - Index_type i = item.get_id(0); + Index_type i = item.get_global_id(0); + if (i < iend) { DAXPY_BODY + } - }); }); - } - qu->wait(); // Wait for computation to finish before stopping timer - - stopTimer(); - + }); } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); } else if ( vid == RAJA_SYCL ) { - if ( work_group_size == 0 ) { - std::cout << "\n DAXPY : RAJA_SYCL does not support auto work group size" << std::endl; - return; - } - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 2fd6ae10b..6d989d513 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -32,55 +32,29 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) IF_QUAD_DATA_SETUP; if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - [=] (sycl::nd_item<1> item ) { - - Index_type i = item.get_global_id(0); - - if (i < iend) { - IF_QUAD_BODY - } - }); - }); - } - qu->wait(); // Wait for computation to finish before stopping timer - stopTimer(); - - } else { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item) { - - Index_type i = item.get_id(0); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + + if (i < iend) { IF_QUAD_BODY - - }); + } }); - } - qu->wait(); // Wait for computation to finish before stopping timer - stopTimer(); - + }); } + qu->wait(); // Wait for computation to finish before stopping timer + stopTimer(); } else if ( vid == RAJA_SYCL ) { - if ( work_group_size == 0 ) { - std::cout << "\n IF_QUAD : RAJA_SYCL does not support auto work group size" << std::endl; - return; - } - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 1125d546d..7ba60ff3a 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -32,55 +32,28 @@ void INIT3::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - [=] (sycl::nd_item<1> item ) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - INIT3_BODY - } - - }); - }); - - } - qu->wait(); - stopTimer(); - - } else { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item ) { - - Index_type i = 
item.get_id(0); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { INIT3_BODY - - }); + } + }); - - } - qu->wait(); - stopTimer(); - - } - } else if ( vid == RAJA_SYCL ) { + }); - if ( work_group_size == 0 ) { - std::cout << "\n INIT3 : RAJA_SYCL does not support auto work group size" << std::endl; - return; } + qu->wait(); + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index afb0abbd4..fb2a18f7f 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -33,54 +33,28 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - qu->submit([&] (sycl::handler& h) { + qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>{global_size, work_group_size}, - [=] (sycl::nd_item<1> item ) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { - Index_type i = item.get_global_id(0); - if (i < iend) { - INIT_VIEW1D_BODY - } - }); - }); - } - qu->wait(); - stopTimer(); - - } else { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item) { - - Index_type i = item.get_id(0); + Index_type i = item.get_global_id(0); + if (i < iend) { INIT_VIEW1D_BODY - - }); + } }); - } - qu->wait(); - stopTimer(); - + }); } + qu->wait(); + stopTimer(); } else if ( vid == RAJA_SYCL ) { - if ( work_group_size == 0 ) { - std::cout << "\n INIT_VIEW1D : RAJA_SYCL does not support auto work group size" << std::endl; - return; - } - INIT_VIEW1D_VIEW_RAJA; startTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index 21b2a6fd4..2764d3089 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -32,55 +32,28 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend-ibegin, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - [=] (sycl::nd_item<1> item ) { - - Index_type i = ibegin + item.get_global_id(0); - if (i < iend) { - INIT_VIEW1D_OFFSET_BODY - } - - }); - }); - - } - qu->wait(); - stopTimer(); - - } else { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend-ibegin), - [=] (sycl::item<1> item ) { - - Index_type i = ibegin + item.get_id(0); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend-ibegin, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = ibegin + item.get_global_id(0); + if (i < iend) { INIT_VIEW1D_OFFSET_BODY - - }); + } + }); - - } - qu->wait(); - stopTimer(); - - } - } else if ( vid == RAJA_SYCL ) { + }); - if ( work_group_size == 0 ) { - std::cout << "\n INIT_VIEW1D_OFFSET : RAJA_SYCL does not support auto work group size" << std::endl; - return; } + qu->wait(); + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index 27e0113c9..0185dc7ba 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -32,55 +32,28 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - [=] (sycl::nd_item<1> item ) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - MULADDSUB_BODY - } - - }); - }); - - } - qu->wait(); - stopTimer(); - - } else { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<1>(iend), - [=] (sycl::item<1> item ) { - - Index_type i = item.get_id(0); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { MULADDSUB_BODY - - }); + } + }); - - } - qu->wait(); - stopTimer(); - - } - } else if ( vid == RAJA_SYCL ) { + }); - if ( work_group_size == 0 ) { - std::cout << "\n MULADDSUB : RAJA_SYCL does not support auto work group size" << std::endl; - return; } + qu->wait(); + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index f7e8b87a1..f0ffc022e 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -37,64 +37,33 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size > 0) { - - sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_block_sz), - RAJA_DIVIDE_CEILING_INT(nj, j_block_sz), - RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)); - sycl::range<3> wkgroup_dim(k_block_sz, j_block_sz, i_block_sz); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (cl::sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), - [=] (sycl::nd_item<3> item) { - - Index_type i = item.get_global_id(2); - Index_type j = item.get_global_id(1); - Index_type k = item.get_global_id(0); - - if (i < ni && j < nj && k < nk) { - NESTED_INIT_BODY - } - }); - }); - - } - qu->wait(); - stopTimer(); + sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_block_sz), + 
RAJA_DIVIDE_CEILING_INT(nj, j_block_sz), + RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)); + sycl::range<3> wkgroup_dim(k_block_sz, j_block_sz, i_block_sz); - } else { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::range<3> (nk, nj, ni), - [=] (sycl::item<3> item) { + qu->submit([&] (cl::sycl::handler& h) { + h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_id(2); - Index_type j = item.get_id(1); - Index_type k = item.get_id(0); + Index_type i = item.get_global_id(2); + Index_type j = item.get_global_id(1); + Index_type k = item.get_global_id(0); + if (i < ni && j < nj && k < nk) { NESTED_INIT_BODY - - }); + } }); - - } - qu->wait(); - stopTimer(); - - } - - } else if ( vid == RAJA_SYCL ) { + }); - if ( work_group_size == 0 ) { - std::cout << "\n NESTED_INIT : RAJA_SYCL does not support auto work group size" << std::endl; - return; } + qu->wait(); + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { using EXEC_POL = RAJA::KernelPolicy< diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index eabe80120..448e35d30 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -47,114 +47,59 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) REDUCE3_INT_DATA_SETUP_SYCL; - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - initSyclDeviceData(hsum, &m_vsum_init, 1, qu); - initSyclDeviceData(hmin, &m_vmin_init, 1, qu); - initSyclDeviceData(hmax, &m_vmax_init, 1, qu); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - qu->submit([&] (sycl::handler& h) { + initSyclDeviceData(hsum, &m_vsum_init, 1, qu); + initSyclDeviceData(hmin, &m_vmin_init, 1, qu); + initSyclDeviceData(hmax, &m_vmax_init, 1, qu); - auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); - auto min_reduction = sycl::reduction(hmin, sycl::minimum<>()); - auto max_reduction = sycl::reduction(hmax, sycl::maximum<>()); + qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - sum_reduction, min_reduction, max_reduction, - [=] (sycl::nd_item<1> item, auto& vsum, auto& vmin, auto& vmax) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - // REDUCE3_INT_BODY - vsum += vec[i]; - vmin.combine(vec[i]); - vmax.combine(vec[i]); - } - - }); - }); - - Int_type lsum; - Int_ptr plsum = &lsum; - getSyclDeviceData(plsum, hsum, 1, qu); - m_vsum += lsum; - - Int_type lmin; - Int_ptr plmin = &lmin; - getSyclDeviceData(plmin, hmin, 1, qu); - m_vmin = RAJA_MIN(m_vmin, lmin); - - Int_type lmax; - Int_ptr plmax = &lmax; - getSyclDeviceData(plmax, hmax, 1, qu); - m_vmax = RAJA_MAX(m_vmax, lmax); - - } // for (RepIndex_type irep = ... 
- qu->wait(); - stopTimer(); - - } else { // work_group_size <= 0 - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); + auto min_reduction = sycl::reduction(hmin, sycl::minimum<>()); + auto max_reduction = sycl::reduction(hmax, sycl::maximum<>()); - initSyclDeviceData(hsum, &m_vsum_init, 1, qu); - initSyclDeviceData(hmin, &m_vmin_init, 1, qu); - initSyclDeviceData(hmax, &m_vmax_init, 1, qu); + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, min_reduction, max_reduction, + [=] (sycl::nd_item<1> item, auto& vsum, auto& vmin, auto& vmax) { - qu->submit([&] (sycl::handler& h) { + Index_type i = item.get_global_id(0); + if (i < iend) { + // REDUCE3_INT_BODY + vsum += vec[i]; + vmin.combine(vec[i]); + vmax.combine(vec[i]); + } - auto sum_reduction = sycl::reduction(hsum, sycl::plus<>()); - auto min_reduction = sycl::reduction(hmin, sycl::minimum<>()); - auto max_reduction = sycl::reduction(hmax, sycl::maximum<>()); + }); + }); - h.parallel_for(sycl::range<1>(iend), - sum_reduction, min_reduction, max_reduction, - [=] (sycl::item<1> item, auto& vsum, auto& vmin, auto& vmax ) { - - Index_type i = item.get_id(0); - vsum += vec[i]; - vmin.combine(vec[i]); - vmax.combine(vec[i]); + Int_type lsum; + Int_ptr plsum = &lsum; + getSyclDeviceData(plsum, hsum, 1, qu); + m_vsum += lsum; - }); - }); + Int_type lmin; + Int_ptr plmin = &lmin; + getSyclDeviceData(plmin, hmin, 1, qu); + m_vmin = RAJA_MIN(m_vmin, lmin); - Int_type lsum; - Int_ptr plsum = &lsum; - getSyclDeviceData(plsum, hsum, 1, qu); - m_vsum += lsum; - - Int_type lmin; - Int_ptr plmin = &lmin; - getSyclDeviceData(plmin, hmin, 1, qu); - m_vmin = RAJA_MIN(m_vmin, lmin); - - Int_type lmax; - Int_ptr plmax = &lmax; - getSyclDeviceData(plmax, hmax, 1, qu); - m_vmax = RAJA_MAX(m_vmax, lmax); - - } - qu->wait(); - stopTimer(); - - } + Int_type lmax; + Int_ptr plmax = &lmax; + getSyclDeviceData(plmax, hmax, 1, qu); + m_vmax = RAJA_MAX(m_vmax, lmax); + } // for (RepIndex_type irep = ... 
+ qu->wait(); + stopTimer(); + REDUCE3_INT_DATA_TEARDOWN_SYCL; } else if ( vid == RAJA_SYCL ) { - if ( work_group_size == 0 ) { - std::cout << "\n REDUCE3_INT : RAJA_SYCL does not support auto work group size" << std::endl; - return; - } - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index 3db1d183d..b9caf2dad 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -49,82 +49,43 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) Real_ptr sumx; allocAndInitSyclDeviceData(sumx, &m_sumx_init, 1, qu); - - if (work_group_size > 0) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - initSyclDeviceData(sumx, &m_sumx_init, 1, qu); - - qu->submit([&] (sycl::handler& hdl) { - - auto sum_reduction = sycl::reduction(sumx, sycl::plus<>()); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), - sum_reduction, - [=] (sycl::nd_item<1> item, auto& sumx) { - - Index_type i = item.get_global_id(0); - if (i < iend) { - TRAP_INT_BODY - } - - }); - }); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - Real_type lsumx; - Real_ptr plsumx = &lsumx; - getSyclDeviceData(plsumx, sumx, 1, qu); - m_sumx += lsumx * h; - - } - qu->wait(); - stopTimer(); - - } else { + initSyclDeviceData(sumx, &m_sumx_init, 1, qu); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initSyclDeviceData(sumx, &m_sumx_init, 1, qu); + qu->submit([&] (sycl::handler& hdl) { - qu->submit([&] (sycl::handler& hdl) { + auto sum_reduction = sycl::reduction(sumx, sycl::plus<>()); - auto sum_reduction = sycl::reduction(sumx, sycl::plus<>()); + hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sum_reduction, + [=] (sycl::nd_item<1> item, auto& sumx) { - hdl.parallel_for(sycl::range<1>(iend), - sum_reduction, - [=] (sycl::item<1> item, auto& sumx ) { - - Index_type i = item.get_id(0); + Index_type i = item.get_global_id(0); + if (i < iend) { TRAP_INT_BODY - - }); + } + }); + }); - Real_type lsumx; - Real_ptr plsumx = &lsumx; - getSyclDeviceData(plsumx, sumx, 1, qu); - m_sumx += lsumx * h; - - } - qu->wait(); - stopTimer(); - - } + Real_type lsumx; + Real_ptr plsumx = &lsumx; + getSyclDeviceData(plsumx, sumx, 1, qu); + m_sumx += lsumx * h; - deallocSyclDeviceData(sumx, qu); \ + } + qu->wait(); + stopTimer(); + + deallocSyclDeviceData(sumx, qu); } else if ( vid == RAJA_SYCL ) { - if ( work_group_size == 0 ) { - std::cout << "\n TRAP_INT : RAJA_SYCL does not support auto work group size" << std::endl; - return; - } - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 1c3c8626f..850893d93 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -252,9 +252,11 @@ class KernelBase #if defined(RAJA_ENABLE_SYCL) camp::resources::Sycl getSyclResource() { -/* if (run_params.getGPUStream() == 0) { - return camp::resources::Cuda::CudaFromStream(0); - }*/ + /* + if (run_params.getGPUStream() == 0) { + return camp::resources::Sycl::SyclFromStream(0); + } + */ return camp::resources::Sycl::get_default(); } #endif diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp index 1fd1ee166..69bfb472e 100644 --- a/src/stream/ADD-Sycl.cpp +++ 
b/src/stream/ADD-Sycl.cpp @@ -40,8 +40,8 @@ void ADD::runSyclVariantImpl(VariantID vid) const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -51,7 +51,6 @@ void ADD::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp index 978cff141..703add0fb 100644 --- a/src/stream/COPY-Sycl.cpp +++ b/src/stream/COPY-Sycl.cpp @@ -49,7 +49,6 @@ void COPY::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp index 113a605e9..08f82bcb1 100644 --- a/src/stream/DOT-Sycl.cpp +++ b/src/stream/DOT-Sycl.cpp @@ -33,40 +33,40 @@ void DOT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - if (work_group_size != 0) { + Real_ptr dot; + allocAndInitSyclDeviceData(dot, &m_dot_init, 1, qu); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type dot = m_dot_init; - - { - sycl::buffer buf_dot(&dot, 1); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + initSyclDeviceData(dot, &m_dot_init, 1, qu); - qu->submit([&] (sycl::handler& h) { + qu->submit([&] (sycl::handler& h) { - auto sumReduction = reduction(buf_dot, h, sycl::plus()); + auto sumReduction = sycl::reduction(dot, sycl::plus()); - h.parallel_for(sycl::nd_range<1>{global_size, work_group_size}, - sumReduction, - [=] (sycl::nd_item<1> item, auto& dot) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sumReduction, + [=] (sycl::nd_item<1> item, auto& dot) { - Index_type i = item.get_global_id(0); - if (i < iend) { - DOT_BODY; - } + Index_type i = item.get_global_id(0); + if (i < iend) { + DOT_BODY; + } - }); - }); - } + }); + }); - m_dot += dot; + Real_type ldot; + Real_ptr pldot = &ldot; + getSyclDeviceData(pldot, dot, 1, qu); + m_dot += ldot; - } - stopTimer(); } + qu->wait(); + stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index 8e2a77d01..e234712ba 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -39,7 +39,7 @@ void MUL::runSyclVariantImpl(VariantID vid) qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), - [=] (sycl::nd_item<1> item) { + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -48,7 +48,6 @@ void MUL::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index 3a5e40e31..851aa56d0 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -39,7 +39,7 @@ void TRIAD::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), - [=] (sycl::nd_item<1> item) { + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -49,7 +49,6 @@ void TRIAD::runSyclVariantImpl(VariantID vid) 
}); }); } - qu->wait(); stopTimer(); From 33228aefbb31e58ade78876ac00108d6bc335c4a Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Mon, 25 Mar 2024 09:30:54 -0700 Subject: [PATCH 281/454] Code organization changes for consistency --- src/basic/DAXPY.cpp | 5 ++--- src/basic/IF_QUAD.cpp | 4 ++-- src/basic/INIT3.cpp | 4 ++-- src/basic/INIT_VIEW1D.cpp | 3 +-- src/basic/INIT_VIEW1D_OFFSET.cpp | 4 ++-- src/basic/MULADDSUB.cpp | 4 ++-- src/basic/NESTED_INIT-Sycl.cpp | 20 ++++++++++---------- src/basic/NESTED_INIT.cpp | 4 ++-- src/basic/REDUCE3_INT.cpp | 4 ++-- src/basic/TRAP_INT.cpp | 4 ++-- src/stream/ADD.cpp | 3 +-- src/stream/COPY.cpp | 4 ++-- src/stream/DOT.cpp | 4 ++-- src/stream/MUL.cpp | 4 ++-- src/stream/TRIAD.cpp | 4 ++-- 15 files changed, 36 insertions(+), 39 deletions(-) diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index b0fe6985f..c324a8df5 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -52,11 +52,10 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } DAXPY::~DAXPY() diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index bf96ddd0d..70c90adc5 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -56,10 +56,10 @@ IF_QUAD::IF_QUAD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } IF_QUAD::~IF_QUAD() diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 3527b3bdd..902de8ec6 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -52,10 +52,10 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } INIT3::~INIT3() diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 6572cbdc6..3c101f6ce 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -53,11 +53,10 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } INIT_VIEW1D::~INIT_VIEW1D() diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 56776f4ec..aa89b6112 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -53,10 +53,10 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 8bc3e9426..09a310275 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -52,10 +52,10 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } MULADDSUB::~MULADDSUB() diff 
--git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index f0ffc022e..b4dd7d594 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -24,9 +24,9 @@ namespace basic // // Define work-group shape for SYCL execution // -#define i_block_sz (32) -#define j_block_sz (work_group_size / i_block_sz) -#define k_block_sz (1) +#define i_wg_sz (32) +#define j_wg_sz (work_group_size / i_wg_sz) +#define k_wg_sz (1) template void NESTED_INIT::runSyclVariantImpl(VariantID vid) @@ -37,10 +37,10 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_block_sz), - RAJA_DIVIDE_CEILING_INT(nj, j_block_sz), - RAJA_DIVIDE_CEILING_INT(ni, i_block_sz)); - sycl::range<3> wkgroup_dim(k_block_sz, j_block_sz, i_block_sz); + sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_wg_sz), + RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz), + RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz)); + sycl::range<3> wkgroup_dim(k_wg_sz, j_wg_sz, i_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -68,9 +68,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< - RAJA::statement::For<2, RAJA::sycl_global_0, - RAJA::statement::For<1, RAJA::sycl_global_1, - RAJA::statement::For<0, RAJA::sycl_global_2, + RAJA::statement::For<2, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index a3d52c023..f827bd7f4 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -63,10 +63,10 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } NESTED_INIT::~NESTED_INIT() diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 0766eccf9..670209336 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -57,10 +57,10 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } REDUCE3_INT::~REDUCE3_INT() diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 7d0ce8247..859a72815 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -52,10 +52,10 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } TRAP_INT::~TRAP_INT() diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 473f74f4e..e3b96ae62 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -53,11 +53,10 @@ ADD::ADD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } ADD::~ADD() diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 066ebf2a7..997dbfecf 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -53,10 +53,10 @@ COPY::COPY(const 
RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } COPY::~COPY() diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 85e933fb6..9a615b583 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -53,10 +53,10 @@ DOT::DOT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } DOT::~DOT() diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 796bdb4e8..eee8a69a4 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -53,10 +53,10 @@ MUL::MUL(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } MUL::~MUL() diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 43443ba3f..472dcdbd8 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -57,10 +57,10 @@ TRIAD::TRIAD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda ); - setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); + + setVariantDefined( Kokkos_Lambda ); } TRIAD::~TRIAD() From 0f9a0634cafc6de3c65284e0c744c741369222ff Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Mon, 25 Mar 2024 10:26:17 -0700 Subject: [PATCH 282/454] Work through remaining initial impls of first batch of Sycl variants --- src/apps/CMakeLists.txt | 6 ++ src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 17 +----- src/apps/DEL_DOT_VEC_2D.cpp | 1 - src/apps/DEL_DOT_VEC_2D.hpp | 3 +- src/apps/ENERGY-Sycl.cpp | 98 +++++++++----------------------- src/apps/ENERGY.cpp | 3 + src/apps/ENERGY.hpp | 5 ++ src/apps/FIR-Sycl.cpp | 25 ++++---- src/apps/FIR.cpp | 3 + src/apps/FIR.hpp | 5 ++ src/apps/LTIMES-Sycl.cpp | 60 +++++++++---------- src/apps/LTIMES.cpp | 3 + src/apps/LTIMES.hpp | 5 ++ src/apps/LTIMES_NOVIEW-Sycl.cpp | 60 +++++++++---------- src/apps/LTIMES_NOVIEW.cpp | 3 + src/apps/LTIMES_NOVIEW.hpp | 5 ++ src/apps/PRESSURE-Sycl.cpp | 54 +++++------------- src/apps/PRESSURE.cpp | 3 + src/apps/PRESSURE.hpp | 5 ++ src/apps/VOL3D-Sycl.cpp | 60 +++++-------------- src/apps/VOL3D.cpp | 3 + src/apps/VOL3D.hpp | 5 ++ src/basic/NESTED_INIT-Sycl.cpp | 2 +- 23 files changed, 181 insertions(+), 253 deletions(-) diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 697aa0912..c612ac55f 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -40,24 +40,28 @@ blt_add_library( ENERGY-Cuda.cpp ENERGY-OMP.cpp ENERGY-OMPTarget.cpp + ENERGY-Sycl.cpp FIR.cpp FIR-Seq.cpp FIR-Hip.cpp FIR-Cuda.cpp FIR-OMP.cpp FIR-OMPTarget.cpp + FIR-Sycl.cpp LTIMES.cpp LTIMES-Seq.cpp LTIMES-Hip.cpp LTIMES-Cuda.cpp LTIMES-OMP.cpp LTIMES-OMPTarget.cpp + LTIMES-Sycl.cpp LTIMES_NOVIEW.cpp LTIMES_NOVIEW-Seq.cpp LTIMES_NOVIEW-Hip.cpp LTIMES_NOVIEW-Cuda.cpp LTIMES_NOVIEW-OMP.cpp LTIMES_NOVIEW-OMPTarget.cpp + LTIMES_NOVIEW-Sycl.cpp MASS3DEA.cpp MASS3DEA-Cuda.cpp MASS3DEA-Hip.cpp @@ -82,12 +86,14 @@ blt_add_library( PRESSURE-Cuda.cpp PRESSURE-OMP.cpp PRESSURE-OMPTarget.cpp + PRESSURE-Sycl.cpp VOL3D.cpp VOL3D-Seq.cpp VOL3D-Hip.cpp VOL3D-Cuda.cpp VOL3D-OMP.cpp VOL3D-OMPTarget.cpp + VOL3D-Sycl.cpp ZONAL_ACCUMULATION_3D.cpp ZONAL_ACCUMULATION_3D-Seq.cpp 
ZONAL_ACCUMULATION_3D-Hip.cpp diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp index 83a20ed3e..9754dca6c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -27,6 +27,7 @@ template void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; auto res{getSyclResource()}; @@ -34,12 +35,6 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) DEL_DOT_VEC_2D_DATA_SETUP; if ( vid == Base_SYCL ) { - if (work_group_size != 0) { - -/* NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;*/ startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -60,19 +55,11 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); - } } else if ( vid == RAJA_SYCL ) { - -/* NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ;*/ - - //RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones, sycl_res); RAJA::TypedListSegment zones(real_zones, iend, res, RAJA::Unowned); diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index f67e21049..1d211daf3 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -65,7 +65,6 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setVariantDefined( Base_SYCL ); setVariantDefined( RAJA_SYCL ); - } DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 02f54d81e..d7c0d20f6 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -123,11 +123,12 @@ class DEL_DOT_VEC_2D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index d94907963..1e5fd8d2c 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -22,49 +21,8 @@ namespace rajaperf namespace apps { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define ENERGY_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(e_new, m_e_new, iend, qu); \ - allocAndInitSyclDeviceData(e_old, m_e_old, iend, qu); \ - allocAndInitSyclDeviceData(delvc, m_delvc, iend, qu); \ - allocAndInitSyclDeviceData(p_new, m_p_new, iend, qu); \ - allocAndInitSyclDeviceData(p_old, m_p_old, iend, qu); \ - allocAndInitSyclDeviceData(q_new, m_q_new, iend, qu); \ - allocAndInitSyclDeviceData(q_old, m_q_old, iend, qu); \ - allocAndInitSyclDeviceData(work, m_work, iend, qu); \ - allocAndInitSyclDeviceData(compHalfStep, m_compHalfStep, iend, qu); \ - allocAndInitSyclDeviceData(pHalfStep, m_pHalfStep, iend, qu); \ - allocAndInitSyclDeviceData(bvc, m_bvc, iend, qu); \ - allocAndInitSyclDeviceData(pbvc, m_pbvc, iend, qu); \ 
- allocAndInitSyclDeviceData(ql_old, m_ql_old, iend, qu); \ - allocAndInitSyclDeviceData(qq_old, m_qq_old, iend, qu); \ - allocAndInitSyclDeviceData(vnewc, m_vnewc, iend, qu); - -#define ENERGY_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_e_new, e_new, iend, qu); \ - getSyclDeviceData(m_q_new, q_new, iend, qu); \ - deallocSyclDeviceData(e_new, qu); \ - deallocSyclDeviceData(e_old, qu); \ - deallocSyclDeviceData(delvc, qu); \ - deallocSyclDeviceData(p_new, qu); \ - deallocSyclDeviceData(p_old, qu); \ - deallocSyclDeviceData(q_new, qu); \ - deallocSyclDeviceData(q_old, qu); \ - deallocSyclDeviceData(work, qu); \ - deallocSyclDeviceData(compHalfStep, qu); \ - deallocSyclDeviceData(pHalfStep, qu); \ - deallocSyclDeviceData(bvc, qu); \ - deallocSyclDeviceData(pbvc, qu); \ - deallocSyclDeviceData(ql_old, qu); \ - deallocSyclDeviceData(qq_old, qu); \ - deallocSyclDeviceData(vnewc, qu); - -void ENERGY::runSyclVariant(VariantID vid) +template +void ENERGY::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -75,18 +33,18 @@ void ENERGY::runSyclVariant(VariantID vid) using sycl::sqrt; using sycl::fabs; - if ( vid == Base_SYCL ) { + auto res{getSyclResource()}; - ENERGY_DATA_SETUP_SYCL; + if ( vid == Base_SYCL ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -97,8 +55,8 @@ void ENERGY::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -109,8 +67,8 @@ void ENERGY::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -120,8 +78,8 @@ void ENERGY::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -132,8 +90,8 @@ void ENERGY::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -145,8 +103,8 @@ void ENERGY::runSyclVariant(VariantID vid) qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if(i < iend) { @@ -156,15 +114,11 @@ void 
ENERGY::runSyclVariant(VariantID vid) }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); - ENERGY_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - ENERGY_DATA_SETUP_SYCL; - const bool async = true; startTimer(); @@ -172,32 +126,32 @@ void ENERGY::runSyclVariant(VariantID vid) RAJA::region( [=]() { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY2; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY3; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY4; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY5; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY6; }); @@ -208,13 +162,13 @@ void ENERGY::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - ENERGY_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n ENERGY : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ENERGY, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index f24072f8b..bf5e35c5e 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -62,6 +62,9 @@ ENERGY::ENERGY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } ENERGY::~ENERGY() diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 8cee73a4c..ba5b69949 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -203,13 +203,18 @@ class ENERGY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index eabd9e78e..81c965729 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -15,7 +15,6 @@ #include #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -23,11 +22,6 @@ namespace rajaperf namespace apps { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - #define FIR_DATA_SETUP_SYCL \ Real_ptr coeff; \ \ @@ -36,14 +30,15 @@ namespace apps Real_ptr tcoeff = &coeff_array[0]; \ allocAndInitSyclDeviceData(coeff, tcoeff, FIR_COEFFLEN, qu); - #define FIR_DATA_TEARDOWN_SYCL \ getSyclDeviceData(m_out, out, getActualProblemSize(), qu); \ deallocSyclDeviceData(in, qu); \ deallocSyclDeviceData(out, qu); \ deallocSyclDeviceData(coeff, qu); -void FIR::runSyclVariant(VariantID 
vid) + +template +void FIR::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -51,6 +46,8 @@ void FIR::runSyclVariant(VariantID vid) FIR_DATA_SETUP; + auto res{getSyclResource()}; + if ( vid == Base_SYCL ) { FIR_COEFF; @@ -60,11 +57,11 @@ void FIR::runSyclVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -74,7 +71,7 @@ void FIR::runSyclVariant(VariantID vid) }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); FIR_DATA_TEARDOWN_SYCL; @@ -88,7 +85,7 @@ void FIR::runSyclVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { FIR_BODY; }); @@ -104,6 +101,8 @@ void FIR::runSyclVariant(VariantID vid) } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIR, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 2700bb487..96cc28296 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -56,6 +56,9 @@ FIR::FIR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } FIR::~FIR() diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 1686e3250..72968045f 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -78,13 +78,18 @@ class FIR : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index d88082bf3..36e87a58a 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -22,18 +21,15 @@ namespace rajaperf namespace apps { -#define LTIMES_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(phidat, m_phidat, m_philen, qu); \ - allocAndInitSyclDeviceData(elldat, m_elldat, m_elllen, qu); \ - allocAndInitSyclDeviceData(psidat, m_psidat, m_psilen, qu); - -#define LTIMES_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_phidat, phidat, m_philen, qu); \ - deallocSyclDeviceData(phidat, qu); \ - deallocSyclDeviceData(elldat, qu); \ - deallocSyclDeviceData(psidat, qu); +// +// Define work-group shape for SYCL execution +// +#define m_wg_sz (32) +#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) 
+#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) -void LTIMES::runSyclVariant(VariantID vid) +template +void LTIMES::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -41,45 +37,45 @@ void LTIMES::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - LTIMES_DATA_SETUP_SYCL; + sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( - sycl::range<3>(num_z, num_g, num_m), - sycl::range<3>(1,1,1)), - [=] (sycl::nd_item<3> item) { + h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type z = item.get_global_id(0); - Index_type g = item.get_global_id(1); Index_type m = item.get_global_id(2); + Index_type g = item.get_global_id(1); + Index_type z = item.get_global_id(0); - for (Index_type d = 0; d < num_d; ++d) { - LTIMES_BODY + if (m < num_m && g < num_g && z < num_z) { + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_BODY; + } } + }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); - LTIMES_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - LTIMES_DATA_SETUP_SYCL; - LTIMES_VIEWS_RANGES_RAJA; using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernel< - RAJA::statement::For<1, RAJA::sycl_global_2<1>, //z - RAJA::statement::For<2, RAJA::sycl_global_1<1>, //g - RAJA::statement::For<3, RAJA::sycl_global_0<1>, //m - RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::For<1, RAJA::sycl_global_2, //z + RAJA::statement::For<2, RAJA::sycl_global_1, //g + RAJA::statement::For<3, RAJA::sycl_global_0, //m + RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > > @@ -103,13 +99,13 @@ void LTIMES::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - LTIMES_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n LTIMES : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 4db36b287..83ba52774 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -77,6 +77,9 @@ LTIMES::LTIMES(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES::~LTIMES() diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index be270804e..0e94e8d0c 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -116,13 +116,18 @@ class LTIMES : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; 
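Note on the pattern introduced above: the LTIMES work-group shape is derived from a single flat work_group_size tuning parameter — the innermost (m) dimension is pinned to 32, the remaining work-items are split across the g and z dimensions with the suite's greater/lesser squarest-factor-pair helpers, and each global dimension is rounded up to a whole multiple of its work-group extent. The sketch below is a minimal, self-contained illustration of that decomposition, not code from the suite: the local squarest_factor_pair() helper, the extents ni/nj/nk, and the kernel body are invented for the example.

#include <sycl/sycl.hpp>
#include <cmath>
#include <cstddef>
#include <utility>

// Factor n into (lesser, greater) with lesser * greater == n and both
// factors as close to sqrt(n) as possible.
static std::pair<std::size_t, std::size_t> squarest_factor_pair(std::size_t n)
{
  std::size_t a = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  while (n % a != 0) { --a; }  // a >= 1, and 1 always divides n
  return {a, n / a};
}

int main()
{
  constexpr std::size_t work_group_size = 256;   // flat tuning parameter
  const std::size_t ni = 100, nj = 37, nk = 41;  // illustrative extents

  // Pin the innermost dimension (the patch pins m_wg_sz to 32; this assumes
  // it divides work_group_size) and split the rest across the outer two.
  const std::size_t i_wg = 32;
  const auto p = squarest_factor_pair(work_group_size / i_wg);
  const std::size_t k_wg = p.first;   // lesser factor -> outermost dimension
  const std::size_t j_wg = p.second;  // greater factor -> middle dimension

  auto ceil_div = [](std::size_t x, std::size_t y) { return (x + y - 1) / y; };

  // The global range must be a whole multiple of the work-group shape, so
  // each dimension is rounded up independently.
  sycl::range<3> wkgroup_dim(k_wg, j_wg, i_wg);
  sycl::range<3> global_dim(k_wg * ceil_div(nk, k_wg),
                            j_wg * ceil_div(nj, j_wg),
                            i_wg * ceil_div(ni, i_wg));

  sycl::queue q;
  double* a = sycl::malloc_device<double>(ni * nj * nk, q);

  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<3>(global_dim, wkgroup_dim),
                   [=](sycl::nd_item<3> item) {
      const std::size_t k = item.get_global_id(0);
      const std::size_t j = item.get_global_id(1);
      const std::size_t i = item.get_global_id(2);
      if (i < ni && j < nj && k < nk) {  // mask the rounded-up overhang
        a[i + ni * (j + nj * k)] = 1.0e-8 * static_cast<double>(i * j * k);
      }
    });
  });
  q.wait();  // synchronize before stopping a timer, as the suite does

  sycl::free(a, q);
  return 0;
}

Rounding the global range up is what makes the explicit "if (m < num_m && g < num_g && z < num_z)" guard in the patched kernels necessary: work-items in the overhang must do nothing. It also replaces the earlier degenerate launch with sycl::range<3>(1,1,1) work-groups, which limited every work-group to a single work-item.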
diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index 4d50bfc58..d7202e450 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -22,19 +21,15 @@ namespace rajaperf namespace apps { -#define LTIMES_NOVIEW_DATA_SETUP_SYCL \ -\ - allocAndInitSyclDeviceData(phidat, m_phidat, m_philen, qu); \ - allocAndInitSyclDeviceData(elldat, m_elldat, m_elllen, qu); \ - allocAndInitSyclDeviceData(psidat, m_psidat, m_psilen, qu); - -#define LTIMES_NOVIEW_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_phidat, phidat, m_philen, qu); \ - deallocSyclDeviceData(phidat, qu); \ - deallocSyclDeviceData(elldat, qu); \ - deallocSyclDeviceData(psidat, qu); +// +// Define work-group shape for SYCL execution +// +#define m_wg_sz (32) +#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) +#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) -void LTIMES_NOVIEW::runSyclVariant(VariantID vid) +template +void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -42,44 +37,43 @@ void LTIMES_NOVIEW::runSyclVariant(VariantID vid) if ( vid == Base_SYCL ) { - LTIMES_NOVIEW_DATA_SETUP_SYCL; + sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( - sycl::range<3>(num_m, num_g, num_z), - sycl::range<3>(1, 1, 1)), - [=] (sycl::nd_item<3> item) { + h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type z = item.get_global_id(2); + Index_type m = item.get_global_id(2); Index_type g = item.get_global_id(1); - Index_type m = item.get_global_id(0); + Index_type z = item.get_global_id(0); - for (Index_type d = 0; d < num_d; ++d) { - LTIMES_NOVIEW_BODY + if (m < num_m && g < num_g && z < num_z) { + for (Index_type d = 0; d < num_d; ++d) { + LTIMES_NOVIEW_BODY; + } } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); - LTIMES_NOVIEW_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - LTIMES_NOVIEW_DATA_SETUP_SYCL; - using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernel< - RAJA::statement::For<1, RAJA::sycl_global_2<1>, //z - RAJA::statement::For<2, RAJA::sycl_global_1<1>, //g - RAJA::statement::For<3, RAJA::sycl_global_0<1>, //m - RAJA::statement::For<0, RAJA::seq_exec, //d + RAJA::statement::For<1, RAJA::sycl_global_2, //z + RAJA::statement::For<2, RAJA::sycl_global_1, //g + RAJA::statement::For<3, RAJA::sycl_global_0, //m + RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > > @@ -103,13 +97,13 @@ void LTIMES_NOVIEW::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - LTIMES_NOVIEW_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n LTIMES_NOVIEW : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(LTIMES_NOVIEW, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 22c7bf43e..0154ad060 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -76,6 +76,9 @@ 
LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } LTIMES_NOVIEW::~LTIMES_NOVIEW() diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index b85a96497..ddd8c9ade 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -66,13 +66,18 @@ class LTIMES_NOVIEW : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp index c27054e04..8f0c1c4a7 100644 --- a/src/apps/PRESSURE-Sycl.cpp +++ b/src/apps/PRESSURE-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -22,48 +21,29 @@ namespace rajaperf namespace apps { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define PRESSURE_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(compression, m_compression, iend, qu); \ - allocAndInitSyclDeviceData(bvc, m_bvc, iend, qu); \ - allocAndInitSyclDeviceData(p_new, m_p_new, iend, qu); \ - allocAndInitSyclDeviceData(e_old, m_e_old, iend, qu); \ - allocAndInitSyclDeviceData(vnewc, m_vnewc, iend, qu); - -#define PRESSURE_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_p_new, p_new, iend, qu); \ - deallocSyclDeviceData(compression, qu); \ - deallocSyclDeviceData(bvc, qu); \ - deallocSyclDeviceData(p_new, qu); \ - deallocSyclDeviceData(e_old, qu); \ - deallocSyclDeviceData(vnewc, qu); - -void PRESSURE::runSyclVariant(VariantID vid) +template +void PRESSURE::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); PRESSURE_DATA_SETUP; + using sycl::fabs; - if ( vid == Base_SYCL ) { + auto res{getSyclResource()}; - PRESSURE_DATA_SETUP_SYCL; + if ( vid == Base_SYCL ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -74,8 +54,8 @@ void PRESSURE::runSyclVariant(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { @@ -86,15 +66,11 @@ void PRESSURE::runSyclVariant(VariantID vid) }); } - qu->wait(); // Wait for computation to finish before stopping timer + 
qu->wait(); stopTimer(); - PRESSURE_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - PRESSURE_DATA_SETUP_SYCL; - const bool async = true; startTimer(); @@ -102,12 +78,12 @@ void PRESSURE::runSyclVariant(VariantID vid) RAJA::region( [=]() { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PRESSURE_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PRESSURE_BODY2; }); @@ -118,13 +94,13 @@ void PRESSURE::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - PRESSURE_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n PRESSURE : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PRESSURE, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 2e344b843..a37130072 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -52,6 +52,9 @@ PRESSURE::PRESSURE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } PRESSURE::~PRESSURE() diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 0ba273c34..7f1c92d6b 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -72,13 +72,18 @@ class PRESSURE : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index 6940f345e..37deff8ab 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -16,7 +16,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -24,77 +23,46 @@ namespace rajaperf namespace apps { - // - // Define thread block size for SYCL execution - // - const size_t block_size = 256; - - -#define VOL3D_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(x, m_x, m_array_length, qu); \ - allocAndInitSyclDeviceData(y, m_y, m_array_length, qu); \ - allocAndInitSyclDeviceData(z, m_z, m_array_length, qu); \ - allocAndInitSyclDeviceData(vol, m_vol, m_array_length, qu); - -#define VOL3D_DATA_TEARDOWN_SYCL \ - getSyclDeviceData(m_vol, vol, m_array_length, qu); \ - deallocSyclDeviceData(x, qu); \ - deallocSyclDeviceData(y, qu); \ - deallocSyclDeviceData(z, qu); \ - deallocSyclDeviceData(vol, qu); - -void VOL3D::runSyclVariant(VariantID vid) +template +void VOL3D::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; const Index_type iend = m_domain->lpz+1; + auto res{getSyclResource()}; + VOL3D_DATA_SETUP; if ( vid == Base_SYCL ) { - VOL3D_DATA_SETUP_SYCL; - - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = block_size * RAJA_DIVIDE_CEILING_INT(iend - ibegin, block_size); + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, block_size), - [=] (sycl::nd_item<1> item) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { - Index_type i = item.get_global_id(0); - i += ibegin; - if(i < iend) { + Index_type ii = item.get_global_id(0); + Index_type i = ii + ibegin; + if (i < iend) { VOL3D_BODY } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); - VOL3D_DATA_TEARDOWN_SYCL; - } else if ( vid == RAJA_SYCL ) { - VOL3D_DATA_SETUP_SYCL; - - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { VOL3D_BODY; }); @@ -103,13 +71,13 @@ void VOL3D::runSyclVariant(VariantID vid) qu->wait(); stopTimer(); - VOL3D_DATA_TEARDOWN_SYCL; - } else { std::cout << "\n VOL3D : Unknown Sycl variant id = " << vid << std::endl; } } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(VOL3D, Sycl) + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index dd388e178..45cd4b0a9 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -64,6 +64,9 @@ VOL3D::VOL3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } VOL3D::~VOL3D() diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index f341b5739..6847ce13f 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -173,13 +173,18 @@ class VOL3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index b4dd7d594..94248fc59 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -54,7 +54,7 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) Index_type k = item.get_global_id(0); if (i < ni && j < nj && k < nk) { - NESTED_INIT_BODY + NESTED_INIT_BODY; } }); }); From 2cd69c18d71f7bd64d233b78f73a32d59cf96e1b Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Mon, 25 Mar 2024 13:45:53 -0700 Subject: [PATCH 283/454] More code org for consistency --- src/apps/CONVECTION3DPA.hpp | 1 + src/apps/DIFFUSION3DPA.hpp | 1 + src/apps/EDGE3D.hpp | 1 + src/apps/MASS3DEA.hpp | 7 +++++-- src/apps/MASS3DPA.hpp | 1 + src/apps/NODAL_ACCUMULATION_3D.hpp | 1 + src/apps/PRESSURE.hpp | 1 + src/apps/ZONAL_ACCUMULATION_3D.hpp | 1 + src/basic/ARRAY_OF_PTRS.hpp | 2 ++ src/basic/COPY8.hpp | 2 ++ src/basic/DAXPY-Sycl.cpp | 4 ++-- src/basic/DAXPY.hpp | 5 +++-- src/basic/DAXPY_ATOMIC.hpp | 2 ++ src/basic/IF_QUAD.hpp | 6 ++++-- src/basic/INDEXLIST.hpp | 1 + src/basic/INDEXLIST_3LOOP.hpp | 1 + src/basic/INIT3.hpp | 6 ++++-- src/basic/INIT_VIEW1D.hpp | 6 ++++-- src/basic/INIT_VIEW1D_OFFSET.hpp | 6 ++++-- src/basic/MAT_MAT_SHARED.hpp | 1 + src/basic/MULADDSUB.hpp | 6 ++++-- src/basic/NESTED_INIT.hpp | 6 ++++-- src/basic/PI_ATOMIC.hpp | 2 ++ src/basic/PI_REDUCE.hpp | 2 ++ src/basic/REDUCE_STRUCT.hpp | 2 ++ src/stream/ADD.hpp | 2 +- src/stream/COPY.hpp | 6 ++++-- src/stream/DOT.hpp | 2 +- src/stream/MUL.hpp | 6 ++++-- src/stream/TRIAD.hpp | 6 ++++-- 30 files changed, 70 insertions(+), 26 deletions(-) diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 9d8eea6e8..8c8d6066a 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -381,6 +381,7 @@ class CONVECTION3DPA : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 0c5271ddc..a7757f503 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -484,6 +484,7 @@ class DIFFUSION3DPA : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index ac87bf331..6a3e1f903 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -420,6 +420,7 @@ class EDGE3D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 300af975b..0fbdb03e4 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -157,8 +157,11 @@ class MASS3DEA : public KernelBase { void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template void runCudaVariantImpl(VariantID vid); - template void runHipVariantImpl(VariantID vid); + + template + void runCudaVariantImpl(VariantID vid); + template + void runHipVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 7489ee7af..9fe634cc4 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -366,6 +366,7 @@ class MASS3DPA : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp b/src/apps/NODAL_ACCUMULATION_3D.hpp index 18a4864f9..085c0099a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ 
b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -100,6 +100,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 7f1c92d6b..d1cffe62a 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -82,6 +82,7 @@ class PRESSURE : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 0e9c292d6..2e15e3d60 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -84,6 +84,7 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index f94528fa9..b32e10254 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -72,10 +72,12 @@ class ARRAY_OF_PTRS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index cd5ee83bc..572754540 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -79,10 +79,12 @@ class COPY8 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index 4a38ece00..f42e74ac6 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -45,13 +45,13 @@ void DAXPY::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - DAXPY_BODY + DAXPY_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index c081f74a2..c63683844 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -52,9 +52,10 @@ class DAXPY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); @@ -62,7 +63,7 @@ class 
DAXPY : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 8cc44ffe2..17bf3979b 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -55,10 +55,12 @@ class DAXPY_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index 6412c14e8..a742d22eb 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -69,17 +69,19 @@ class IF_QUAD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 4ff60a60a..24f88c60c 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -63,6 +63,7 @@ class INDEXLIST : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 64926b446..5cd2ac8ab 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -74,6 +74,7 @@ class INDEXLIST_3LOOP : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index e42627769..7e5f6a026 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -55,17 +55,19 @@ class INIT3 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t 
work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index d3f2b3803..0a3be36c3 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -66,17 +66,19 @@ class INIT_VIEW1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 75879ab69..92a75935d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -65,17 +65,19 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index c8564dbb5..fffad8f10 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -142,6 +142,7 @@ class MAT_MAT_SHARED : public KernelBase { void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index c7ead77f8..1846a49a7 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -58,17 +58,19 @@ class MULADDSUB : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 2c9074de9..0c579dd3b 100644 --- a/src/basic/NESTED_INIT.hpp +++ 
b/src/basic/NESTED_INIT.hpp @@ -58,17 +58,19 @@ class NESTED_INIT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 399a2c172..26a3a7016 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -61,10 +61,12 @@ class PI_ATOMIC : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index bd906e17d..3a00fc638 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -59,10 +59,12 @@ class PI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index 0deaf4254..f3bdd8a16 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -89,10 +89,12 @@ class REDUCE_STRUCT : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 058645977..7b96dbf9e 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -64,7 +64,7 @@ class ADD : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index c7883ae91..991406624 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -51,17 +51,19 @@ class COPY : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void 
runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index e5ae0ca77..f2cd455be 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -69,7 +69,7 @@ class DOT : public KernelBase template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 4d3530d21..6edd6381a 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -52,17 +52,19 @@ class MUL : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index e4a023b80..afb06cd3c 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -53,17 +53,19 @@ class TRIAD : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: From ce5c9008b30abad0c481e57d32a287809b9fe7df Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Mon, 25 Mar 2024 15:46:27 -0700 Subject: [PATCH 284/454] Work through kernels in lcals --- src/lcals/DIFF_PREDICT-Sycl.cpp | 6 ++++-- src/lcals/DIFF_PREDICT.hpp | 6 ++++-- src/lcals/EOS-Sycl.cpp | 3 ++- src/lcals/EOS.hpp | 6 ++++-- src/lcals/FIRST_DIFF-Sycl.cpp | 4 ++-- src/lcals/FIRST_DIFF.hpp | 6 ++++-- src/lcals/FIRST_MIN.hpp | 4 +++- src/lcals/FIRST_SUM.hpp | 3 ++- src/lcals/GEN_LIN_RECUR-Sycl.cpp | 2 ++ src/lcals/GEN_LIN_RECUR.hpp | 6 ++++-- src/lcals/HYDRO_1D-Sycl.cpp | 5 +++-- src/lcals/HYDRO_1D.hpp | 6 ++++-- src/lcals/HYDRO_2D-Sycl.cpp | 18 +++++++++--------- src/lcals/HYDRO_2D.hpp | 6 ++++-- src/lcals/INT_PREDICT-Sycl.cpp | 5 +++-- src/lcals/INT_PREDICT.hpp | 6 ++++-- src/lcals/PLANCKIAN-Sycl.cpp | 5 +++-- src/lcals/PLANCKIAN.hpp | 6 ++++-- src/lcals/TRIDIAG_ELIM-Sycl.cpp | 1 + src/lcals/TRIDIAG_ELIM.hpp | 6 ++++-- 20 files changed, 70 insertions(+), 40 deletions(-) diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index 161fc174b..67ccacea6 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -28,6 +28,8 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + DIFF_PREDICT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -44,13 +46,13 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - DIFF_PREDICT_BODY + DIFF_PREDICT_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index d2bbbb5c8..7bd77eade 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -93,17 +93,19 @@ class DIFF_PREDICT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index 796d39ead..8b6faba01 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -43,7 +43,7 @@ void EOS::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - EOS_BODY + EOS_BODY; } }); @@ -70,6 +70,7 @@ void EOS::runSyclVariantImpl(VariantID vid) std::cout << "\n EOS : Unknown Sycl variant id = " << vid << std::endl; } } + RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EOS, Sycl) } // end namespace lcals diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 8d4791e1b..fed56916d 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -62,17 +62,19 @@ class EOS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void 
runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index 656006e5c..a97fbd622 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -42,13 +42,13 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - FIRST_DIFF_BODY + FIRST_DIFF_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 5070c2d50..c01907f9b 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -52,17 +52,19 @@ class FIRST_DIFF : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 001e4a1f7..ed2ddc286 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -79,15 +79,17 @@ class FIRST_MIN : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); -// void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename MappingHelper > diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 04d3c775e..52a3841a1 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -55,11 +55,12 @@ class FIRST_SUM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); -// void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t 
block_size > diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index 44ff78037..0b5442ab4 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -34,6 +34,7 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), [=] (sycl::nd_item<1> item) { @@ -47,6 +48,7 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) }); const size_t global_size2 = work_group_size * RAJA_DIVIDE_CEILING_INT(N+1, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size2, work_group_size), [=] (sycl::nd_item<1> item) { diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index ee1635957..33c0895af 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -76,17 +76,19 @@ class GEN_LIN_RECUR : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index 959c4d805..83a8553d1 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -36,19 +36,20 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item ) { Index_type i = item.get_global_id(0); if (i < iend) { - HYDRO_1D_BODY + HYDRO_1D_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 4ed3288cb..4827fcecd 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -57,17 +57,19 @@ class HYDRO_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git 
a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 516084ec4..83eca34ca 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf @@ -58,6 +57,7 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); }); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), sycl::range<2>(k_block_sz,j_block_sz)), @@ -96,16 +96,16 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { HYDRO_2D_VIEWS_RAJA; - using EXECPOL = - RAJA::KernelPolicy< - RAJA::statement::SyclKernelAsync< - RAJA::statement::For<0, RAJA::sycl_global_1, // k - RAJA::statement::For<1, RAJA::sycl_global_2, // j - RAJA::statement::Lambda<0> - > + using EXECPOL = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> > > - >; + > + >; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 1e594d52f..1c9cc8d1c 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -151,17 +151,19 @@ class HYDRO_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index b13603f42..3d275e42a 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -36,6 +36,7 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), @@ -43,13 +44,13 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - INT_PREDICT_BODY + INT_PREDICT_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index d5b913f1e..5435af4f4 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -72,17 +72,19 @@ class INT_PREDICT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t 
block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index ae17b10d0..a36d50a47 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -40,19 +40,20 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); if (i < iend) { - PLANCKIAN_BODY + PLANCKIAN_BODY; } }); }); } - qu->wait(); // Wait for computation to finish before stopping timer + qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index fef88629f..a999d2178 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -57,17 +57,19 @@ class PLANCKIAN : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index 4c59c29ef..87c269344 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -36,6 +36,7 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item) { diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index a961822b5..69c1a2d9c 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -57,17 +57,19 @@ class TRIDIAG_ELIM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - void runKokkosVariant(VariantID vid, size_t tune_idx); void runSyclVariant(VariantID vid, size_t tune_idx); + void runKokkosVariant(VariantID vid, size_t tune_idx); + void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: From ad541304b4a1bf49cb126bf4b06391c504a10935 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Tue, 26 Mar 2024 15:01:13 -0700 Subject: [PATCH 285/454] Remove block-size zero from build script since that has been removed from the code. --- scripts/alcf-builds/sycl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index cb7f67d49..f002631f3 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -20,7 +20,7 @@ cmake \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=Off \ -DENABLE_CUDA=Off \ - -DRAJA_PERFSUITE_GPU_BLOCKSIZES=0,64,128,256,512,1024 \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=64,128,256,512,1024 \ -DENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ -DENABLE_SYCL=On \ From 0c009a857a9ac7b20e2cbf7d2b4077ab40f9a70c Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Tue, 26 Mar 2024 15:28:05 -0700 Subject: [PATCH 286/454] Fix curly brace nesting of switch case statements --- src/common/KernelBase.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index e5d66f3e3..a7d70bf84 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -168,12 +168,7 @@ void KernelBase::setVariantDefined(VariantID vid) #endif break; } -// Required for running Kokkos - case Kokkos_Lambda : - { -#if defined(RUN_KOKKOS) - setKokkosTuningDefinitions(vid); -#endif + case Base_SYCL: case RAJA_SYCL: { @@ -182,7 +177,14 @@ void KernelBase::setVariantDefined(VariantID vid) #endif break; } - break; + +// Required for running Kokkos + case Kokkos_Lambda : + { +#if defined(RUN_KOKKOS) + setKokkosTuningDefinitions(vid); +#endif + break; } default : { From 2fc33b489fab2f6cf4069a25c3ec23d144913be6 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 28 Mar 2024 10:52:45 -0700 Subject: [PATCH 287/454] Make SYCL GPU memory management same as CUDA and HIP --- src/common/DataUtils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index ebf007e83..3e61f289a 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -746,6 +746,8 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace) case DataSpace::HipManagedAdviseCoarse: case DataSpace::HipDevice: case DataSpace::HipDeviceFine: + case DataSpace::SyclPinned: + case DataSpace::SyclManaged: return dataSpace; case DataSpace::OmpTarget: @@ -754,7 +756,6 @@ DataSpace hostAccessibleDataSpace(DataSpace dataSpace) case DataSpace::CudaDevice: return DataSpace::CudaPinned; - case DataSpace::SyclManaged: case DataSpace::SyclDevice: return DataSpace::SyclPinned; From 22293b96ca934477218e80ee04289fdd95e1a2a5 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 28 Mar 2024 15:13:06 -0700 Subject: [PATCH 288/454] Disallow GPU block size of zero --- src/common/GPUUtils.hpp | 8 +++++++- src/common/RunParams.cpp | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 16a6b3e3c..74cd29c02 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -112,6 +112,12 @@ struct AllowAny static constexpr bool valid(size_t RAJAPERF_UNUSED_ARG(i)) { return true; } }; +// true only if i > 0 +struct PositiveOnly +{ + static constexpr bool valid(size_t i) { return i > 0; } +}; + // true if of i is a multiple of N, false otherwise template < size_t N > struct MultipleOf @@ -129,7 +135,7 @@ struct ExactSqrt // If gpu_block_sizes from the configuration is not empty it is those gpu_block_sizes, // otherwise it is a list containing just default_block_size. // Invalid entries are removed according to validity_checker in either case. -template < size_t default_block_size, typename validity_checker = AllowAny > +template < size_t default_block_size, typename validity_checker = PositiveOnly > using make_gpu_block_size_list_type = typename detail::remove_invalid::value > 0), diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 276d01ebc..ea6b6fd5d 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1120,9 +1120,11 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --gpu_block_size [no default]\n" << "\t (block sizes to run for all GPU kernels)\n" + << "\t Given int values must be > 0\n." << "\t GPU kernels not supporting gpu_block_size option will be skipped.\n" - << "\t Behavior depends on kernel implementations and \n" - << "\t values give via CMake variable RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; + << "\t Behavior depends on individual kernel implementations and \n" + << "\t compile configuration values given via CMake variable \n" + << "\t RAJA_PERFSUITE_GPU_BLOCKSIZES.\n"; str << "\t\t Example...\n" << "\t\t --gpu_block_size 128 256 512 (runs kernels with gpu_block_size 128, 256, and 512)\n\n"; From a5823fecb5720d634b16a96c17d17111a2e68971 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 28 Mar 2024 15:28:56 -0700 Subject: [PATCH 289/454] Establish consistent pattern for base SYCL variants of nested loop kernels --- src/apps/LTIMES-Sycl.cpp | 8 ++++---- src/apps/LTIMES_NOVIEW-Sycl.cpp | 8 ++++---- src/basic/NESTED_INIT-Sycl.cpp | 10 +++++----- src/lcals/HYDRO_2D-Sycl.cpp | 27 ++++++++++++++------------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 36e87a58a..749b7024e 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -37,16 +37,16 @@ void LTIMES::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), - RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), - RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), [=] (sycl::nd_item<3> item) { Index_type m = item.get_global_id(2); diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index d7202e450..bb7a885d0 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -37,16 +37,16 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), - RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), - RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); + sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), + g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), [=] (sycl::nd_item<3> item) { Index_type m = item.get_global_id(2); diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index 94248fc59..f3b06496c 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -37,17 +37,17 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> ndrange_dim(RAJA_DIVIDE_CEILING_INT(nk, k_wg_sz), - RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz), - RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz)); + sycl::range<3> global_dim(k_wg_sz * RAJA_DIVIDE_CEILING_INT(nk, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz), + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz)); sycl::range<3> wkgroup_dim(k_wg_sz, j_wg_sz, i_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (cl::sycl::handler& h) { - h.parallel_for(sycl::nd_range<3> ( ndrange_dim * wkgroup_dim, wkgroup_dim), - [=] (sycl::nd_item<3> item) { + h.parallel_for(sycl::nd_range<3> ( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { Index_type i = item.get_global_id(2); Index_type j = item.get_global_id(1); diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 83eca34ca..879d9d5e5 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp 
+++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -21,8 +21,11 @@ namespace rajaperf namespace lcals { -#define j_block_sz (32) -#define k_block_sz (work_group_size / j_block_sz) + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define k_wg_sz (work_group_size / j_wg_sz) template void HYDRO_2D::runSyclVariantImpl(VariantID vid) { @@ -36,16 +39,16 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { HYDRO_2D_DATA_SETUP; if ( vid == Base_SYCL ) { - - auto kn_global_size = k_block_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_block_sz); - auto jn_global_size = j_block_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_block_sz); + sycl::range<2> global_dim(k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_wg_sz)); + sycl::range<2> wkgroup_dim(k_wg_sz, j_wg_sz); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), - sycl::range<2>(k_block_sz,j_block_sz)), + h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), [=] (sycl::nd_item<2> item) { int j = item.get_global_id(1) + 1; @@ -59,8 +62,7 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), - sycl::range<2>(k_block_sz,j_block_sz)), + h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), [=] (sycl::nd_item<2> item) { int j = item.get_global_id(1) + 1; @@ -74,8 +76,7 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>(sycl::range<2>(kn_global_size, jn_global_size), - sycl::range<2>(k_block_sz,j_block_sz)), + h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), [=] (sycl::nd_item<2> item) { int j = item.get_global_id(1) + 1; @@ -99,8 +100,8 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { using EXECPOL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< - RAJA::statement::For<0, RAJA::sycl_global_1, - RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > From cb1f7d984fed1fea7b10dd2163f63a7d37a1e340 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 29 Mar 2024 11:31:49 -0700 Subject: [PATCH 290/454] Centralize function used in TRAP_INT kernel to avoid redundancy --- src/basic/TRAP_INT-Cuda.cpp | 17 ++-------------- src/basic/TRAP_INT-Hip.cpp | 17 ++-------------- src/basic/TRAP_INT-OMP.cpp | 16 ++------------- src/basic/TRAP_INT-OMPTarget.cpp | 16 ++------------- src/basic/TRAP_INT-Seq.cpp | 16 ++------------- src/basic/TRAP_INT-Sycl.cpp | 16 ++------------- src/basic/TRAP_INT-func.hpp | 35 ++++++++++++++++++++++++++++++++ 7 files changed, 47 insertions(+), 86 deletions(-) create mode 100644 src/basic/TRAP_INT-func.hpp diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 233563269..fea2de43e 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include "TRAP_INT-func.hpp" + #include "common/CudaDataUtils.hpp" #include @@ -25,21 +27,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. 
-// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 51b6f0da3..62cd3a2d0 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_HIP) +#include "TRAP_INT-func.hpp" + #include "common/HipDataUtils.hpp" #include @@ -25,21 +27,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - template < size_t block_size > __launch_bounds__(block_size) diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 9f89ba0b6..d21542999 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include namespace rajaperf @@ -17,20 +19,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index acf52a61d..a1a88e30a 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_TARGET_OPENMP) +#include "TRAP_INT-func.hpp" + #include "common/OpenMPTargetDataUtils.hpp" #include @@ -21,20 +23,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - // // Define threads per team for target execution // diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 967d3e93e..5aa253dec 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -10,6 +10,8 @@ #include "RAJA/RAJA.hpp" +#include "TRAP_INT-func.hpp" + #include namespace rajaperf @@ -17,20 +19,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index b9caf2dad..f9dc5cf74 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -12,6 +12,8 @@ #if defined(RAJA_ENABLE_SYCL) +#include "TRAP_INT-func.hpp" + #include "common/SyclDataUtils.hpp" #include @@ -21,20 +23,6 @@ namespace rajaperf namespace basic { -// -// Function used in TRAP_INT loop. 
-//
-RAJA_INLINE
-RAJA_DEVICE
-Real_type trap_int_func(Real_type x,
-                        Real_type y,
-                        Real_type xp,
-                        Real_type yp)
-{
-  Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
-  denom = 1.0/sqrt(denom);
-  return denom;
-}
 
 template 
 void TRAP_INT::runSyclVariantImpl(VariantID vid)

diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp
new file mode 100644
index 000000000..9c4b90c52
--- /dev/null
+++ b/src/basic/TRAP_INT-func.hpp
@@ -0,0 +1,35 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJAPerf_Basic_TRAP_INT_FUNC_HPP
+#define RAJAPerf_Basic_TRAP_INT_FUNC_HPP
+
+namespace rajaperf
+{
+namespace basic
+{
+
+//
+// Function used in TRAP_INT loop in each variant.
+//
+RAJA_INLINE
+RAJA_HOST_DEVICE
+Real_type trap_int_func(Real_type x,
+                        Real_type y,
+                        Real_type xp,
+                        Real_type yp)
+{
+  Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
+  denom = 1.0/sqrt(denom);
+  return denom;
+}
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif // closing endif for header file include guard

From ff6c18eb818290c52f6331057e4151d22a8c7756 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Tue, 2 Apr 2024 15:41:48 -0700
Subject: [PATCH 291/454] Update RAJA to v2024.02.1 RC branch to quiet SYCL
 compiler warnings

---
 tpl/RAJA | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpl/RAJA b/tpl/RAJA
index 7b1e5248e..b21c8c42b 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit 7b1e5248e113ee2df40a8878aac2f54b6ee2b74e
+Subproject commit b21c8c42be2fa6381b3ff8cbf85033dadbc8b280

From ab1006aceac2f01cff42c008fd42051b4c76234c Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Wed, 3 Apr 2024 10:26:49 -0700
Subject: [PATCH 292/454] Update RAJA to v2024.02.1 release

---
 tpl/RAJA | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tpl/RAJA b/tpl/RAJA
index b21c8c42b..3ada0950b 160000
--- a/tpl/RAJA
+++ b/tpl/RAJA
@@ -1 +1 @@
-Subproject commit b21c8c42be2fa6381b3ff8cbf85033dadbc8b280
+Subproject commit 3ada0950b0774ec907d30a9eceaf6af7478b833b

From 0a5cf4d9bc91921136c3ca815c39ebe514b7ee99 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Wed, 3 Apr 2024 10:27:08 -0700
Subject: [PATCH 293/454] Update BLT to v0.6.2 release

---
 blt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blt b/blt
index 148c53ecc..9ff77344f 160000
--- a/blt
+++ b/blt
@@ -1 +1 @@
-Subproject commit 148c53ecc8bcaad5eaa4c1e39cb8144b8f1388ae
+Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35

From 0bc48dedd977bfb43158d55b00dc0ac0317830fe Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Thu, 4 Apr 2024 15:06:10 -0700
Subject: [PATCH 294/454] Move inclusion of sycl.hpp, remove unnecessary
 includes, and format case statements in switchyards for consistency with
 others.
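
For reference, every variant now calls the single trap_int_func definition
introduced in PATCH 290 above. A minimal host-only sketch of that usage
follows; the driver, bounds, and parameter values are illustrative stand-ins,
not suite code, and the RAJA annotation macros are dropped since this runs on
the host only:

#include <cmath>
#include <cstdio>

using Real_type = double;

// The one shared definition, as centralized in TRAP_INT-func.hpp.
inline Real_type trap_int_func(Real_type x, Real_type y,
                               Real_type xp, Real_type yp)
{
  Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp);
  return 1.0/std::sqrt(denom);
}

int main()
{
  const int n = 100000;                          // number of sub-intervals
  const Real_type x0 = 0.0, x1 = 1.0;            // integration bounds
  const Real_type y = 0.5, xp = -1.0, yp = 2.0;  // fixed curve parameters
  const Real_type h = (x1 - x0) / n;

  Real_type sumx = 0.0;
  for (int i = 0; i < n; ++i) {
    Real_type x = x0 + (i + 0.5) * h;            // midpoint of sub-interval i
    sumx += trap_int_func(x, y, xp, yp);         // same call as every variant
  }
  std::printf("integral ~= %.12f\n", sumx * h);
  return 0;
}
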
--- src/basic/IF_QUAD-Sycl.cpp | 1 - src/basic/INIT_VIEW1D-Sycl.cpp | 1 - src/common/KernelBase.hpp | 2 +- src/common/RAJAPerfSuite.cpp | 60 +++++++++++++++++++--------- src/lcals/PLANCKIAN-Sycl.cpp | 1 - src/polybench/POLYBENCH_2MM-Sycl.cpp | 1 - src/stream/TRIAD-Sycl.cpp | 1 - 7 files changed, 42 insertions(+), 25 deletions(-) diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 6d989d513..4539a4c8c 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index fb2a18f7f..2469e4d1e 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 850893d93..f37d57f6c 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -27,9 +27,9 @@ #endif #if defined(RAJA_ENABLE_SYCL) #include -#include "camp/resource.hpp" #endif +#include "camp/resource.hpp" #include #include diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index a7ce954d2..02882e2a0 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -633,17 +633,24 @@ bool isDataSpaceAvailable(DataSpace dataSpace) bool ret_val = false; switch (dataSpace) { - case DataSpace::Host: - ret_val = true; break; + + case DataSpace::Host: { + ret_val = true; + break; + } #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - case DataSpace::Omp: - ret_val = true; break; + case DataSpace::Omp: { + ret_val = true; + break; + } #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - case DataSpace::OmpTarget: - ret_val = true; break; + case DataSpace::OmpTarget: { + ret_val = true; + break; + } #endif #if defined(RAJA_ENABLE_CUDA) @@ -653,8 +660,10 @@ bool isDataSpaceAvailable(DataSpace dataSpace) case DataSpace::CudaManagedDevicePreferred: case DataSpace::CudaManagedHostPreferredDeviceAccessed: case DataSpace::CudaManagedDevicePreferredHostAccessed: - case DataSpace::CudaDevice: - ret_val = true; break; + case DataSpace::CudaDevice: { + ret_val = true; + break; + } #endif #if defined(RAJA_ENABLE_HIP) @@ -671,20 +680,27 @@ bool isDataSpaceAvailable(DataSpace dataSpace) case DataSpace::HipManagedAdviseCoarse: #endif case DataSpace::HipDevice: - case DataSpace::HipDeviceFine: - ret_val = true; break; + case DataSpace::HipDeviceFine: { + ret_val = true; + break; + } #endif #if defined(RAJA_ENABLE_SYCL) case DataSpace::SyclPinned: case DataSpace::SyclManaged: - case DataSpace::SyclDevice: - ret_val = true; break; + case DataSpace::SyclDevice: { + ret_val = true; + break; + } #endif - default: - ret_val = false; break; - } + default: { + ret_val = false; + break; + } + + } // close switch (dataSpace) return ret_val; } @@ -701,10 +717,16 @@ bool isPseudoDataSpace(DataSpace dataSpace) bool ret_val = false; switch (dataSpace) { - case DataSpace::Copy: - ret_val = true; break; - default: - ret_val = false; break; + + case DataSpace::Copy: { + ret_val = true; + break; + } + default: { + ret_val = false; + break; + } + } return ret_val; diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index a36d50a47..8773fdc07 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -15,7 +15,6 @@ #include #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf diff --git 
a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 8bade3b7d..67def563e 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -15,7 +15,6 @@ #include #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index 851aa56d0..56d466fef 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -14,7 +14,6 @@ #include -#include #include "common/SyclDataUtils.hpp" namespace rajaperf From c2c148f912dfefd6f0b448a34d5810531dd1965f Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 4 Apr 2024 16:20:15 -0700 Subject: [PATCH 295/454] Rework use of SYCL queue and make more consistent with other GPU back-ends --- src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 3 +-- src/apps/ENERGY-Sycl.cpp | 10 ++++------ src/apps/FIR-Sycl.cpp | 7 +++---- src/apps/LTIMES-Sycl.cpp | 5 +++-- src/apps/LTIMES_NOVIEW-Sycl.cpp | 5 +++-- src/apps/PRESSURE-Sycl.cpp | 7 +++---- src/apps/VOL3D-Sycl.cpp | 5 ++--- src/basic/DAXPY-Sycl.cpp | 3 +-- src/basic/IF_QUAD-Sycl.cpp | 5 +++-- src/basic/INIT3-Sycl.cpp | 5 +++-- src/basic/INIT_VIEW1D-Sycl.cpp | 7 ++++--- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 5 +++-- src/basic/MULADDSUB-Sycl.cpp | 5 +++-- src/basic/NESTED_INIT-Sycl.cpp | 6 ++++-- src/basic/REDUCE3_INT-Sycl.cpp | 5 +++-- src/basic/TRAP_INT-Sycl.cpp | 5 +++-- src/common/DataUtils.cpp | 8 ++++---- src/common/Executor.cpp | 4 ---- src/common/KernelBase.hpp | 17 ++++++++--------- src/common/RAJAPerfSuite.cpp | 3 --- src/lcals/DIFF_PREDICT-Sycl.cpp | 5 ++--- src/lcals/EOS-Sycl.cpp | 5 +++-- src/lcals/FIRST_DIFF-Sycl.cpp | 6 ++++-- src/lcals/GEN_LIN_RECUR-Sycl.cpp | 5 +++-- src/lcals/HYDRO_1D-Sycl.cpp | 5 +++-- src/lcals/HYDRO_2D-Sycl.cpp | 6 ++++-- src/lcals/INT_PREDICT-Sycl.cpp | 8 ++++---- src/lcals/PLANCKIAN-Sycl.cpp | 5 +++-- src/lcals/TRIDIAG_ELIM-Sycl.cpp | 5 +++-- src/polybench/POLYBENCH_2MM-Sycl.cpp | 17 +++++++++-------- src/stream/ADD-Sycl.cpp | 3 +-- src/stream/COPY-Sycl.cpp | 5 +++-- src/stream/DOT-Sycl.cpp | 4 +++- src/stream/MUL-Sycl.cpp | 5 +++-- src/stream/TRIAD-Sycl.cpp | 6 ++++-- 35 files changed, 110 insertions(+), 100 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp index 9754dca6c..da3bc9b4b 100644 --- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -31,6 +31,7 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) const Index_type iend = m_domain->n_real_zones; auto res{getSyclResource()}; + auto qu = res.get_queue(); DEL_DOT_VEC_2D_DATA_SETUP; @@ -55,7 +56,6 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -72,7 +72,6 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index 1e5fd8d2c..1fd105f81 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -28,13 +28,14 @@ void ENERGY::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + ENERGY_DATA_SETUP; using sycl::sqrt; using sycl::fabs; - auto res{getSyclResource()}; - if ( vid == Base_SYCL ) { startTimer(); @@ -101,8 +102,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); }); - qu->submit([&] (sycl::handler& h) - { + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> 
(grid_size, work_group_size), [=] (sycl::nd_item<1> item) { @@ -114,7 +114,6 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -159,7 +158,6 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); // end sequential region (for single-source code) } - qu->wait(); stopTimer(); } else { diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index 81c965729..178d00b4b 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -44,9 +44,10 @@ void FIR::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize() - m_coefflen; - FIR_DATA_SETUP; - auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIR_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -71,7 +72,6 @@ void FIR::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); FIR_DATA_TEARDOWN_SYCL; @@ -91,7 +91,6 @@ void FIR::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); FIR_DATA_TEARDOWN_SYCL; diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 749b7024e..b0f8ad089 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -33,6 +33,9 @@ void LTIMES::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + LTIMES_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -62,7 +65,6 @@ void LTIMES::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -96,7 +98,6 @@ void LTIMES::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index bb7a885d0..ae2f2b000 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -33,6 +33,9 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + LTIMES_NOVIEW_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -62,7 +65,6 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -94,7 +96,6 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp index 8f0c1c4a7..6fd7735bb 100644 --- a/src/apps/PRESSURE-Sycl.cpp +++ b/src/apps/PRESSURE-Sycl.cpp @@ -28,12 +28,13 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + PRESSURE_DATA_SETUP; using sycl::fabs; - auto res{getSyclResource()}; - if ( vid == Base_SYCL ) { startTimer(); @@ -66,7 +67,6 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -91,7 +91,6 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) }); // end sequential region (for single-source code) } - qu->wait(); stopTimer(); } else { diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index 37deff8ab..6c18ec1c8 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -30,7 +30,8 @@ void VOL3D::runSyclVariantImpl(VariantID vid) const Index_type ibegin = m_domain->fpz; const Index_type iend = m_domain->lpz+1; - auto res{getSyclResource()}; + auto res{getSyclResource()}; + auto qu = res.get_queue(); VOL3D_DATA_SETUP; @@ -54,7 +55,6 @@ void 
VOL3D::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -68,7 +68,6 @@ void VOL3D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index f42e74ac6..f74634beb 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -29,6 +29,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) const Index_type iend = getActualProblemSize(); auto res{getSyclResource()}; + auto qu = res.get_queue(); DAXPY_DATA_SETUP; @@ -51,7 +52,6 @@ void DAXPY::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -65,7 +65,6 @@ void DAXPY::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 4539a4c8c..93ff76c4e 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -28,6 +28,9 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + IF_QUAD_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,7 +52,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -63,7 +65,6 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 7ba60ff3a..4b71d1846 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -28,6 +28,9 @@ void INIT3::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + INIT3_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -50,7 +53,6 @@ void INIT3::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -64,7 +66,6 @@ void INIT3::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index 2469e4d1e..d7f99ce35 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -28,6 +28,9 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + INIT_VIEW1D_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -38,7 +41,6 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item ) { @@ -46,10 +48,10 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) if (i < iend) { INIT_VIEW1D_BODY } + }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -65,7 +67,6 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index 2764d3089..4834edbe8 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -28,6 +28,9 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 1; const Index_type iend = 
getActualProblemSize()+1; + auto res{getSyclResource()}; + auto qu = res.get_queue(); + INIT_VIEW1D_OFFSET_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -50,7 +53,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -64,7 +66,6 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index 0185dc7ba..13d690c2b 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -28,6 +28,9 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + MULADDSUB_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -50,7 +53,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -64,7 +66,6 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index f3b06496c..b01a7d597 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -33,6 +33,9 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + NESTED_INIT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -56,11 +59,11 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) if (i < ni && j < nj && k < nk) { NESTED_INIT_BODY; } + }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -89,7 +92,6 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index 448e35d30..f90a39d81 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -41,6 +41,9 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + REDUCE3_INT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -93,7 +96,6 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) m_vmax = RAJA_MAX(m_vmax, lmax); } // for (RepIndex_type irep = ... 
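  // [Annotation, not part of this diff: the Base_SYCL reduction epilogue
  // above launches the kernel and folds per-rep results into m_vmin/m_vmax/
  // m_vsum on the host. With SYCL 2020 the same pattern can be written with
  // sycl::reduction; a minimal sketch for one of the three sums, with names
  // that are illustrative, not the suite's:
  //
  //   Int_type* vsum = sycl::malloc_shared<Int_type>(1, q);
  //   *vsum = 0;
  //   q.submit([&] (sycl::handler& h) {
  //     h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
  //                    sycl::reduction(vsum, sycl::plus<Int_type>()),
  //                    [=] (sycl::nd_item<1> item, auto& sum) {
  //       Index_type i = item.get_global_id(0);
  //       if (i < iend) { sum += vec[i]; }
  //     });
  //   }).wait();   // synchronize before reading *vsum on the host
  //
  // The per-kernel qu->wait() calls removed below are safe to drop because
  // KernelBase now waits on the SYCL queue after each kernel run, as the
  // KernelBase.hpp hunk later in this patch shows.]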
- qu->wait(); stopTimer(); REDUCE3_INT_DATA_TEARDOWN_SYCL; @@ -117,7 +119,6 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); } - qu->wait(); stopTimer(); } else { diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index f9dc5cf74..e8f066c1d 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -31,6 +31,9 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + TRAP_INT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -67,7 +70,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) m_sumx += lsumx * h; } - qu->wait(); stopTimer(); deallocSyclDeviceData(sumx, qu); @@ -87,7 +89,6 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) m_sumx += static_cast(sumx.get()) * h; } - qu->wait(); stopTimer(); } else { diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index 3e61f289a..08e1cba95 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -350,7 +350,7 @@ void copyData(DataSpace dst_dataSpace, void* dst_ptr, else if (isSyclDataSpace(dst_dataSpace) || isSyclDataSpace(src_dataSpace)) { auto qu = camp::resources::Sycl::get_default().get_queue(); - detail::copySyclData(dst_ptr, src_ptr, nbytes,qu); + detail::copySyclData(dst_ptr, src_ptr, nbytes, qu); } #endif @@ -441,17 +441,17 @@ void deallocData(DataSpace dataSpace, void* ptr) case DataSpace::SyclPinned: { auto qu = camp::resources::Sycl::get_default().get_queue(); - detail::deallocSyclPinnedData(ptr,qu); + detail::deallocSyclPinnedData(ptr, qu); } break; case DataSpace::SyclManaged: { auto qu = camp::resources::Sycl::get_default().get_queue(); - detail::deallocSyclManagedData(ptr,qu); + detail::deallocSyclManagedData(ptr, qu); } break; case DataSpace::SyclDevice: { auto qu = camp::resources::Sycl::get_default().get_queue(); - detail::deallocSyclDeviceData(ptr,qu); + detail::deallocSyclDeviceData(ptr, qu); } break; #endif diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index a9fd047f5..bd9d43392 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -247,10 +247,6 @@ void Executor::setupSuite() getCout() << "\nSetting up suite based on input..." 
<< endl; -#if defined(RAJA_ENABLE_SYCL) - KernelBase::qu = camp::resources::Sycl().get_queue(); -#endif - using Svector = vector; // diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index f37d57f6c..c41d33c07 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -280,7 +280,7 @@ class KernelBase #if defined(RAJA_ENABLE_SYCL) if ( running_variant == Base_SYCL || running_variant == RAJA_SYCL ) { - getSyclResource().wait(); + getSyclResource().get_queue()->wait(); } #endif @@ -465,19 +465,18 @@ class KernelBase virtual void runOpenMPTargetVariant(VariantID vid, size_t tune_idx) = 0; #endif -#if defined(RUN_KOKKOS) - virtual void runKokkosVariant(VariantID vid, size_t tune_idx) - { - getCout() << "\n KernelBase: Unimplemented Kokkos variant id = " << vid << std::endl; - } -#endif #if defined(RAJA_ENABLE_SYCL) virtual void runSyclVariant(VariantID vid, size_t tune_idx) { getCout() << "\n KernelBase: Unimplemented Sycl variant id = " << vid << std::endl; } - static sycl::queue* qu; - static camp::resources::Resource sycl_res; +#endif + +#if defined(RUN_KOKKOS) + virtual void runKokkosVariant(VariantID vid, size_t tune_idx) + { + getCout() << "\n KernelBase: Unimplemented Kokkos variant id = " << vid << std::endl; + } #endif diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 02882e2a0..f89ea35e7 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -1078,9 +1078,6 @@ KernelBase* getKernelObject(KernelID kid, return kernel; } -#if defined(RAJA_ENABLE_SYCL) -sycl::queue* KernelBase::qu; -#endif // subclass of streambuf that ignores overflow // never printing anything to the underlying stream diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index 67ccacea6..e32dd99f2 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -28,7 +28,8 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - auto res{getSyclResource()}; + auto res{getSyclResource()}; + auto qu = res.get_queue(); DIFF_PREDICT_DATA_SETUP; @@ -52,7 +53,6 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -66,7 +66,6 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index 8b6faba01..7737aa0e3 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -28,6 +28,9 @@ void EOS::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + EOS_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,7 +52,6 @@ void EOS::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -63,7 +65,6 @@ void EOS::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index a97fbd622..20df37f9d 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -27,6 +27,9 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); FIRST_DIFF_DATA_SETUP; @@ 
-36,6 +39,7 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item) { @@ -48,7 +52,6 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -62,7 +65,6 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index 0b5442ab4..dc6e7ab95 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -26,6 +26,9 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + GEN_LIN_RECUR_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -61,7 +64,6 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -80,7 +82,6 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index 83a8553d1..9ea248536 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -28,6 +28,9 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + HYDRO_1D_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,7 +52,6 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -63,7 +65,6 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 879d9d5e5..a37d1fe62 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -36,6 +36,9 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { const Index_type jbeg = 1; const Index_type jend = m_jn - 1; + auto res{getSyclResource()}; + auto qu = res.get_queue(); + HYDRO_2D_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -46,6 +49,7 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), @@ -90,7 +94,6 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); } - qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -133,7 +136,6 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index 3d275e42a..88e7fb60a 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -28,6 +28,9 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + INT_PREDICT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -37,8 +40,7 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); - qu->submit([&] (sycl::handler& h) - { + 
qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), [=] (sycl::nd_item<1> item) { @@ -50,7 +52,6 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -64,7 +65,6 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index 8773fdc07..09b294f81 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -29,6 +29,9 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + PLANCKIAN_DATA_SETUP; using sycl::exp; @@ -52,7 +55,6 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -66,7 +68,6 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index 87c269344..213de1bfd 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -28,6 +28,9 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 1; const Index_type iend = m_N; + auto res{getSyclResource()}; + auto qu = res.get_queue(); + TRIDIAG_ELIM_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,7 +52,6 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -63,7 +65,6 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 67def563e..68a9795aa 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -50,6 +50,9 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) { const unsigned long run_reps = getRunReps(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + POLYBENCH_2MM_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -62,12 +65,11 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - qu->submit([&] (sycl::handler& h) - { + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<2> - {sycl::range<2> {ni_grid_size, nj_grid_size}, - sycl::range<2> {block_size, block_size}}, + {sycl::range<2> {ni_grid_size, nj_grid_size}, + sycl::range<2> {block_size, block_size}}, [=] (sycl::nd_item<2> item) { Index_type i = item.get_global_id(0); @@ -80,12 +82,11 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) } POLYBENCH_2MM_BODY3; } + }); }); - qu->submit([&] (sycl::handler& h) - { - + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<2> {sycl::range<2> {ni_grid_size, nl_grid_size}, sycl::range<2> {block_size, block_size}}, @@ -104,8 +105,8 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) }); }); } - qu->wait(); // Wait for computation to finish before stopping timer stopTimer(); + } POLYBENCH_2MM_TEARDOWN_SYCL; diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp index 69bfb472e..0e56cee30 100644 --- a/src/stream/ADD-Sycl.cpp +++ b/src/stream/ADD-Sycl.cpp @@ -29,6 +29,7 @@ void ADD::runSyclVariantImpl(VariantID vid) const Index_type iend = getActualProblemSize(); auto res{getSyclResource()}; + auto qu = res.get_queue(); ADD_DATA_SETUP; @@ -51,7 +52,6 @@ void 
ADD::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -65,7 +65,6 @@ void ADD::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp index 703add0fb..2f2e6511b 100644 --- a/src/stream/COPY-Sycl.cpp +++ b/src/stream/COPY-Sycl.cpp @@ -28,6 +28,9 @@ void COPY::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + COPY_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,7 +52,6 @@ void COPY::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -63,7 +65,6 @@ void COPY::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp index 08f82bcb1..0475dcc70 100644 --- a/src/stream/DOT-Sycl.cpp +++ b/src/stream/DOT-Sycl.cpp @@ -29,6 +29,9 @@ void DOT::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + DOT_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -65,7 +68,6 @@ void DOT::runSyclVariantImpl(VariantID vid) m_dot += ldot; } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index e234712ba..ccac06b84 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -28,6 +28,9 @@ void MUL::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + MUL_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -48,7 +51,6 @@ void MUL::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -62,7 +64,6 @@ void MUL::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index 56d466fef..aaa4011ab 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -28,6 +28,9 @@ void TRIAD::runSyclVariantImpl(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); + auto res{getSyclResource()}; + auto qu = res.get_queue(); + TRIAD_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -36,6 +39,7 @@ void TRIAD::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + qu->submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { @@ -48,7 +52,6 @@ void TRIAD::runSyclVariantImpl(VariantID vid) }); }); } - qu->wait(); stopTimer(); } else if ( vid == RAJA_SYCL ) { @@ -62,7 +65,6 @@ void TRIAD::runSyclVariantImpl(VariantID vid) }); } - qu->wait(); stopTimer(); } else { From 1465e6d8614a6a97641561213c61b16f3b299b82 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Fri, 5 Apr 2024 10:47:05 -0700 Subject: [PATCH 296/454] Update build script to indicate newer compiler build --- scripts/lc-builds/corona_sycl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index aae3a177d..6dbeb9ee5 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -13,7 +13,7 @@ if [[ $# -lt 1 ]]; then echo " 1) SYCL compiler installation path" echo echo "For example: " - echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_a0117ab8692a_hip_gcc10.2.1_rocm5.6.0" + echo " corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1" exit fi From f3fb6d2cc84d031319393766e128a20af8012f51 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 5 Apr 2024 13:30:35 -0700 Subject: [PATCH 297/454] Rework first polybench kernel. It does not run on corona due to "non-uniform workgroup size" issue. --- src/polybench/CMakeLists.txt | 1 + src/polybench/POLYBENCH_2MM-Sycl.cpp | 142 +++++++++++---------------- src/polybench/POLYBENCH_2MM.cpp | 5 + src/polybench/POLYBENCH_2MM.hpp | 5 + 4 files changed, 70 insertions(+), 83 deletions(-) diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index ff076b8f9..09ad7cc21 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -14,6 +14,7 @@ blt_add_library( POLYBENCH_2MM-Cuda.cpp POLYBENCH_2MM-OMP.cpp POLYBENCH_2MM-OMPTarget.cpp + POLYBENCH_2MM-Sycl.cpp POLYBENCH_3MM.cpp POLYBENCH_3MM-Seq.cpp POLYBENCH_3MM-Hip.cpp diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 68a9795aa..264f8e366 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -12,41 +12,24 @@ #if defined(RAJA_ENABLE_SYCL) +#include "common/SyclDataUtils.hpp" + #include #include -#include "common/SyclDataUtils.hpp" - namespace rajaperf { namespace polybench { // - // Define thread block size for SYCL execution + // Define work-group shape for SYCL execution // - const size_t block_size = 16; - -#define POLYBENCH_2MM_DATA_SETUP_SYCL \ - allocAndInitSyclDeviceData(tmp, m_tmp, m_ni * m_nj, qu); \ - allocAndInitSyclDeviceData(A, m_A, m_ni * m_nk, qu); \ - allocAndInitSyclDeviceData(B, m_B, m_nk * m_nj, qu); \ - allocAndInitSyclDeviceData(C, m_C, m_nj * m_nl, qu); \ - allocAndInitSyclDeviceData(D, m_D, m_ni * m_nl, qu); \ -\ - Real_type alpha = m_alpha; \ - Real_type beta = m_beta; \ - - -#define POLYBENCH_2MM_TEARDOWN_SYCL \ - getSyclDeviceData(m_D, D, m_ni * m_nl, qu); \ - deallocSyclDeviceData(tmp, qu); \ - deallocSyclDeviceData(A, qu); \ - deallocSyclDeviceData(B, qu); \ - deallocSyclDeviceData(C, qu); \ - deallocSyclDeviceData(D, qu); - -void POLYBENCH_2MM::runSyclVariant(VariantID vid) +#define in_wg_sz (32) +#define out_wg_sz (work_group_size / in_wg_sz) + +template +void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) { const unsigned long run_reps = getRunReps(); @@ -56,72 +39,66 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) POLYBENCH_2MM_DATA_SETUP; if ( vid == Base_SYCL ) { - { - POLYBENCH_2MM_DATA_SETUP_SYCL; - - const size_t ni_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_ni, block_size); - const size_t nj_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_nj, block_size); - const size_t nl_grid_size = block_size * RAJA_DIVIDE_CEILING_INT(m_nl, block_size); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - qu->submit([&] (sycl::handler& h) { - 
h.parallel_for(sycl::nd_range<2> - {sycl::range<2> {ni_grid_size, nj_grid_size}, - sycl::range<2> {block_size, block_size}}, - [=] (sycl::nd_item<2> item) { - - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); - - if (i < ni && j < nj) { - POLYBENCH_2MM_BODY1; - for (Index_type k=0; k < nk; ++k) { - POLYBENCH_2MM_BODY2; - } - POLYBENCH_2MM_BODY3; + + sycl::range<2> global_dim1(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz)); + + sycl::range<2> global_dim2(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz)); + + sycl::range<2> wkgroup_dim(in_wg_sz, out_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim1, wkgroup_dim), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type j = item.get_global_id(1); + + if (i < ni && j < nj) { + POLYBENCH_2MM_BODY1; + for (Index_type k=0; k < nk; ++k) { + POLYBENCH_2MM_BODY2; } + POLYBENCH_2MM_BODY3; + } - }); }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim2, wkgroup_dim), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type l = item.get_global_id(1); + + if (i < ni && l < nl) { + POLYBENCH_2MM_BODY4; + for (Index_type j=0; j < nj; ++j) { + POLYBENCH_2MM_BODY5; + } + POLYBENCH_2MM_BODY6; + } - qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2> - {sycl::range<2> {ni_grid_size, nl_grid_size}, - sycl::range<2> {block_size, block_size}}, - [=] (sycl::nd_item<2> item) { - - Index_type i = item.get_global_id(0); - Index_type l = item.get_global_id(1); - - if(i < ni && l < nl) { - POLYBENCH_2MM_BODY4; - for (Index_type j=0; j < nj; ++j) { - POLYBENCH_2MM_BODY5; - } - POLYBENCH_2MM_BODY6; - } - }); }); - } - stopTimer(); + }); } - - POLYBENCH_2MM_TEARDOWN_SYCL; + stopTimer(); } else if (vid == RAJA_SYCL) { - POLYBENCH_2MM_DATA_SETUP_SYCL; - POLYBENCH_2MM_VIEWS_RAJA; using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::SyclKernelNonTrivial< - RAJA::statement::For<0, RAJA::sycl_global_0<16>, - RAJA::statement::For<1, RAJA::sycl_global_1<16>, + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> @@ -164,11 +141,11 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) POLYBENCH_2MM_BODY4_RAJA; }, [=] (Index_type i, Index_type l, Index_type j, - Real_type &dot) { + Real_type &dot) { POLYBENCH_2MM_BODY5_RAJA; }, [=] (Index_type i, Index_type l, - Real_type &dot) { + Real_type &dot) { POLYBENCH_2MM_BODY6_RAJA; } ); @@ -176,16 +153,15 @@ void POLYBENCH_2MM::runSyclVariant(VariantID vid) } stopTimer(); - POLYBENCH_2MM_TEARDOWN_SYCL; - } else { std::cout << "\n POLYBENCH_2MM : Unknown Sycl variant id = " << vid << std::endl; } - } +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_2MM, Sycl) + } // end namespace polybench } // end namespace rajaperf -#endif // RAJA_ENABLE_Sycl +#endif // RAJA_ENABLE_SYCL diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index af5a4450b..9439a18f0 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -78,6 +78,11 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& 
params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+/* Turn off for now. Need to understand "non-uniform work groups" error
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+*/
 }
 
 POLYBENCH_2MM::~POLYBENCH_2MM()

diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp
index 836b05aee..541682454 100644
--- a/src/polybench/POLYBENCH_2MM.hpp
+++ b/src/polybench/POLYBENCH_2MM.hpp
@@ -127,13 +127,18 @@ class POLYBENCH_2MM : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
+
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
  private:
   static const size_t default_gpu_block_size = 256;

From 0e70d3af9e6d065b14f93c019d4e726b6e80df9f Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Fri, 5 Apr 2024 13:41:01 -0700
Subject: [PATCH 298/454] Enabling RAJA_SYCL kernel variant that runs
 correctly.

---
 src/polybench/POLYBENCH_2MM.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index 9439a18f0..f003ea21a 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -81,8 +81,8 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
 
 /* Turn off for now. Need to understand "non-uniform work groups" error
   setVariantDefined( Base_SYCL );
-  setVariantDefined( RAJA_SYCL );
 */
+  setVariantDefined( RAJA_SYCL );
 }
 
 POLYBENCH_2MM::~POLYBENCH_2MM()

From 7c4913093a51a0c44e535f20798716220f808072 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Fri, 5 Apr 2024 13:58:57 -0700
Subject: [PATCH 299/454] Enable kernel after fix. There's still an issue with
 it, however.

---
 src/polybench/POLYBENCH_2MM-Sycl.cpp | 2 +-
 src/polybench/POLYBENCH_2MM.cpp      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp
index 264f8e366..8c95403e6 100644
--- a/src/polybench/POLYBENCH_2MM-Sycl.cpp
+++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp
@@ -46,7 +46,7 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid)
   sycl::range<2> global_dim2(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
                              in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
 
-  sycl::range<2> wkgroup_dim(in_wg_sz, out_wg_sz);
+  sycl::range<2> wkgroup_dim(out_wg_sz, in_wg_sz);
 
   startTimer();
   for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp
index f003ea21a..56830658d 100644
--- a/src/polybench/POLYBENCH_2MM.cpp
+++ b/src/polybench/POLYBENCH_2MM.cpp
@@ -79,9 +79,7 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
 
-/* Turn off for now. Need to understand "non-uniform work groups" error
   setVariantDefined( Base_SYCL );
-*/
   setVariantDefined( RAJA_SYCL );
 }

From 8982c81f66ca6420e50556f09a8f9d5610dcbfd6 Mon Sep 17 00:00:00 2001
From: "Richard D.
Hornung" Date: Fri, 5 Apr 2024 16:23:39 -0700 Subject: [PATCH 300/454] Add SYCL variants --- src/polybench/CMakeLists.txt | 1 + src/polybench/POLYBENCH_2MM-Sycl.cpp | 6 +- src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp | 107 +++++++++++++++++++++ src/polybench/POLYBENCH_JACOBI_1D.cpp | 3 + src/polybench/POLYBENCH_JACOBI_1D.hpp | 6 ++ 5 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 09ad7cc21..2c33a0fce 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -75,6 +75,7 @@ blt_add_library( POLYBENCH_JACOBI_1D-Cuda.cpp POLYBENCH_JACOBI_1D-OMP.cpp POLYBENCH_JACOBI_1D-OMPTarget.cpp + POLYBENCH_JACOBI_1D-Sycl.cpp POLYBENCH_JACOBI_2D.cpp POLYBENCH_JACOBI_2D-Seq.cpp POLYBENCH_JACOBI_2D-Hip.cpp diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 8c95403e6..3582642c8 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -122,15 +122,15 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) POLYBENCH_2MM_BODY1_RAJA; }, [=] (Index_type i, Index_type j, Index_type k, - Real_type &dot) { + Real_type &dot) { POLYBENCH_2MM_BODY2_RAJA; }, [=] (Index_type i, Index_type j, - Real_type &dot) { + Real_type &dot) { POLYBENCH_2MM_BODY3_RAJA; } ); - + RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp new file mode 100644 index 000000000..0a12baa10 --- /dev/null +++ b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_JACOBI_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_JACOBI_1D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i > 0 && i < N-1) { + POLYBENCH_JACOBI_1D_BODY1; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + if (i > 0 && i < N-1) { + POLYBENCH_JACOBI_1D_BODY2; + } + + }); + }); + + } + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + const bool async = true; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::forall< RAJA::sycl_exec>( + RAJA::RangeSegment{1, N-1}, [=] (Index_type i) { + POLYBENCH_JACOBI_1D_BODY1; + }); + + RAJA::forall< RAJA::sycl_exec>( + RAJA::RangeSegment{1, N-1}, [=] (Index_type i) { + POLYBENCH_JACOBI_1D_BODY2; + }); + + } + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_JACOBI_1D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_1D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index cd56dd74c..40e2c2c04 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -67,6 +67,9 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index c86280036..f128e5947 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -68,13 +68,19 @@ class POLYBENCH_JACOBI_1D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 55ac42aadfc104ab9e2ccfc6b5d7e354942037f6 Mon Sep 17 00:00:00 
2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 10:00:15 -0700 Subject: [PATCH 301/454] mat-mat-shared kernel --- src/basic/CMakeLists.txt | 1 + src/basic/MAT_MAT_SHARED-Sycl.cpp | 195 ++++++++++++++++++++++++++++++ src/basic/MAT_MAT_SHARED.cpp | 3 + src/basic/MAT_MAT_SHARED.hpp | 14 ++- 4 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 src/basic/MAT_MAT_SHARED-Sycl.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 0a0a94062..c5b5f8f57 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -79,6 +79,7 @@ blt_add_library( MAT_MAT_SHARED-Cuda.cpp MAT_MAT_SHARED-OMP.cpp MAT_MAT_SHARED-OMPTarget.cpp + MAT_MAT_SHARED-Sycl.cpp MULADDSUB.cpp MULADDSUB-Seq.cpp MULADDSUB-Hip.cpp diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp new file mode 100644 index 000000000..efb9d0a47 --- /dev/null +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -0,0 +1,195 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_MAT_SHARED.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +template < size_t block_size > +void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) +{ + constexpr Index_type tile_size = integer::sqrt(block_size); + static_assert(tile_size*tile_size == block_size, "Invalid block_size"); + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, tile_size); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, tile_size); + + const ::sycl::range<2> blockSize(tile_size, tile_size); + const ::sycl::range<2> gridSize(Nx*tile_size, Ny*tile_size); + + constexpr size_t shmem = tile_size * tile_size; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MAT_MAT_SHARED_DATA_SETUP; + + if (vid == Base_SYCL) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + + h.parallel_for + (cl::sycl::nd_range<2>(gridSize, blockSize), + [=] (cl::sycl::nd_item<2> itm) { + + Index_type tx = itm.get_local_id(0); + Index_type ty = itm.get_local_id(1); + Index_type bx = itm.get_group(0); + Index_type by = itm.get_group(1); + + MAT_MAT_SHARED_BODY_1(tile_size) + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + MAT_MAT_SHARED_BODY_2(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + + MAT_MAT_SHARED_BODY_3(tile_size) + + itm.barrier(::sycl::access::fence_space::local_space); + } + + MAT_MAT_SHARED_BODY_4(tile_size) + + }); + + }); + + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + constexpr bool async = true; + + const int local_mats = 3; + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + + using launch_policy = RAJA::LaunchPolicy>; + + using teams_x = RAJA::LoopPolicy; + + using teams_y = RAJA::LoopPolicy; + + using threads_x = RAJA::LoopPolicy; + + 
using threads_y = RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(Nx, Ny), + RAJA::Threads(tile_size, tile_size), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, Ny), + [&](Index_type by) { + RAJA::loop(ctx, RAJA::RangeSegment(0, Nx), + [&](Index_type bx) { + + //We only support dynamic shared memory in Sycl + //Thus requiring a different setup than other backends + //which use static shared memory + MAT_MAT_SHARED_BODY_SYCL_0(tile_size) + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_1(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + for (Index_type k = 0; k < (tile_size + N - 1) / tile_size; k++) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, + RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_2(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + } // for (k) + + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type ty) { + RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), + [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(tile_size) + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (bx) + ); // RAJA::loop + } // lambda (by) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + } else { + getCout() << "\n MAT_MAT_SHARED : Unknown Sycl variant id = " << vid + << std::endl; + } + +} + + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MAT_MAT_SHARED, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 2173a1bc6..b1ad9d127 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -60,6 +60,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setVariantDefined(Base_HIP); setVariantDefined(Lambda_HIP); setVariantDefined(RAJA_HIP); + + setVariantDefined(Base_SYCL); + setVariantDefined(RAJA_SYCL); } MAT_MAT_SHARED::~MAT_MAT_SHARED() {} diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index fffad8f10..d2708a0c4 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -94,7 +94,15 @@ constexpr rajaperf::Index_type TL_SZ = 16; RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; -#define MAT_MAT_SHARED_BODY_1(tile_size) \ +#define MAT_MAT_SHARED_BODY_SYCL_0(tile_size) \ + double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); \ + double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); \ + double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); \ + double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; \ + double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; \ + double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; \ + +#define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; #define MAT_MAT_SHARED_BODY_2(tile_size) \ @@ -139,14 +147,18 @@ class MAT_MAT_SHARED : public KernelBase { void 
runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = TL_SZ * TL_SZ; From 37fd1cf19f6a6c55c49cd7fbcff1a71142ff5f2f Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 11:39:57 -0700 Subject: [PATCH 302/454] add convection kernel --- src/apps/CMakeLists.txt | 51 ++-- src/apps/CONVECTION3DPA-Sycl.cpp | 423 +++++++++++++++++++++++++++++++ src/apps/CONVECTION3DPA.cpp | 3 + src/apps/CONVECTION3DPA.hpp | 4 + src/apps/FEM_MACROS.hpp | 5 + 5 files changed, 461 insertions(+), 25 deletions(-) create mode 100644 src/apps/CONVECTION3DPA-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index c612ac55f..dd5442605 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -15,13 +15,14 @@ blt_add_library( CONVECTION3DPA-Seq.cpp CONVECTION3DPA-OMP.cpp CONVECTION3DPA-OMPTarget.cpp - DEL_DOT_VEC_2D.cpp - DEL_DOT_VEC_2D-Seq.cpp - DEL_DOT_VEC_2D-Hip.cpp - DEL_DOT_VEC_2D-Cuda.cpp - DEL_DOT_VEC_2D-OMP.cpp - DEL_DOT_VEC_2D-OMPTarget.cpp - DEL_DOT_VEC_2D-Sycl.cpp + CONVECTION3DPA-Sycl.cpp + DEL_DOT_VEC_2D.cpp + DEL_DOT_VEC_2D-Seq.cpp + DEL_DOT_VEC_2D-Hip.cpp + DEL_DOT_VEC_2D-Cuda.cpp + DEL_DOT_VEC_2D-OMP.cpp + DEL_DOT_VEC_2D-OMPTarget.cpp + DEL_DOT_VEC_2D-Sycl.cpp DIFFUSION3DPA.cpp DIFFUSION3DPA-Cuda.cpp DIFFUSION3DPA-Hip.cpp @@ -36,11 +37,11 @@ blt_add_library( EDGE3D-OMPTarget.cpp ENERGY.cpp ENERGY-Seq.cpp - ENERGY-Hip.cpp - ENERGY-Cuda.cpp - ENERGY-OMP.cpp - ENERGY-OMPTarget.cpp - ENERGY-Sycl.cpp + ENERGY-Hip.cpp + ENERGY-Cuda.cpp + ENERGY-OMP.cpp + ENERGY-OMPTarget.cpp + ENERGY-Sycl.cpp FIR.cpp FIR-Seq.cpp FIR-Hip.cpp @@ -67,7 +68,7 @@ blt_add_library( MASS3DEA-Hip.cpp MASS3DEA-Seq.cpp MASS3DEA-OMP.cpp - MASS3DEA-OMPTarget.cpp + MASS3DEA-OMPTarget.cpp MASS3DPA.cpp MASS3DPA-Cuda.cpp MASS3DPA-Hip.cpp @@ -80,20 +81,20 @@ blt_add_library( NODAL_ACCUMULATION_3D-Cuda.cpp NODAL_ACCUMULATION_3D-OMP.cpp NODAL_ACCUMULATION_3D-OMPTarget.cpp - PRESSURE.cpp - PRESSURE-Seq.cpp - PRESSURE-Hip.cpp - PRESSURE-Cuda.cpp - PRESSURE-OMP.cpp - PRESSURE-OMPTarget.cpp - PRESSURE-Sycl.cpp + PRESSURE.cpp + PRESSURE-Seq.cpp + PRESSURE-Hip.cpp + PRESSURE-Cuda.cpp + PRESSURE-OMP.cpp + PRESSURE-OMPTarget.cpp + PRESSURE-Sycl.cpp VOL3D.cpp VOL3D-Seq.cpp - VOL3D-Hip.cpp - VOL3D-Cuda.cpp - VOL3D-OMP.cpp - VOL3D-OMPTarget.cpp - VOL3D-Sycl.cpp + VOL3D-Hip.cpp + VOL3D-Cuda.cpp + VOL3D-OMP.cpp + VOL3D-OMPTarget.cpp + VOL3D-Sycl.cpp ZONAL_ACCUMULATION_3D.cpp ZONAL_ACCUMULATION_3D-Seq.cpp ZONAL_ACCUMULATION_3D-Hip.cpp diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp new file mode 100644 index 000000000..d2f302255 --- /dev/null +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -0,0 +1,423 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "CONVECTION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t block_size > +void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + CONVECTION3DPA_DATA_SETUP; + + const ::sycl::range<3> blockSize(CPA_Q1D, CPA_Q1D, CPA_Q1D); + const ::sycl::range<3> gridSize(NE*CPA_Q1D,CPA_Q1D,CPA_Q1D); + + constexpr size_t shmem = 0; + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //constexpr size_t shmem = 0; + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int max_D1D = CPA_D1D; + constexpr int max_Q1D = CPA_Q1D; + constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; + + auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + auto sm2_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + auto sm3_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + auto sm4_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + auto sm5_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(0); + + double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm2 = sm2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm3 = sm3_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm4 = sm4_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm5 = sm5_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; + double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; + double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; + double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; + double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; + double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; + double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; + double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; + double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; + double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; + double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; + double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + + SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + { + SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + { + SYCL_FOREACH_THREAD(dx,0,CPA_D1D) + { + CONVECTION3DPA_1; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + { + SYCL_FOREACH_THREAD(dy,1,CPA_D1D) + { + SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + { + CONVECTION3DPA_2; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + + SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + { + SYCL_FOREACH_THREAD(qx,1,CPA_Q1D) + { + 
SYCL_FOREACH_THREAD(qy,0,CPA_Q1D)
+            {
+              CONVECTION3DPA_3;
+            }
+          }
+        }
+        itm.barrier(::sycl::access::fence_space::local_space);
+
+        SYCL_FOREACH_THREAD(qx,0,CPA_Q1D)
+        {
+          SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+          {
+            SYCL_FOREACH_THREAD(qz,2,CPA_Q1D)
+            {
+              CONVECTION3DPA_4;
+            }
+          }
+        }
+        itm.barrier(::sycl::access::fence_space::local_space);
+
+        SYCL_FOREACH_THREAD(qz,2,CPA_Q1D)
+        {
+          SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+          {
+            SYCL_FOREACH_THREAD(qx,0,CPA_Q1D)
+            {
+              CONVECTION3DPA_5;
+            }
+          }
+        }
+        itm.barrier(::sycl::access::fence_space::local_space);
+
+        SYCL_FOREACH_THREAD(qx,0,CPA_Q1D)
+        {
+          SYCL_FOREACH_THREAD(qy,1,CPA_Q1D)
+          {
+            SYCL_FOREACH_THREAD(dz,2,CPA_D1D)
+            {
+              CONVECTION3DPA_6;
+            }
+          }
+        }
+        itm.barrier(::sycl::access::fence_space::local_space);
+
+        SYCL_FOREACH_THREAD(dz,2,CPA_D1D)
+        {
+          SYCL_FOREACH_THREAD(qx,0,CPA_Q1D)
+          {
+            SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+            {
+              CONVECTION3DPA_7;
+            }
+          }
+        }
+        itm.barrier(::sycl::access::fence_space::local_space);
+
+        SYCL_FOREACH_THREAD(dz,2,CPA_D1D)
+        {
+          SYCL_FOREACH_THREAD(dy,1,CPA_D1D)
+          {
+            SYCL_FOREACH_THREAD(dx,0,CPA_D1D)
+            {
+              CONVECTION3DPA_8;
+            }
+          }
+        }
+      });
+
+    });
+
+
+    }
+    stopTimer();
+
+    break;
+  }
+
+  case RAJA_SYCL: {
+
+    constexpr bool async = true;
+
+    using launch_policy =
+      RAJA::LaunchPolicy>;
+
+    using outer_x =
+      RAJA::LoopPolicy;
+
+    using inner_x =
+      RAJA::LoopPolicy;
+
+    using inner_y =
+      RAJA::LoopPolicy;
+
+    using inner_z =
+      RAJA::LoopPolicy;
+
+    //Calculate amount of shared memory needed
+    size_t shmem = 0;
+    {
+      constexpr int max_D1D = CPA_D1D;
+      constexpr int max_Q1D = CPA_Q1D;
+      constexpr int max_DQ = (max_Q1D > max_D1D) ?
max_Q1D : max_D1D; + + double * sm0 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + double * sm1 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + double * sm2 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + double * sm3 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + double * sm4 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + double * sm5 = ctx.getSharedMemory(max_DQ*max_DQ*max_DQ); + + double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; + double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; + double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; + double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; + double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; + double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; + double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; + double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; + double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; + double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; + double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; + double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dx) { + + CONVECTION3DPA_1; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + + CONVECTION3DPA_2; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qy) { + + CONVECTION3DPA_3; + + } // lambda (dy) + ); // RAJA::loop + } // lambda (dx) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qz) { + + CONVECTION3DPA_4; + + } // lambda (qz) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qx) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + + CONVECTION3DPA_5; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + + CONVECTION3DPA_6; + + } // lambda (dz) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qx) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_Q1D), + [&](int qx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 
CPA_D1D), + [&](int dy) { + + CONVECTION3DPA_7; + + } // lambda (dy) + ); // RAJA::loop + } // lambda (qx) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, CPA_D1D), + [&](int dx) { + + CONVECTION3DPA_8; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n CONVECTION3DPA : Unknown Sycl variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(CONVECTION3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index a7973a237..b99238078 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -64,6 +64,9 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } CONVECTION3DPA::~CONVECTION3DPA() diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 8c8d6066a..c0044ecdc 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -378,14 +378,18 @@ class CONVECTION3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = CPA_Q1D * CPA_Q1D * CPA_Q1D; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 258f75a0e..fd4386324 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -24,6 +24,11 @@ for (int i = threadIdx.k; i < N; i += blockDim.k) #endif +#if defined(RAJA_ENABLE_SYCL) +#define SYCL_FOREACH_THREAD(i, k, N) \ + for (int i = itm.get_local_id(k); i < N; i += itm.get_local_range(k)) +#endif + #define CPU_FOREACH(i, k, N) for (int i = 0; i < N; i++) #endif // closing endif for header file include guard From c3d859bc4924e6525f673dd5f2d817ffe24b1012 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 13:58:27 -0700 Subject: [PATCH 303/454] add mass pa --- src/apps/CMakeLists.txt | 1 + src/apps/MASS3DPA-Sycl.cpp | 316 +++++++++++++++++++++++++++++++++++++ src/apps/MASS3DPA.cpp | 3 + src/apps/MASS3DPA.hpp | 4 + 4 files changed, 324 insertions(+) create mode 100644 src/apps/MASS3DPA-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index dd5442605..f14498929 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -75,6 +75,7 @@ blt_add_library( MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp + MASS3DPA-Sycl.cpp NODAL_ACCUMULATION_3D.cpp NODAL_ACCUMULATION_3D-Seq.cpp NODAL_ACCUMULATION_3D-Hip.cpp diff --git 
a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp
new file mode 100644
index 000000000..7d2380eca
--- /dev/null
+++ b/src/apps/MASS3DPA-Sycl.cpp
@@ -0,0 +1,316 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+// Uncomment to add compiler directives for loop unrolling
+//#define USE_RAJAPERF_UNROLL
+
+#include "MASS3DPA.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+
+namespace rajaperf {
+namespace apps {
+
+template < size_t block_size >
+void MASS3DPA::runSyclVariantImpl(VariantID vid) {
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  MASS3DPA_DATA_SETUP;
+
+  const ::sycl::range<3> blockSize(MPA_Q1D, MPA_Q1D, 1);
+  const ::sycl::range<3> gridSize(NE*MPA_Q1D,MPA_Q1D,1);
+
+  switch (vid) {
+
+    case Base_SYCL: {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        qu->submit([&](cl::sycl::handler& h) {
+
+          constexpr int MQ1 = MPA_Q1D;
+          constexpr int MD1 = MPA_D1D;
+          constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
+
+          auto sDQ_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h);
+          auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h);
+          auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h);
+
+          h.parallel_for
+            (cl::sycl::nd_range<3>(gridSize, blockSize),
+             [=] (cl::sycl::nd_item<3> itm) {
+
+               const Index_type e = itm.get_group(0);
+
+               double *sDQ = sDQ_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+               double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+               double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+
+               double(*Bsmem)[MD1] = (double(*)[MD1])sDQ;
+               double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ;
+
+               double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0;
+               double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1;
+               double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0;
+               double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1;
+               double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0;
+               double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1;
+
+               SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) {
+                 SYCL_FOREACH_THREAD(dx, 0, MPA_D1D){
+                   MASS3DPA_1
+                 }
+                 SYCL_FOREACH_THREAD(dx, 0, MPA_Q1D) {
+                   MASS3DPA_2
+                 }
+               }
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) {
+                 SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) {
+                   MASS3DPA_3
+                 }
+               }
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) {
+                 SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) {
+                   MASS3DPA_4
+                 }
+               }
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) {
+                 SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) {
+                   MASS3DPA_5
+                 }
+               }
+
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(d, 1, MPA_D1D) {
+                 SYCL_FOREACH_THREAD(q, 0, MPA_Q1D) {
+                   MASS3DPA_6
+                 }
+               }
+
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) {
+                 SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) {
+                   MASS3DPA_7
+                 }
+               }
+               itm.barrier(::sycl::access::fence_space::local_space);
+
+               SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) {
+                 SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) {
+                   MASS3DPA_8
+                 }
+               }
+
+               itm.barrier(::sycl::access::fence_space::local_space);
+               SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) {
+                 SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) {
+                   MASS3DPA_9
+                 }
+               }
+
+             });
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_SYCL: {
+
+      constexpr bool async = true;
+
+      using launch_policy = RAJA::LaunchPolicy>;
+
+      using outer_x = RAJA::LoopPolicy;
+
+      using inner_x = RAJA::LoopPolicy;
+
+      using inner_y = RAJA::LoopPolicy;
+
+      //Calculate amount of shared memory needed
+      size_t shmem = 0;
+      {
+        constexpr int MQ1 = MPA_Q1D;
+        constexpr int MD1 = MPA_D1D;
+        constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
+
+        constexpr int no_mats = 2;
+        shmem += (MQ1 * MD1 + no_mats * MDQ * MDQ * MDQ) * sizeof(double);
+      }
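+      // The byte count above is passed to RAJA::LaunchParams below and must
+      // cover every ctx.getSharedMemory() request the kernel makes: one
+      // MQ1*MD1 block for sDQ plus no_mats blocks of MDQ*MDQ*MDQ for sm0 and
+      // sm1, hence the sum (not the product) of the block sizes times
+      // sizeof(double).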
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::launch( res,
+          RAJA::LaunchParams(RAJA::Teams(NE),
+                             RAJA::Threads(MPA_Q1D, MPA_Q1D, 1), shmem),
+          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+            RAJA::loop(ctx, RAJA::RangeSegment(0, NE),
+              [&](int e) {
+
+                //Redefine inside the lambda to keep consistent with base version
+                constexpr int MQ1 = MPA_Q1D;
+                constexpr int MD1 = MPA_D1D;
+                constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
+
+                double *sDQ = ctx.getSharedMemory(MQ1 * MD1);
+                double *sm0 = ctx.getSharedMemory(MDQ * MDQ * MDQ);
+                double *sm1 = ctx.getSharedMemory(MDQ * MDQ * MDQ);
+
+                double(*Bsmem)[MD1] = (double(*)[MD1])sDQ;
+                double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ;
+
+                double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0;
+                double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1;
+                double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0;
+                double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1;
+                double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0;
+                double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1;
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                  [&](int dy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                      [&](int dx) {
+                        MASS3DPA_1
+                      }
+                    ); // RAJA::loop
+
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                      [&](int dx) {
+                        MASS3DPA_2
+                      }
+                    ); // RAJA::loop
+                  } // lambda (dy)
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                  [&](int dy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                      [&](int qx) {
+                        MASS3DPA_3
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                  [&](int qy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                      [&](int qx) {
+                        MASS3DPA_4
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                  [&](int qy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                      [&](int qx) {
+                        MASS3DPA_5
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                  [&](int d) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                      [&](int q) {
+                        MASS3DPA_6
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_Q1D),
+                  [&](int qy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                      [&](int dx) {
+                        MASS3DPA_7
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                  [&](int dy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                      [&](int dx) {
+                        MASS3DPA_8
+                      }
+                    ); // RAJA::loop
+                  }
+                ); // RAJA::loop
+
+                ctx.teamSync();
+
+                RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                  [&](int dy) {
+                    RAJA::loop(ctx, RAJA::RangeSegment(0, MPA_D1D),
+                      [&](int dx) {
+                        MASS3DPA_9
+                      }
+                    ); // RAJA::loop
+
} + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index b95027f2f..4f918d49c 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -61,6 +61,9 @@ MASS3DPA::MASS3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DPA::~MASS3DPA() diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 9fe634cc4..2d6f38006 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -363,14 +363,18 @@ class MASS3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MPA_Q1D * MPA_Q1D; From b9d69f301907764e3fb5d7dc3748a3ef45e945df Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 16 Apr 2024 10:53:39 -0700 Subject: [PATCH 304/454] direct -> loop --- src/apps/CMakeLists.txt | 1 + src/apps/CONVECTION3DPA-Sycl.cpp | 10 +++++----- src/apps/DIFFUSION3DPA.cpp | 3 +++ src/apps/DIFFUSION3DPA.hpp | 4 ++++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index f14498929..cea021bda 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -29,6 +29,7 @@ blt_add_library( DIFFUSION3DPA-Seq.cpp DIFFUSION3DPA-OMP.cpp DIFFUSION3DPA-OMPTarget.cpp + DIFFUSION3DPA-Sycl.cpp EDGE3D.cpp EDGE3D-Cuda.cpp EDGE3D-Hip.cpp diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp index d2f302255..6433e786e 100644 --- a/src/apps/CONVECTION3DPA-Sycl.cpp +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -29,7 +29,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { CONVECTION3DPA_DATA_SETUP; const ::sycl::range<3> blockSize(CPA_Q1D, CPA_Q1D, CPA_Q1D); - const ::sycl::range<3> gridSize(NE*CPA_Q1D,CPA_Q1D,CPA_Q1D); + const ::sycl::range<3> gridSize(CPA_Q1D,CPA_Q1D,CPA_Q1D*NE); constexpr size_t shmem = 0; @@ -194,16 +194,16 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { RAJA::LaunchPolicy>; using outer_x = - RAJA::LoopPolicy; + RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy; + RAJA::LoopPolicy; using inner_y = - RAJA::LoopPolicy; + RAJA::LoopPolicy; using inner_z = - RAJA::LoopPolicy; + RAJA::LoopPolicy; //Caclulate amount of shared memory needed size_t shmem = 0; diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 5a645c53e..0c6e57e2e 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -65,6 +65,9 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( 
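+  // setSyclTuningDefinitions registers one tuning name per supported
+  // work-group size, and RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE
+  // in MASS3DPA-Sycl.cpp expands the runSyclVariant dispatch that routes
+  // each tune_idx to the templated runSyclVariantImpl below.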
RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } DIFFUSION3DPA::~DIFFUSION3DPA() diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index a7757f503..2d6c8362f 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -481,14 +481,18 @@ class DIFFUSION3DPA : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t block_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; From 98683364fd3cc0340884447efbadaba74c88f577 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 16 Apr 2024 16:37:20 -0700 Subject: [PATCH 305/454] DIFFUSION sycl --- src/apps/CONVECTION3DPA-Sycl.cpp | 4 +- src/apps/DIFFUSION3DPA-Sycl.cpp | 443 +++++++++++++++++++++++++++++++ 2 files changed, 445 insertions(+), 2 deletions(-) create mode 100644 src/apps/DIFFUSION3DPA-Sycl.cpp diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp index 6433e786e..fd7bd1b99 100644 --- a/src/apps/CONVECTION3DPA-Sycl.cpp +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -59,7 +59,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { (cl::sycl::nd_range<3>(gridSize, blockSize), [=] (cl::sycl::nd_item<3> itm) { - const Index_type e = itm.get_group(0); + const Index_type e = itm.get_group(2); double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); double *sm1 = sm1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); @@ -194,7 +194,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { RAJA::LaunchPolicy>; using outer_x = - RAJA::LoopPolicy; + RAJA::LoopPolicy; using inner_x = RAJA::LoopPolicy; diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp new file mode 100644 index 000000000..e760098c3 --- /dev/null +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -0,0 +1,443 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "DIFFUSION3DPA.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t block_size > +void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + DIFFUSION3DPA_DATA_SETUP; + + const ::sycl::range<3> blockSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); + const ::sycl::range<3> gridSize(DPA_Q1D*NE,DPA_Q1D,DPA_Q1D); + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&](cl::sycl::handler& h) { + + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sBG_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1*MD1), h); + + auto sm0_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm0_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + auto sm1_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); + + sycl::stream out(1024, 256, h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + [=] (cl::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(0); + + double *sBG = sBG_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double *sm0_0 = sm0_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_1 = sm0_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm0_2 = sm0_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_0 = sm1_0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_1 = sm1_1_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + double *sm1_2 = sm1_2_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + 
DIFFUSION3DPA_1; + } + } + } + + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_2; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_3; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_4; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, DPA_Q1D) { + DIFFUSION3DPA_5; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + if (itm.get_local_id(0) == 0) + { + SYCL_FOREACH_THREAD(d, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(q, 2, DPA_Q1D) { + DIFFUSION3DPA_6; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(qy, 1, DPA_Q1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_7; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(qz, 0, DPA_Q1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_8; + } + } + } + itm.barrier(::sycl::access::fence_space::local_space); + SYCL_FOREACH_THREAD(dz, 0, DPA_D1D) { + SYCL_FOREACH_THREAD(dy, 1, DPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, DPA_D1D) { + DIFFUSION3DPA_9; + } + } + } + + + }); + }); + + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = + RAJA::LaunchPolicy>; + + using outer_x = + RAJA::LoopPolicy; + + using inner_x = + RAJA::LoopPolicy; + + using inner_y = + RAJA::LoopPolicy; + + using inner_z = + RAJA::LoopPolicy; + + size_t shmem = 0; + { + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + const size_t local_mats = 6; + shmem += MQ1*MD1*sizeof(double) + local_mats*MDQ*MDQ*MDQ*sizeof(double); + } + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(DPA_Q1D, DPA_Q1D, DPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + const bool symmetric = true; + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](int e) { + + //Redefine inside the lambda to keep consistent with base version + constexpr int MQ1 = DPA_Q1D; + constexpr int MD1 = DPA_D1D; + constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; + + double *sBG = ctx.getSharedMemory(MQ1*MD1); + double *sm0_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm0_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_0 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_1 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + double *sm1_2 = ctx.getSharedMemory(MDQ*MDQ*MDQ); + + double (*B)[MD1] = (double (*)[MD1]) sBG; + double (*G)[MD1] = (double (*)[MD1]) sBG; + double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; + double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; + + double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_0); + double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0_1); + double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_0); + double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_1); + double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1_2); + double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_0); + double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_1); + double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0_2); + double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_0); + double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_1); + double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1_2); + double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_0); + double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_1); + double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0_2); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_1; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_2; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_3; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_4; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qx) { + + DIFFUSION3DPA_5; + + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int RAJA_UNUSED_ARG(dz)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int q) { + + DIFFUSION3DPA_6; + + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); //RAJA::loop + } // lambda 
(dz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_7; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), + [&](int qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_8; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (qz) + ); //RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), + [&](int dx) { + + DIFFUSION3DPA_9; + + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); //RAJA::loop + } // lambda (dz) + ); //RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n DIFFUSION3DPA : Unknown Sycl variant id = " << vid + << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(DIFFUSION3DPA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL From eced4cbef8b8d05e977601a072425a18da6b3223 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 09:46:48 -0700 Subject: [PATCH 306/454] add MASS3DEA-SYCL --- src/apps/CMakeLists.txt | 1 + src/apps/DIFFUSION3DPA-Sycl.cpp | 15 ++- src/apps/MASS3DEA-Sycl.cpp | 196 ++++++++++++++++++++++++++++++++ src/apps/MASS3DEA.cpp | 7 +- src/apps/MASS3DEA.hpp | 12 +- 5 files changed, 216 insertions(+), 15 deletions(-) create mode 100644 src/apps/MASS3DEA-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index cea021bda..0ed6e9b81 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -70,6 +70,7 @@ blt_add_library( MASS3DEA-Seq.cpp MASS3DEA-OMP.cpp MASS3DEA-OMPTarget.cpp + MASS3DEA-Sycl.cpp MASS3DPA.cpp MASS3DPA-Cuda.cpp MASS3DPA-Hip.cpp diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp index e760098c3..ccea9f5ad 100644 --- a/src/apps/DIFFUSION3DPA-Sycl.cpp +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -31,13 +31,13 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { DIFFUSION3DPA_DATA_SETUP; - const ::sycl::range<3> blockSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); - const ::sycl::range<3> gridSize(DPA_Q1D*NE,DPA_Q1D,DPA_Q1D); - switch (vid) { case Base_SYCL: { + const ::sycl::range<3> blockSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); + const ::sycl::range<3> gridSize(DPA_Q1D,DPA_Q1D,DPA_Q1D*NE); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -62,7 +62,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { (cl::sycl::nd_range<3>(gridSize, blockSize), [=] (cl::sycl::nd_item<3> itm) { - const Index_type e = itm.get_group(0); + const Index_type e = itm.get_group(2); double *sBG = sBG_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); @@ -168,7 +168,6 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { } } - }); }); @@ -197,7 +196,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { using inner_z = RAJA::LoopPolicy; - + size_t shmem = 0; { constexpr int MQ1 = DPA_Q1D; @@ -221,7 +220,7 @@ void 
DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) {
       RAJA::loop(ctx, RAJA::RangeSegment(0, NE),
         [&](int e) {

-          //Redefine inside the lambda to keep consistent with base version
+          //Redefine inside the lambda to keep consistent with base version
           constexpr int MQ1 = DPA_Q1D;
           constexpr int MD1 = DPA_D1D;
           constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
@@ -416,7 +415,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) {

           } // lambda (e)
         ); // RAJA::loop
-
+
   } // outer lambda (ctx)
   ); // RAJA::launch

diff --git a/src/apps/MASS3DEA-Sycl.cpp b/src/apps/MASS3DEA-Sycl.cpp
new file mode 100644
index 000000000..8fccffdc3
--- /dev/null
+++ b/src/apps/MASS3DEA-Sycl.cpp
@@ -0,0 +1,196 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MASS3DEA.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+
+namespace rajaperf {
+namespace apps {
+
+template < size_t block_size >
+void MASS3DEA::runSyclVariantImpl(VariantID vid) {
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  MASS3DEA_DATA_SETUP;
+
+  switch (vid) {
+
+    case Base_SYCL: {
+
+      const ::sycl::range<3> blockSize(MEA_Q1D, MEA_Q1D, MEA_Q1D);
+      const ::sycl::range<3> gridSize(MEA_Q1D,MEA_Q1D,MEA_Q1D*NE);
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        constexpr size_t shmem = 0;
+        qu->submit([&](cl::sycl::handler& h) {
+
+          ::sycl::local_accessor s_B(::sycl::range<2>(MEA_Q1D,MEA_D1D),h);
+          ::sycl::local_accessor s_D(::sycl::range<3>(MEA_Q1D,MEA_Q1D,MEA_Q1D),h);
+
+          h.parallel_for
+            (cl::sycl::nd_range<3>(gridSize, blockSize),
+             [=] (cl::sycl::nd_item<3> itm) {
+
+               const Index_type e = itm.get_group(2);
+
+               SYCL_FOREACH_THREAD(iz, 0, 1) {
+                 SYCL_FOREACH_THREAD(d, 2, MEA_D1D) {
+                   SYCL_FOREACH_THREAD(q, 1, MEA_Q1D) {
+                     MASS3DEA_1
+                   }
+                 }
+               }
+
+               //not needed as we dynamically allocate shared memory in sycl
+               //MASS3DEA_2
+
+               SYCL_FOREACH_THREAD(k1, 2, MEA_Q1D) {
+                 SYCL_FOREACH_THREAD(k2, 1, MEA_Q1D) {
+                   SYCL_FOREACH_THREAD(k3, 0, MEA_Q1D) {
+                     MASS3DEA_3
+                   }
+                 }
+               }
+
+               itm.barrier(::sycl::access::fence_space::local_space);
+
+               SYCL_FOREACH_THREAD(i1, 2, MEA_D1D) {
+                 SYCL_FOREACH_THREAD(i2, 1, MEA_D1D) {
+                   SYCL_FOREACH_THREAD(i3, 0, MEA_D1D) {
+                     MASS3DEA_4
+                   }
+                 }
+               }
+
+             });
+        });
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_SYCL: {
+
+      constexpr bool async = true;
+
+      using launch_policy = RAJA::LaunchPolicy>;
+
+      using outer_x = RAJA::LoopPolicy;
+
+      using inner_x = RAJA::LoopPolicy;
+
+      using inner_y = RAJA::LoopPolicy;
+
+      using inner_z = RAJA::LoopPolicy;
+
+      constexpr size_t shmem = (MEA_Q1D*MEA_D1D + MEA_Q1D*MEA_Q1D*MEA_Q1D)*sizeof(double);
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::launch( res,
+          RAJA::LaunchParams(RAJA::Teams(NE),
+                             RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D), shmem),
+          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+            RAJA::loop(ctx, RAJA::RangeSegment(0, NE),
+              [&](int e) {
+
+                double * s_B_ptr = ctx.getSharedMemory(MEA_Q1D*MEA_D1D);
+                double * s_D_ptr = ctx.getSharedMemory(MEA_Q1D*MEA_Q1D*MEA_Q1D);
+
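+                // Each ctx.getSharedMemory() call carves the next chunk out
+                // of the one dynamic shared-memory pool sized by the shmem
+                // argument to LaunchParams above; the raw pointers are then
+                // reshaped into fixed-extent arrays below, playing the role
+                // of the static RAJA_TEAM_SHARED arrays in the CUDA/HIP
+                // variants.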
= (double (*)[MEA_Q1D][MEA_Q1D]) s_B_ptr; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](int ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int q) { + MASS3DEA_1 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + //not needed as we dynamicaly allocate shared memory in sycl + //MASS3DEA_2 + + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k1) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k2) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_Q1D), + [&](int k3) { + MASS3DEA_3 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i1) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i2) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MEA_D1D), + [&](int i3) { + MASS3DEA_4 + } + ); // RAJA::loop + } + ); // RAJA::loop + } + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DEA : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DEA, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 7dfbbb47c..503ccd72d 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -29,7 +29,7 @@ MASS3DEA::MASS3DEA(const RunParams& params) setDefaultReps(1); const int ea_mat_entries = MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D; - + m_NE = std::max(getTargetProblemSize()/(ea_mat_entries), Index_type(1)); setActualProblemSize( m_NE*ea_mat_entries); @@ -42,7 +42,7 @@ MASS3DEA::MASS3DEA(const RunParams& params) ea_mat_entries*m_NE*sizeof(Real_type) ); // M_e setFLOPsPerRep(m_NE * 7 * ea_mat_entries); - + setUsesFeature(Launch); setVariantDefined( Base_Seq ); @@ -57,6 +57,9 @@ MASS3DEA::MASS3DEA(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + } MASS3DEA::~MASS3DEA() diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 0fbdb03e4..e5a3a5165 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -104,11 +104,9 @@ #define MASS3DEA_1 s_B[q][d] = B_MEA_(q, d); #define MASS3DEA_2 \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ RAJA_TEAM_SHARED double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_2_CPU \ - double(*l_B)[MEA_D1D] = (double(*)[MEA_D1D])s_B; \ double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; #define MASS3DEA_3 s_D[k1][k2][k3] = D_MEA_(k1, k2, k3, e); @@ -123,9 +121,9 @@ for (int k2 = 0; k2 < MEA_Q1D; ++k2) { \ for (int k3 = 0; k3 < MEA_Q1D; ++k3) { \ \ - val += l_B[k1][i1] * l_B[k1][j1] * l_B[k2][i2] \ - * l_B[k2][j2] * \ - l_B[k3][i3] * l_B[k3][j3] * s_D[k1][k2][k3]; \ + val += s_B[k1][i1] * s_B[k1][j1] * s_B[k2][i2] \ + * s_B[k2][j2] * \ + s_B[k3][i3] * s_B[k3][j3] * s_D[k1][k2][k3]; \ } \ } \ } \ @@ -154,14 +152,18 @@ class MASS3DEA : public KernelBase { void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); 
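+  // note: the runSyclVariant / setSyclTuningDefinitions declarations above are
+  // implemented by the RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DEA, Sycl)
+  // macro invoked in MASS3DEA-Sycl.cpp, which dispatches to runSyclVariantImpl below.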
template void runCudaVariantImpl(VariantID vid); template void runHipVariantImpl(VariantID vid); + template + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = MEA_D1D * MEA_D1D * MEA_D1D; From c9b2b9b0006dc76d9b891760aea4c1bc12fa167d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 10:05:01 -0700 Subject: [PATCH 307/454] fixed indexing for convection and mat mat shared --- src/apps/CONVECTION3DPA-Sycl.cpp | 38 +++++++++++++++---------------- src/basic/MAT_MAT_SHARED-Sycl.cpp | 17 +++++++------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp index fd7bd1b99..87ad04a35 100644 --- a/src/apps/CONVECTION3DPA-Sycl.cpp +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -81,11 +81,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { SYCL_FOREACH_THREAD(dy,1,CPA_D1D) { - SYCL_FOREACH_THREAD(dx,0,CPA_D1D) + SYCL_FOREACH_THREAD(dx,2,CPA_D1D) { CONVECTION3DPA_1; } @@ -93,11 +93,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { SYCL_FOREACH_THREAD(dy,1,CPA_D1D) { - SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { CONVECTION3DPA_2; } @@ -105,11 +105,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { - SYCL_FOREACH_THREAD(qx,1,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { - SYCL_FOREACH_THREAD(qy,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) { CONVECTION3DPA_3; } @@ -117,11 +117,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) { - SYCL_FOREACH_THREAD(qz,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qz,0,CPA_Q1D) { CONVECTION3DPA_4; } @@ -129,11 +129,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qz,2,CPA_Q1D) + SYCL_FOREACH_THREAD(qz,0,CPA_Q1D) { SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) { - SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { CONVECTION3DPA_5; } @@ -141,11 +141,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { SYCL_FOREACH_THREAD(qy,1,CPA_Q1D) { - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { CONVECTION3DPA_6; } @@ -153,9 +153,9 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { - SYCL_FOREACH_THREAD(qx,0,CPA_Q1D) + SYCL_FOREACH_THREAD(qx,2,CPA_Q1D) { SYCL_FOREACH_THREAD(dy,1,CPA_D1D) { @@ -165,11 +165,11 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { } itm.barrier(::sycl::access::fence_space::local_space); - SYCL_FOREACH_THREAD(dz,2,CPA_D1D) + SYCL_FOREACH_THREAD(dz,0,CPA_D1D) { SYCL_FOREACH_THREAD(dy,1,CPA_D1D) { - 
SYCL_FOREACH_THREAD(dx,0,CPA_D1D)
+        SYCL_FOREACH_THREAD(dx,2,CPA_D1D)
         {
           CONVECTION3DPA_8;
         }
       }
     }
@@ -197,13 +197,13 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) {
       RAJA::LoopPolicy<RAJA::sycl_group_2_direct>;
 
     using inner_x =
-      RAJA::LoopPolicy<RAJA::sycl_local_0_loop>;
+      RAJA::LoopPolicy<RAJA::sycl_local_2_loop>;
 
     using inner_y =
       RAJA::LoopPolicy<RAJA::sycl_local_1_loop>;
 
     using inner_z =
-      RAJA::LoopPolicy<RAJA::sycl_local_2_loop>;
+      RAJA::LoopPolicy<RAJA::sycl_local_0_loop>;
 
     //Calculate amount of shared memory needed
     size_t shmem = 0;
diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp
index efb9d0a47..927a0bebc 100644
--- a/src/basic/MAT_MAT_SHARED-Sycl.cpp
+++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp
@@ -31,8 +31,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid)
   const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, tile_size);
   const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, tile_size);
 
-  const ::sycl::range<2> blockSize(tile_size, tile_size);
-  const ::sycl::range<2> gridSize(Nx*tile_size, Ny*tile_size);
+  //Right most is the fastest index
+  const ::sycl::range<3> blockSize(1, tile_size, tile_size);
+  const ::sycl::range<3> gridSize(1, Ny*tile_size, Nx*tile_size);
 
   constexpr size_t shmem = tile_size * tile_size;
 
@@ -53,12 +54,12 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid)
           ::sycl::local_accessor<double, 2> Cs(::sycl::range<2>(tile_size, tile_size), h);
 
           h.parallel_for
-            (cl::sycl::nd_range<2>(gridSize, blockSize),
-             [=] (cl::sycl::nd_item<2> itm) {
+            (cl::sycl::nd_range<3>(gridSize, blockSize),
+             [=] (cl::sycl::nd_item<3> itm) {
 
-              Index_type tx = itm.get_local_id(0);
+              Index_type tx = itm.get_local_id(2);
               Index_type ty = itm.get_local_id(1);
-              Index_type bx = itm.get_group(0);
+              Index_type bx = itm.get_group(2);
               Index_type by = itm.get_group(1);
 
               MAT_MAT_SHARED_BODY_1(tile_size)
@@ -93,11 +94,11 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid)
 
       using launch_policy = RAJA::LaunchPolicy<RAJA::sycl_launch_t<async>>;
 
-      using teams_x = RAJA::LoopPolicy<RAJA::sycl_group_0_direct>;
+      using teams_x = RAJA::LoopPolicy<RAJA::sycl_group_2_direct>;
 
       using teams_y = RAJA::LoopPolicy<RAJA::sycl_group_1_direct>;
 
-      using threads_x = RAJA::LoopPolicy<RAJA::sycl_local_0_direct>;
+      using threads_x = RAJA::LoopPolicy<RAJA::sycl_local_2_direct>;
 
       using threads_y = RAJA::LoopPolicy<RAJA::sycl_local_1_direct>;
 

From ee01764ecd2c473bf7b0990db9ace6f3a60b5322 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Wed, 17 Apr 2024 10:08:34 -0700
Subject: [PATCH 308/454] fix indexing for mass3DPA

---
 src/apps/MASS3DPA-Sycl.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp
index 7d2380eca..1b49ef02c 100644
--- a/src/apps/MASS3DPA-Sycl.cpp
+++ b/src/apps/MASS3DPA-Sycl.cpp
@@ -31,8 +31,8 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) {
 
   MASS3DPA_DATA_SETUP;
 
-  const ::sycl::range<3> blockSize(MPA_Q1D, MPA_Q1D, 1);
-  const ::sycl::range<3> gridSize(NE*MPA_Q1D,MPA_Q1D,1);
+  const ::sycl::range<3> blockSize(1, MPA_Q1D, MPA_Q1D);
+  const ::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE);
 
   switch (vid) {
 
@@ -55,7 +55,7 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) {
         (cl::sycl::nd_range<3>(gridSize, blockSize),
          [=] (cl::sycl::nd_item<3> itm) {
 
-           const Index_type e = itm.get_group(0);
+           const Index_type e = itm.get_group(2);
 
           double *sDQ = sDQ_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
          double *sm0 = sm0_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
@@ -72,56 +72,56 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) {
           double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1;
 
           SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) {
-            SYCL_FOREACH_THREAD(dx, 0, MPA_D1D){
+            SYCL_FOREACH_THREAD(dx, 2, MPA_D1D){
               MASS3DPA_1
             }
-            SYCL_FOREACH_THREAD(dx, 0, MPA_Q1D) {
+            SYCL_FOREACH_THREAD(dx, 2, MPA_Q1D) {
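+              // note: in SYCL the right-most nd-range dimension varies fastest
+              // (like CUDA x), so the unit-stride dx index now lives on
+              // dimension 2 and the element index comes from get_group(2)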
MASS3DPA_2 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { MASS3DPA_3 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { MASS3DPA_4 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(qx, 0, MPA_Q1D) { + SYCL_FOREACH_THREAD(qx, 2, MPA_Q1D) { MASS3DPA_5 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(d, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(q, 0, MPA_Q1D) { + SYCL_FOREACH_THREAD(q, 2, MPA_Q1D) { MASS3DPA_6 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(qy, 1, MPA_Q1D) { - SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { MASS3DPA_7 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { MASS3DPA_8 } } itm.barrier(::sycl::access::fence_space::local_space); SYCL_FOREACH_THREAD(dy, 1, MPA_D1D) { - SYCL_FOREACH_THREAD(dx, 0, MPA_D1D) { + SYCL_FOREACH_THREAD(dx, 2, MPA_D1D) { MASS3DPA_9 } } @@ -141,9 +141,9 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { using launch_policy = RAJA::LaunchPolicy>; - using outer_x = RAJA::LoopPolicy; + using outer_x = RAJA::LoopPolicy; - using inner_x = RAJA::LoopPolicy; + using inner_x = RAJA::LoopPolicy; using inner_y = RAJA::LoopPolicy; @@ -163,7 +163,7 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { RAJA::launch( res, RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MPA_Q1D, MPA_Q1D, 1), shmem), + RAJA::Threads(MPA_Q1D, MPA_Q1D), shmem), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, NE), From 1dbe24e3379cf31a1761fcd2c23d6a76f5b1a476 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 10:09:50 -0700 Subject: [PATCH 309/454] clean up diffusion --- src/apps/DIFFUSION3DPA-Sycl.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp index ccea9f5ad..2a10c9370 100644 --- a/src/apps/DIFFUSION3DPA-Sycl.cpp +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -56,8 +56,6 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { auto sm1_1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); auto sm1_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); - sycl::stream out(1024, 256, h); - h.parallel_for (cl::sycl::nd_range<3>(gridSize, blockSize), [=] (cl::sycl::nd_item<3> itm) { From e57edc151fd6efb21280e7bb387cad32c41b1b9a Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 18 Apr 2024 13:37:26 -0700 Subject: [PATCH 310/454] Fix issue with sycl exec pol --- src/lcals/HYDRO_2D-Sycl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index a37d1fe62..471c13aca 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -103,8 +103,8 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { using EXECPOL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< - RAJA::statement::For<0, RAJA::sycl_global_1, - RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0> > > From 6d45fa581d14d55a7e7ed9473c41a2bcd060a54d Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 18 Apr 2024 15:58:54 -0700 Subject: [PATCH 311/454] Check in SYCL variants of several kernels. Base SYCL vairant is good for each; checksum issues with RAJA_SYCL variants --- src/polybench/CMakeLists.txt | 4 + src/polybench/POLYBENCH_2MM-Sycl.cpp | 3 +- src/polybench/POLYBENCH_3MM-Sycl.cpp | 210 +++++++++++++++++++++++ src/polybench/POLYBENCH_3MM.cpp | 3 + src/polybench/POLYBENCH_3MM.hpp | 5 + src/polybench/POLYBENCH_ATAX-Sycl.cpp | 155 +++++++++++++++++ src/polybench/POLYBENCH_ATAX.cpp | 3 + src/polybench/POLYBENCH_ATAX.hpp | 5 + src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp | 181 +++++++++++++++++++ src/polybench/POLYBENCH_FDTD_2D.cpp | 3 + src/polybench/POLYBENCH_FDTD_2D.hpp | 5 + src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp | 146 ++++++++++++++++ src/polybench/POLYBENCH_HEAT_3D.cpp | 3 + src/polybench/POLYBENCH_HEAT_3D.hpp | 5 + 14 files changed, 730 insertions(+), 1 deletion(-) create mode 100644 src/polybench/POLYBENCH_3MM-Sycl.cpp create mode 100644 src/polybench/POLYBENCH_ATAX-Sycl.cpp create mode 100644 src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp create mode 100644 src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 2c33a0fce..62ddc37cf 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -21,6 +21,7 @@ blt_add_library( POLYBENCH_3MM-Cuda.cpp POLYBENCH_3MM-OMP.cpp POLYBENCH_3MM-OMPTarget.cpp + POLYBENCH_3MM-Sycl.cpp POLYBENCH_ADI.cpp POLYBENCH_ADI-Seq.cpp POLYBENCH_ADI-Hip.cpp @@ -33,12 +34,14 @@ blt_add_library( POLYBENCH_ATAX-Cuda.cpp POLYBENCH_ATAX-OMP.cpp POLYBENCH_ATAX-OMPTarget.cpp + POLYBENCH_ATAX-Sycl.cpp POLYBENCH_FDTD_2D.cpp POLYBENCH_FDTD_2D-Seq.cpp POLYBENCH_FDTD_2D-Hip.cpp POLYBENCH_FDTD_2D-Cuda.cpp POLYBENCH_FDTD_2D-OMP.cpp POLYBENCH_FDTD_2D-OMPTarget.cpp + POLYBENCH_FDTD_2D-Sycl.cpp POLYBENCH_FLOYD_WARSHALL.cpp POLYBENCH_FLOYD_WARSHALL-Seq.cpp POLYBENCH_FLOYD_WARSHALL-Hip.cpp @@ -69,6 +72,7 @@ blt_add_library( POLYBENCH_HEAT_3D-Cuda.cpp POLYBENCH_HEAT_3D-OMP.cpp POLYBENCH_HEAT_3D-OMPTarget.cpp + POLYBENCH_HEAT_3D-Sycl.cpp POLYBENCH_JACOBI_1D.cpp POLYBENCH_JACOBI_1D-Seq.cpp POLYBENCH_JACOBI_1D-Hip.cpp diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 3582642c8..ee2125584 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -28,10 +28,11 @@ namespace polybench #define in_wg_sz (32) #define out_wg_sz (work_group_size / in_wg_sz) + template void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) { - const unsigned long run_reps = getRunReps(); + const Index_type run_reps = getRunReps(); auto res{getSyclResource()}; auto qu = res.get_queue(); diff --git 
a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp
new file mode 100644
index 000000000..101bb2596
--- /dev/null
+++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp
@@ -0,0 +1,210 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_3MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define in_wg_sz (32)
+#define out_wg_sz (work_group_size / in_wg_sz)
+
+
+template < size_t work_group_size >
+void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_3MM_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      sycl::range<2> global_dim1(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz));
+
+      sycl::range<2> global_dim2(out_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<2> global_dim3(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz),
+                                 in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz));
+
+      sycl::range<2> wkgroup_dim(out_wg_sz, in_wg_sz);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<2>( global_dim1, wkgroup_dim),
+                       [=] (sycl::nd_item<2> item) {
+
+          Index_type i = item.get_global_id(0);
+          Index_type j = item.get_global_id(1);
+
+          if (i < ni && j < nj) {
+            POLYBENCH_3MM_BODY1;
+            for (Index_type k=0; k < nk; ++k) {
+              POLYBENCH_3MM_BODY2;
+            }
+            POLYBENCH_3MM_BODY3;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<2>( global_dim2, wkgroup_dim),
+                       [=] (sycl::nd_item<2> item) {
+
+          Index_type j = item.get_global_id(0);
+          Index_type l = item.get_global_id(1);
+
+          if (j < nj && l < nl) {
+            POLYBENCH_3MM_BODY4;
+            for (Index_type m=0; m < nm; ++m) {
+              POLYBENCH_3MM_BODY5;
+            }
+            POLYBENCH_3MM_BODY6;
+          }
+
+        });
+      });
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<2>( global_dim3, wkgroup_dim),
+                       [=] (sycl::nd_item<2> item) {
+
+          Index_type i = item.get_global_id(0);
+          Index_type l = item.get_global_id(1);
+
+          if (i < ni && l < nl) {
+            POLYBENCH_3MM_BODY7;
+            for (Index_type j=0; j < nj; ++j) {
+              POLYBENCH_3MM_BODY8;
+            }
+            POLYBENCH_3MM_BODY9;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_3MM_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0,
+            RAJA::statement::For<1, RAJA::sycl_global_1,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::seq_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >
+      >;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::kernel_param<EXEC_POL>(
+        RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                         RAJA::RangeSegment{0, nj},
+                         RAJA::RangeSegment{0, nk}),
+        RAJA::tuple<Real_type>{0.0},
+
+
[=] (Real_type &dot) { + POLYBENCH_3MM_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, Index_type k, + Real_type &dot) { + POLYBENCH_3MM_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j, + Real_type &dot) { + POLYBENCH_3MM_BODY3_RAJA; + } + + ); + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, nj}, + RAJA::RangeSegment{0, nl}, + RAJA::RangeSegment{0, nm}), + RAJA::tuple{0.0}, + + [=] (Real_type &dot) { + POLYBENCH_3MM_BODY4_RAJA; + }, + [=] (Index_type j, Index_type l, Index_type m, + Real_type &dot) { + POLYBENCH_3MM_BODY5_RAJA; + }, + [=] (Index_type j, Index_type l, + Real_type &dot) { + POLYBENCH_3MM_BODY6_RAJA; + } + + ); + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nl}, + RAJA::RangeSegment{0, nj}), + RAJA::tuple{0.0}, + + [=] (Real_type &dot) { + POLYBENCH_3MM_BODY7_RAJA; + }, + [=] (Index_type i, Index_type l, Index_type j, + Real_type &dot) { + POLYBENCH_3MM_BODY8_RAJA; + }, + [=] (Index_type i, Index_type l, + Real_type &dot) { + POLYBENCH_3MM_BODY9_RAJA; + } + + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_3MM : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_3MM, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index d05332c8f..bbb6e072f 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -86,6 +86,9 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_3MM::~POLYBENCH_3MM() diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index a4215289d..0d0cf79af 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -153,13 +153,18 @@ class POLYBENCH_3MM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp new file mode 100644 index 000000000..5081c99eb --- /dev/null +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -0,0 +1,155 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ATAX.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ATAX_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; + } + POLYBENCH_ATAX_BODY3; + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; + } + POLYBENCH_ATAX_BODY6; + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ATAX_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<1, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>, + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<1>, RAJA::Params<0>> + > + > + >; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY2_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_ATAX_BODY3_RAJA; + } + + ); + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY4_RAJA; + }, + [=] (Index_type i, Index_type j , Real_type &dot) { + POLYBENCH_ATAX_BODY5_RAJA; + }, + [=] (Index_type j, Real_type &dot) { + POLYBENCH_ATAX_BODY6_RAJA; + } + + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ATAX : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ATAX, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 693420ca1..71423d1ce 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -65,6 +65,9 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const 
RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ATAX::~POLYBENCH_ATAX() diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index 5e64d125f..e6d43bfbc 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -115,13 +115,18 @@ class POLYBENCH_ATAX : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp new file mode 100644 index 000000000..8613839d2 --- /dev/null +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -0,0 +1,181 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FDTD_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FDTD_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + const size_t grid_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(ny, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size1, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type j = item.get_global_id(0); + if (j < ny) { + POLYBENCH_FDTD_2D_BODY1; + } + + }); + }); + + sycl::range<2> global_dim234(i_wg_sz * RAJA_DIVIDE_CEILING_INT(nx, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(ny, j_wg_sz)); + + sycl::range<2> wkgroup_dim234(i_wg_sz, j_wg_sz); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type j = item.get_global_id(1); + + if (i > 0 && i < nx && j < ny) { + POLYBENCH_FDTD_2D_BODY2; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type j = item.get_global_id(1); + + if (i < nx && j > 0 && j < ny) { + POLYBENCH_FDTD_2D_BODY3; + } + + }); + 
}); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type j = item.get_global_id(1); + + if (i < nx-1 && j < ny-1) { + POLYBENCH_FDTD_2D_BODY4; + } + + }); + }); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FDTD_2D_VIEWS_RAJA; + + using EXEC_POL1 = RAJA::sycl_exec; + + using EXEC_POL234 = + RAJA::KernelPolicy< + RAJA::statement::SyclKernelAsync< + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (t = 0; t < tsteps; ++t) { + + RAJA::forall( RAJA::RangeSegment(0, ny), + [=] (Index_type j) { + POLYBENCH_FDTD_2D_BODY1_RAJA; + }); + + RAJA::kernel( + RAJA::make_tuple(RAJA::RangeSegment{1, nx}, + RAJA::RangeSegment{0, ny}), + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY2_RAJA; + } + ); + + RAJA::kernel( + RAJA::make_tuple(RAJA::RangeSegment{0, nx}, + RAJA::RangeSegment{1, ny}), + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY3_RAJA; + } + ); + + RAJA::kernel( + RAJA::make_tuple(RAJA::RangeSegment{0, nx-1}, + RAJA::RangeSegment{0, ny-1}), + [=] (Index_type i, Index_type j) { + POLYBENCH_FDTD_2D_BODY4_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FDTD_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FDTD_2D, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 575c40e3f..eeac18818 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -84,6 +84,9 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 2631b05c1..685c4cf40 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -113,13 +113,18 @@ class POLYBENCH_FDTD_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp new file mode 100644 index 000000000..ff98efa01 --- /dev/null +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -0,0 +1,146 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA 
Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_HEAT_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+  //
+  // Define work-group shape for SYCL execution
+  //
+#define k_wg_sz (32)
+#define j_wg_sz (work_group_size / k_wg_sz)
+#define i_wg_sz (1)
+
+
+template < size_t work_group_size >
+void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  POLYBENCH_HEAT_3D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        sycl::range<3> global_dim(i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz),
+                                  j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz),
+                                  k_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, k_wg_sz));
+
+        sycl::range<3> wkgroup_dim(i_wg_sz, j_wg_sz, k_wg_sz);
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY1;
+            }
+
+          });
+        });
+
+        qu->submit([&] (sycl::handler& h) {
+          h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim),
+                         [=] (sycl::nd_item<3> item) {
+
+            Index_type i = 1 + item.get_global_id(0);
+            Index_type j = 1 + item.get_global_id(1);
+            Index_type k = 1 + item.get_global_id(2);
+
+            if (i < N-1 && j < N-1 && k < N-1) {
+              POLYBENCH_HEAT_3D_BODY2;
+            }
+
+          });
+        });
+
+      }
+
+    }
+    stopTimer();
+
+  } else if (vid == RAJA_SYCL) {
+
+    POLYBENCH_HEAT_3D_VIEWS_RAJA;
+
+    using EXEC_POL =
+      RAJA::KernelPolicy<
+        RAJA::statement::SyclKernelAsync<
+          RAJA::statement::For<0, RAJA::sycl_global_0,
+            RAJA::statement::For<1, RAJA::sycl_global_1,
+              RAJA::statement::For<2, RAJA::sycl_global_2,
+                RAJA::statement::Lambda<0>
+              >
+            >
+          >
+        >
+      >;
+
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      for (Index_type t = 0; t < tsteps; ++t) {
+
+        RAJA::kernel<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY1_RAJA;
+          }
+        );
+
+        RAJA::kernel<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1},
+                           RAJA::RangeSegment{1, N-1}),
+          [=] (Index_type i, Index_type j, Index_type k) {
+            POLYBENCH_HEAT_3D_BODY2_RAJA;
+          }
+        );
+
+      }
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  POLYBENCH_HEAT_3D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_HEAT_3D, Sycl)
+
+} // end namespace polybench
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp
index 2c3487c14..430aef043 100644
--- a/src/polybench/POLYBENCH_HEAT_3D.cpp
+++ b/src/polybench/POLYBENCH_HEAT_3D.cpp
@@ -70,6 +70,9 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( Lambda_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+
setVariantDefined( RAJA_SYCL ); } POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 64c394630..590d7b326 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -122,13 +122,18 @@ class POLYBENCH_HEAT_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From f5c2567e687550eaec9e35082504e5c96e9df664 Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 19 Apr 2024 16:38:52 +0200 Subject: [PATCH 312/454] [WIP] [Woptim] Compiler wrappers + update packages + use spack environments (#438) * From RAJA: From RSC: Update lassen wrappers, update packages * From RSC: changes for Caliper * From RSC: Remove CUDA_ARCH, Fix MPI utility function (used by RAJAPerf) * From RSC: Restore basic MPI support in RAJAPerf * From RSC: RAJAPerf, Umpire, Caliper MPI handling like Axom * Make MPI test allocations overlapping * Move to Environments in RSC * Allocate only one node to avoid jobs scattered with different shared memory * From RSC: Fix: use slurm on toss4 cray machines * Allow failure on tioga for +rocm +mpi * Comment changes in build_and_test.sh * From RSC: add CARE * Point at RADIUSS Spack Configs @ main * Update RAJA with RSC changes merged in develop * Use new pci queue on tioga * Apply CI queue to top level allocation * Apply changes required by LC (token handling) --- .gitlab-ci.yml | 6 ++++-- .gitlab/custom-jobs-and-variables.yml | 4 ++-- .gitlab/jobs/tioga.yml | 1 + scripts/gitlab/build_and_test.sh | 10 ++++++++++ tpl/RAJA | 2 +- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 342c05c36..9f7020a98 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -72,7 +72,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -81,9 +81,11 @@ stages: pipeline_variables: true include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: 'v2023.12.0' + # ref: 'v2024.04.0' # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index c5a207f91..2735341b4 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -15,7 +15,7 @@ variables: # Ruby # Arguments for top level allocation - RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=20 --nodes=2" + RUBY_SHARED_ALLOC: "--exclusive --reservation=ci --time=40 --nodes=1" # Arguments for job level allocation # Note: We repeat 
the reservation, necessary when jobs are manually re-triggered. RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" @@ -46,7 +46,7 @@ variables: # Tioga # Arguments for top level allocation - TIOGA_SHARED_ALLOC: "--exclusive --time-limit=26m --nodes=1 -o per-resource.count=2" + TIOGA_SHARED_ALLOC: "--queue=pci --exclusive --time-limit=26m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for tioga diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index 688d428c0..decc3baa4 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -38,3 +38,4 @@ rocmcc_5_7_1_hip_openmp_mpi: variables: SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga + allow_failure: true diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 2f77a6bd0..430d50b4b 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -187,6 +187,15 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} + # We set the MPI tests command to allow overlapping. + # Shared allocation: Allows build_and_test.sh to run within a sub-allocation (see CI config). + # Use /dev/shm: Prevent MPI tests from running on a node where the build dir doesn't exist. + cmake_options="" + if [[ "${truehostname}" == "ruby" || "${truehostname}" == "poodle" ]] + then + cmake_options="-DBLT_MPI_COMMAND_APPEND:STRING=--overlap" + fi + date if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] then @@ -194,6 +203,7 @@ then fi $cmake_exe \ -C ${hostconfig_path} \ + ${cmake_options} \ ${project_dir} if ! $cmake_exe --build . -j ${core_counts[$truehostname]} then diff --git a/tpl/RAJA b/tpl/RAJA index 3ada0950b..784457ea1 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 3ada0950b0774ec907d30a9eceaf6af7478b833b +Subproject commit 784457ea194756bf2a5abaf65e0b9feb863994ff From 502c182c6f297bda86d66e6b4d71c2d8550c57e8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 13:47:55 -0700 Subject: [PATCH 313/454] Change SDomain setup Simplify and separate out calculations for each dimension --- src/apps/AppsData.hpp | 69 +++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index 36f033d65..d9e208c89 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -47,40 +47,57 @@ class ADomain ADomain() = delete; - ADomain( Index_type rzmax, Index_type ndims ) + ADomain( Index_type real_nodes_per_dim, Index_type ndims ) : ndims(ndims), NPNL(2), NPNR(1) { - imin = NPNL; - jmin = NPNL; - imax = rzmax + NPNR; - jmax = rzmax + NPNR; - jp = imax - imin + 1 + NPNL + NPNR; - n_real_zones = (imax - imin); - n_real_nodes = (imax+1 - imin); - - if ( ndims == 2 ) { - kmin = 0; - kmax = 0; - kp = 0; - nnalls = jp * (jmax - jmin + 1 + NPNL + NPNR) ; + int NPZL = NPNL - 1; + int NPZR = NPNR+1 - 1; + + if ( ndims >= 1 ) { + imin = NPNL; + imax = NPNL + real_nodes_per_dim-1; + nnalls = (imax+1 - imin + NPNL + NPNR); + n_real_zones = (imax - imin); + n_real_nodes = (imax+1 - imin); + } else { + imin = 0; + imax = 0; + nnalls = 0; + } + + if ( ndims >= 2 ) { + jmin = NPNL; + jmax = NPNL + real_nodes_per_dim-1; + jp = nnalls; + nnalls *= (jmax+1 - jmin + NPNL + NPNR); n_real_zones *= (jmax - jmin); n_real_nodes *= (jmax+1 - jmin); - } else if ( ndims == 3 ) { + } else { + jmin = 
0; + jmax = 0; + jp = 0; + } + + if ( ndims >= 3 ) { kmin = NPNL; - kmax = rzmax + NPNR; - kp = jp * (jmax - jmin + 1 + NPNL + NPNR); - nnalls = kp * (kmax - kmin + 1 + NPNL + NPNR) ; - n_real_zones *= (jmax - jmin) * (kmax - kmin); - n_real_nodes *= (jmax+1 - jmin) * (kmax+1 - kmin); + kmax = NPNL + rzmax-1; + kp = nnalls; + nnalls *= (kmax+1 - kmin + NPNL + NPNR); + n_real_zones *= (kmax - kmin); + n_real_nodes *= (kmax+1 - kmin); + } else { + kmin = 0; + kmax = 0; + kp = 0; } - fpn = 0; - lpn = nnalls - 1; - frn = fpn + NPNL * (kp + jp) + NPNL; - lrn = lpn - NPNR * (kp + jp) - NPNR; + frn = kmin*kp + jmin*jp + imin; + lrn = kmax*kp + jmax*jp + imax; + fpn = (kmin - NPNL)*kp + (jmin - NPNL)*jp + (imin - NPNL); + lpn = (kmax + NPNR)*kp + (jmax + NPNR)*jp + (imax + NPNR); - fpz = frn - jp - kp - 1; - lpz = lrn; + fpz = (kmin - NPZL)*kp + (jmin - NPZL)*jp + (imin - NPZL); + lpz = (kmax-1 + NPZR)*kp + (jmax-1 + NPZR)*jp + (imax-1 + NPZR); } ~ADomain() From daabdf9095fb45c72eef4e127f992a3c4cdc05df Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 13:49:53 -0700 Subject: [PATCH 314/454] simplify setting mesh positions Use a loop over domains instead of a loop over zones --- src/apps/AppsData.cpp | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index f93a13154..bf38c23ee 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -104,20 +104,13 @@ void setMeshPositions_2d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x1, x2, x3, x4; - Real_ptr y1, y2, y3, y4; - NDSET2D(domain.jp, x, x1,x2,x3,x4) ; - NDSET2D(domain.jp, y, y1,y2,y3,y4) ; + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp ; - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp ; + x[in] = i*dx; - x3[iz] = x4[iz] = i*dx; - x1[iz] = x2[iz] = (i+1)*dx; - - y1[iz] = y4[iz] = j*dy; - y2[iz] = y3[iz] = (j+1)*dy; + y[in] = j*dy; } } @@ -150,26 +143,16 @@ void setMeshPositions_3d(Real_ptr x, Real_type dx, Index_type npnl = domain.NPNL; Index_type npnr = domain.NPNR; - Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; - Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; - Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; - NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp + k*kp ; + for (Index_type k = kmin - npnl; k < kmax+1 + npnr; k++) { + for (Index_type j = jmin - npnl; j < jmax+1 + npnr; j++) { + for (Index_type i = imin - npnl; i < imax+1 + npnr; i++) { + Index_type in = i + j*jp + k*kp ; - x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; - x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; + x[in] = i*dx; - y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; - y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; + y[in] = j*dy; - z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; - z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; + z[in] = k*dz; } } From 3bf5e2c36584e66191d2a8acd15417540b21d185 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 13:50:34 -0700 Subject: [PATCH 
315/454] Simplify making zone lists change names of params to better match terminology --- src/apps/AppsData.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index bf38c23ee..3ff4de9a7 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -38,10 +38,10 @@ void setRealZones_2d(Index_type* real_zones, for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp ; + Index_type iz = i + j*jp ; - Index_type id = (i-imin) + (j-jmin)*j_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride ; + real_zones[il] = iz; } } } @@ -73,10 +73,10 @@ void setRealZones_3d(Index_type* real_zones, for (Index_type k = kmin; k < kmax; k++) { for (Index_type j = jmin; j < jmax; j++) { for (Index_type i = imin; i < imax; i++) { - Index_type ip = i + j*jp + k*kp ; + Index_type iz = i + j*jp + k*kp ; - Index_type id = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; - real_zones[id] = ip; + Index_type il = (i-imin) + (j-jmin)*j_stride + (k-kmin)*k_stride ; + real_zones[il] = iz; } } } From 346d7c704eab1d85e19a078a4bfe0c627d1376d6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 13:57:39 -0700 Subject: [PATCH 316/454] fixup ADomain setup --- src/apps/AppsData.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index d9e208c89..a451a3073 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -80,7 +80,7 @@ class ADomain if ( ndims >= 3 ) { kmin = NPNL; - kmax = NPNL + rzmax-1; + kmax = NPNL + real_nodes_per_dim-1; kp = nnalls; nnalls *= (kmax+1 - kmin + NPNL + NPNR); n_real_zones *= (kmax - kmin); From d4010f036d104c58d30a9f501f45726c0e660253 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 14:20:37 -0700 Subject: [PATCH 317/454] Add print method for ADomain --- src/apps/AppsData.cpp | 34 ++++++++++++++++++++++++++++++++++ src/apps/AppsData.hpp | 4 ++++ 2 files changed, 38 insertions(+) diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp index 3ff4de9a7..facb3d592 100644 --- a/src/apps/AppsData.cpp +++ b/src/apps/AppsData.cpp @@ -16,6 +16,40 @@ namespace rajaperf namespace apps { + +std::ostream& operator<<(std::ostream& stream, const ADomain& domain) +{ + return stream + + << "ADomain" + + << " ndims " << domain.ndims + << " NPNL " << domain.NPNL + << " NPNR " << domain.NPNR + + << " imin " << domain.imin + << " jmin " << domain.jmin + << " kmin " << domain.kmin + << " imax " << domain.imax + << " jmax " << domain.jmax + << " kmax " << domain.kmax + + << " jp " << domain.jp + << " kp " << domain.kp + << " nnalls " << domain.nnalls + + << " fpn " << domain.fpn + << " lpn " << domain.lpn + << " frn " << domain.frn + << " lrn " << domain.lrn + + << " fpz " << domain.fpz + << " lpz " << domain.lpz + + << " n_real_zones " << domain.n_real_zones + << " n_real_nodes " << domain.n_real_nodes ; +} + // // Set zone indices for 2d mesh. // diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index a451a3073..b1908b7a5 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -9,6 +9,8 @@ #ifndef RAJAPerf_AppsData_HPP #define RAJAPerf_AppsData_HPP +#include + #include "common/RPTypes.hpp" namespace rajaperf @@ -131,6 +133,8 @@ class ADomain Index_type n_real_nodes; }; +std::ostream& operator<<(std::ostream& stream, const ADomain& domain); + // // Routines for initializing real zone indices for 2d/3d domains. 
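[Editor's sketch — illustrative only, not part of the patch series. It mirrors
the index arithmetic introduced by the ADomain changes above, using a
hypothetical size (4 real nodes per dimension, 3D) in place of the suite's
RunParams-driven values; the member names follow the patch, everything else is
an assumption.]

#include <cassert>

int main()
{
  const int NPNL = 2, NPNR = 1;             // ghost widths, as in ADomain
  const int real_nodes_per_dim = 4;         // hypothetical problem size

  // Per-dimension node range, matching each branch of the new constructor.
  const int imin = NPNL;
  const int imax = NPNL + real_nodes_per_dim - 1;
  const int nodes_per_dim = (imax+1 - imin) + NPNL + NPNR;  // 7 here

  // Strides accumulate one dimension at a time, as in the patch.
  const int jp = nodes_per_dim;             // stride between j-planes
  const int kp = nodes_per_dim * nodes_per_dim;  // stride between k-slabs
  const int nnalls = kp * nodes_per_dim;    // total allocated nodes (343)

  // First/last real node and first/last allocated ("plus ghost") node.
  const int frn = imin*kp + imin*jp + imin;
  const int lrn = imax*kp + imax*jp + imax;
  const int fpn = (imin - NPNL)*kp + (imin - NPNL)*jp + (imin - NPNL);
  const int lpn = (imax + NPNR)*kp + (imax + NPNR)*jp + (imax + NPNR);

  assert(fpn == 0 && lpn == nnalls - 1);    // ghost layers span the allocation
  assert(fpn <= frn && frn < lrn && lrn <= lpn);
  return 0;
}

Computing each dimension independently like this is what lets the 1D/2D/3D
cases share one code path, which is the stated point of the refactor.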
// From 04ee794eb29350d8b0920cab0803d54cb1aafe3a Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 19 Apr 2024 15:26:54 -0700 Subject: [PATCH 318/454] Add ADI kernel SYCL variants. Also, change SYCL exec policies to sync for correctness. This needed to be investigated to resolve. --- src/polybench/CMakeLists.txt | 1 + src/polybench/POLYBENCH_2MM-Sycl.cpp | 4 + src/polybench/POLYBENCH_3MM-Sycl.cpp | 4 + src/polybench/POLYBENCH_ADI-Sycl.cpp | 171 +++++++++++++++++++++++ src/polybench/POLYBENCH_ADI.cpp | 3 + src/polybench/POLYBENCH_ADI.hpp | 5 + src/polybench/POLYBENCH_ATAX-Sycl.cpp | 28 ++-- src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp | 4 + src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp | 4 + 9 files changed, 216 insertions(+), 8 deletions(-) create mode 100644 src/polybench/POLYBENCH_ADI-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 62ddc37cf..8f282224e 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -28,6 +28,7 @@ blt_add_library( POLYBENCH_ADI-Cuda.cpp POLYBENCH_ADI-OMP.cpp POLYBENCH_ADI-OMPTarget.cpp + POLYBENCH_ADI-Sycl.cpp POLYBENCH_ATAX.cpp POLYBENCH_ATAX-Seq.cpp POLYBENCH_ATAX-Hip.cpp diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index ee2125584..a21a66481 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -97,7 +97,11 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0, RAJA::Params<0>>, diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp index 101bb2596..e63610e48 100644 --- a/src/polybench/POLYBENCH_3MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp @@ -117,7 +117,11 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0, RAJA::Params<0>>, diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp new file mode 100644 index 000000000..e83b05302 --- /dev/null +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -0,0 +1,171 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_ADI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +template < size_t work_group_size > +void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_ADI_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(n-2, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY2; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY3; + } + POLYBENCH_ADI_BODY4; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY5; + } + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0) + 1; + + if (i < n-1) { + POLYBENCH_ADI_BODY6; + for (Index_type j = 1; j < n-1; ++j) { + POLYBENCH_ADI_BODY7; + } + POLYBENCH_ADI_BODY8; + for (Index_type k = n-2; k >= 1; --k) { + POLYBENCH_ADI_BODY9; + } + } + + }); + }); + + } // tstep loop + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_ADI_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<3, RAJA::Segs<0,2>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 1; t <= tsteps; ++t) { + + RAJA::kernel( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + + [=] (Index_type i) { + POLYBENCH_ADI_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY3_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY4_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY5_RAJA; + } + ); + + RAJA::kernel( + RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, + RAJA::RangeSegment{1, n-1}, + RAJA::RangeStrideSegment{n-2, 0, -1}), + + [=] (Index_type i) { + POLYBENCH_ADI_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_ADI_BODY7_RAJA; + }, + [=] (Index_type i) { + POLYBENCH_ADI_BODY8_RAJA; + }, + [=] (Index_type i, Index_type k) { + POLYBENCH_ADI_BODY9_RAJA; + } + ); + + } // tstep loop + + } // run_reps + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_ADI : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_ADI, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 54997a63b..211f23be3 100644 --- 
a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -63,6 +63,9 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_ADI::~POLYBENCH_ADI() diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index 519a2de1f..613202509 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -195,13 +195,18 @@ class POLYBENCH_ADI : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp index 5081c99eb..21239bf28 100644 --- a/src/polybench/POLYBENCH_ATAX-Sycl.cpp +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -44,11 +44,13 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); - POLYBENCH_ATAX_BODY1; - for (Index_type j = 0; j < N; ++j ) { - POLYBENCH_ATAX_BODY2; + if (i < N) { + POLYBENCH_ATAX_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_ATAX_BODY2; + } + POLYBENCH_ATAX_BODY3; } - POLYBENCH_ATAX_BODY3; }); }); @@ -59,11 +61,13 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) Index_type j = item.get_global_id(0); - POLYBENCH_ATAX_BODY4; - for (Index_type i = 0; i < N; ++i ) { - POLYBENCH_ATAX_BODY5; + if (j < N) { + POLYBENCH_ATAX_BODY4; + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_ATAX_BODY5; + } + POLYBENCH_ATAX_BODY6; } - POLYBENCH_ATAX_BODY6; }); }); @@ -77,7 +81,11 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) using EXEC_POL1 = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::seq_exec, @@ -90,7 +98,11 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) using EXEC_POL2 = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<1, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>, RAJA::statement::For<0, RAJA::seq_exec, diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp index 8613839d2..130f82a00 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -119,7 +119,11 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) using EXEC_POL234 = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0> diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp index ff98efa01..278a803d5 100644 --- 
a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -93,7 +93,11 @@ void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::For<2, RAJA::sycl_global_2, From 07d39000f5f97f7e2e5662a9a575b3043797a5b6 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 19 Apr 2024 15:58:51 -0700 Subject: [PATCH 319/454] Add FLOYD_WARSHALL kernel --- src/polybench/CMakeLists.txt | 1 + .../POLYBENCH_FLOYD_WARSHALL-Sycl.cpp | 118 ++++++++++++++++++ src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 3 + src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 5 + 4 files changed, 127 insertions(+) create mode 100644 src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 8f282224e..f610dd9d1 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -49,6 +49,7 @@ blt_add_library( POLYBENCH_FLOYD_WARSHALL-Cuda.cpp POLYBENCH_FLOYD_WARSHALL-OMP.cpp POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp + POLYBENCH_FLOYD_WARSHALL-Sycl.cpp POLYBENCH_GEMM.cpp POLYBENCH_GEMM-Seq.cpp POLYBENCH_GEMM-Hip.cpp diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp new file mode 100644 index 000000000..401d1baed --- /dev/null +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
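//
// [Editor's note -- context, not part of the patch] FLOYD_WARSHALL carries a
// loop dependence on k: the distances produced in step k are inputs to step
// k+1, so only the i/j sweep may run in parallel. Both variants below respect
// this; the base variant keeps the k loop on the host and submits one 2D
// nd_range kernel per step, while the RAJA variant nests the SYCL kernel
// statement inside a sequential For<0>. A minimal sketch of the base shape,
// using the names defined just below:
//
//   for (Index_type k = 0; k < N; ++k) {      // serialized on the host
//     qu->submit([&] (sycl::handler& h) {     // one parallel i/j sweep per k
//       h.parallel_for(sycl::nd_range<2>(global_dim, wkgroup_dim),
//                      [=] (sycl::nd_item<2> item) { /* relax paths via k */ });
//     });
//   }
//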
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_FLOYD_WARSHALL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<2> global_dim(i_wg_sz * RAJA_DIVIDE_CEILING_INT(N, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(N, j_wg_sz)); + + sycl::range<2> wkgroup_dim(i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < N; ++k) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<2> item) { + + Index_type i = item.get_global_id(0); + Index_type j = item.get_global_id(1); + + if ( i < N && j < N ) { + POLYBENCH_FLOYD_WARSHALL_BODY; + } + + }); + }); + + } + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<1, RAJA::sycl_global_0, + RAJA::statement::For<2, RAJA::sycl_global_1, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + res, + [=] (Index_type k, Index_type i, Index_type j) { + POLYBENCH_FLOYD_WARSHALL_BODY_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_FLOYD_WARSHALL : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_FLOYD_WARSHALL, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 614ba9ed1..3e5844805 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -60,6 +60,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index 5a9d7f26e..618f6e0f6 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -76,13 +76,18 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void 
setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From ae1dcf8420d3125c4bae7c76fdc99d9db9227186 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Tue, 30 Apr 2024 13:21:39 -0700 Subject: [PATCH 320/454] Convert to use explicit resource --- src/polybench/POLYBENCH_2MM-Sycl.cpp | 6 ++++-- src/polybench/POLYBENCH_3MM-Sycl.cpp | 9 ++++++--- src/polybench/POLYBENCH_ADI-Sycl.cpp | 6 ++++-- src/polybench/POLYBENCH_ATAX-Sycl.cpp | 6 ++++-- src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp | 11 +++++++---- src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp | 6 ++++-- src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp | 10 +++++----- 7 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index a21a66481..81aea76e4 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -117,11 +117,12 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), RAJA::tuple{0.0}, + res, [=] (Real_type &dot) { POLYBENCH_2MM_BODY1_RAJA; @@ -136,11 +137,12 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) } ); - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), RAJA::tuple{0.0}, + res, [=] (Real_type &dot) { POLYBENCH_2MM_BODY4_RAJA; diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp index e63610e48..3f72d13cf 100644 --- a/src/polybench/POLYBENCH_3MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp @@ -137,11 +137,12 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nk}), RAJA::tuple{0.0}, + res, [=] (Real_type &dot) { POLYBENCH_3MM_BODY1_RAJA; @@ -157,11 +158,12 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) ); - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nm}), RAJA::tuple{0.0}, + res, [=] (Real_type &dot) { POLYBENCH_3MM_BODY4_RAJA; @@ -177,11 +179,12 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) ); - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, RAJA::RangeSegment{0, nj}), RAJA::tuple{0.0}, + res, [=] (Real_type &dot) { POLYBENCH_3MM_BODY7_RAJA; diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp index e83b05302..d5711f106 100644 --- a/src/polybench/POLYBENCH_ADI-Sycl.cpp +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -114,10 +114,11 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) for (Index_type t = 1; t <= tsteps; ++t) { - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, RAJA::RangeSegment{1, n-1}, RAJA::RangeStrideSegment{n-2, 0, -1}), + res, [=] (Index_type i) { POLYBENCH_ADI_BODY2_RAJA; @@ 
-133,10 +134,11 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) } ); - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, n-1}, RAJA::RangeSegment{1, n-1}, RAJA::RangeStrideSegment{n-2, 0, -1}), + res, [=] (Index_type i) { POLYBENCH_ADI_BODY6_RAJA; diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp index 21239bf28..23759f95c 100644 --- a/src/polybench/POLYBENCH_ATAX-Sycl.cpp +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -117,10 +117,11 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, + res, [=] (Index_type i, Real_type &dot) { POLYBENCH_ATAX_BODY1_RAJA; @@ -134,10 +135,11 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) ); - RAJA::kernel_param( + RAJA::kernel_param_resource( RAJA::make_tuple(RAJA::RangeSegment{0, N}, RAJA::RangeSegment{0, N}), RAJA::tuple{0.0}, + res, [=] (Index_type j, Real_type &dot) { POLYBENCH_ATAX_BODY4_RAJA; diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp index 130f82a00..6a9b41ffb 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -137,30 +137,33 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) for (t = 0; t < tsteps; ++t) { - RAJA::forall( RAJA::RangeSegment(0, ny), + RAJA::forall( res, RAJA::RangeSegment(0, ny), [=] (Index_type j) { POLYBENCH_FDTD_2D_BODY1_RAJA; }); - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, nx}, RAJA::RangeSegment{0, ny}), + res, [=] (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY2_RAJA; } ); - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, nx}, RAJA::RangeSegment{1, ny}), + res, [=] (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY3_RAJA; } ); - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{0, nx-1}, RAJA::RangeSegment{0, ny-1}), + res, [=] (Index_type i, Index_type j) { POLYBENCH_FDTD_2D_BODY4_RAJA; } diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp index 278a803d5..4acfb7468 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -114,19 +114,21 @@ void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, RAJA::RangeSegment{1, N-1}, RAJA::RangeSegment{1, N-1}), + res, [=] (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY1_RAJA; } ); - RAJA::kernel( + RAJA::kernel_resource( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, RAJA::RangeSegment{1, N-1}, RAJA::RangeSegment{1, N-1}), + res, [=] (Index_type i, Index_type j, Index_type k) { POLYBENCH_HEAT_3D_BODY2_RAJA; } diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp index 0a12baa10..95d8e9e91 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp @@ -71,20 +71,20 @@ void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid) } else if (vid == RAJA_SYCL) { - const bool async = true; + using EXEC_POL = RAJA::sycl_exec; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type t = 0; t < tsteps; ++t) { - RAJA::forall< 
RAJA::sycl_exec>( - RAJA::RangeSegment{1, N-1}, [=] (Index_type i) { + RAJA::forall ( res, RAJA::RangeSegment{1, N-1}, + [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY1; }); - RAJA::forall< RAJA::sycl_exec>( - RAJA::RangeSegment{1, N-1}, [=] (Index_type i) { + RAJA::forall ( res, RAJA::RangeSegment{1, N-1}, + [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY2; }); From 222ad0294afaa73cfb42786fddd0a9d87aecd77b Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Tue, 30 Apr 2024 14:51:02 -0700 Subject: [PATCH 321/454] Change to explicit resource usage, and note where async policies apply --- src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 2 +- src/apps/ENERGY-Sycl.cpp | 12 ++++++------ src/apps/FIR-Sycl.cpp | 2 +- src/apps/LTIMES-Sycl.cpp | 14 ++++++++++---- src/apps/LTIMES_NOVIEW-Sycl.cpp | 14 ++++++++++---- src/apps/PRESSURE-Sycl.cpp | 4 ++-- src/apps/VOL3D-Sycl.cpp | 2 +- src/basic/DAXPY-Sycl.cpp | 2 +- src/basic/IF_QUAD-Sycl.cpp | 2 +- src/basic/INIT3-Sycl.cpp | 2 +- src/basic/INIT_VIEW1D-Sycl.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp | 2 +- src/basic/MULADDSUB-Sycl.cpp | 2 +- src/basic/NESTED_INIT-Sycl.cpp | 12 +++++++++--- src/basic/REDUCE3_INT-Sycl.cpp | 2 +- src/basic/TRAP_INT-Sycl.cpp | 2 +- src/lcals/DIFF_PREDICT-Sycl.cpp | 2 +- src/lcals/EOS-Sycl.cpp | 2 +- src/lcals/FIRST_DIFF-Sycl.cpp | 2 +- src/lcals/GEN_LIN_RECUR-Sycl.cpp | 4 ++-- src/lcals/HYDRO_1D-Sycl.cpp | 2 +- src/lcals/HYDRO_2D-Sycl.cpp | 4 ++++ src/lcals/INT_PREDICT-Sycl.cpp | 2 +- src/lcals/PLANCKIAN-Sycl.cpp | 2 +- src/lcals/TRIDIAG_ELIM-Sycl.cpp | 2 +- src/stream/ADD-Sycl.cpp | 2 +- src/stream/COPY-Sycl.cpp | 2 +- src/stream/DOT-Sycl.cpp | 2 +- src/stream/MUL-Sycl.cpp | 2 +- src/stream/TRIAD-Sycl.cpp | 2 +- 30 files changed, 66 insertions(+), 44 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp index da3bc9b4b..f1c199337 100644 --- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -66,7 +66,7 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, zones, [=] (Index_type i) { DEL_DOT_VEC_2D_BODY; }); diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index 1fd105f81..dd65f4f7e 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -125,32 +125,32 @@ void ENERGY::runSyclVariantImpl(VariantID vid) RAJA::region( [=]() { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY2; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY3; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY4; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY5; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ENERGY_BODY6; }); diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index 178d00b4b..eb1c5faef 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -85,7 +85,7 @@ void 
FIR::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { FIR_BODY; }); diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index b0f8ad089..84f6b1e2d 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -73,7 +73,11 @@ void LTIMES::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else RAJA::statement::SyclKernel< +#endif RAJA::statement::For<1, RAJA::sycl_global_2, //z RAJA::statement::For<2, RAJA::sycl_global_1, //g RAJA::statement::For<3, RAJA::sycl_global_0, //m @@ -89,10 +93,12 @@ void LTIMES::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), + RAJA::kernel_resource( + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + res, [=] (ID d, IZ z, IG g, IM m) { LTIMES_BODY_RAJA; }); diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index ae2f2b000..9a8744f2b 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -71,7 +71,11 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else RAJA::statement::SyclKernel< +#endif RAJA::statement::For<1, RAJA::sycl_global_2, //z RAJA::statement::For<2, RAJA::sycl_global_1, //g RAJA::statement::For<3, RAJA::sycl_global_0, //m @@ -87,10 +91,12 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), - RAJA::RangeSegment(0, num_z), - RAJA::RangeSegment(0, num_g), - RAJA::RangeSegment(0, num_m)), + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + res, [=] (Index_type d, Index_type z, Index_type g, Index_type m) { LTIMES_NOVIEW_BODY; }); diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp index 6fd7735bb..1d1778d6d 100644 --- a/src/apps/PRESSURE-Sycl.cpp +++ b/src/apps/PRESSURE-Sycl.cpp @@ -78,12 +78,12 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) RAJA::region( [=]() { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PRESSURE_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PRESSURE_BODY2; }); diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index 6c18ec1c8..13a66d15c 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -62,7 +62,7 @@ void VOL3D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { VOL3D_BODY; }); diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index f74634beb..11441107c 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -59,7 +59,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 
0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DAXPY_BODY; }); diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 93ff76c4e..3c447bf9c 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -59,7 +59,7 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { IF_QUAD_BODY; }); diff --git a/src/basic/INIT3-Sycl.cpp b/src/basic/INIT3-Sycl.cpp index 4b71d1846..ea5277730 100644 --- a/src/basic/INIT3-Sycl.cpp +++ b/src/basic/INIT3-Sycl.cpp @@ -60,7 +60,7 @@ void INIT3::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { INIT3_BODY; }); diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index d7f99ce35..f8ce78434 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -61,7 +61,7 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { INIT_VIEW1D_BODY_RAJA; }); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp index 4834edbe8..f586540f6 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Sycl.cpp @@ -60,7 +60,7 @@ void INIT_VIEW1D_OFFSET::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { INIT_VIEW1D_OFFSET_BODY; }); diff --git a/src/basic/MULADDSUB-Sycl.cpp b/src/basic/MULADDSUB-Sycl.cpp index 13d690c2b..9bca65221 100644 --- a/src/basic/MULADDSUB-Sycl.cpp +++ b/src/basic/MULADDSUB-Sycl.cpp @@ -60,7 +60,7 @@ void MULADDSUB::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { MULADDSUB_BODY; }); diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index b01a7d597..5e310fabd 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -70,7 +70,11 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<2, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::For<0, RAJA::sycl_global_2, @@ -84,9 +88,11 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), + RAJA::kernel_resource( + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + res, [=] (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }); diff --git 
a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp index f90a39d81..367b43801 100644 --- a/src/basic/REDUCE3_INT-Sycl.cpp +++ b/src/basic/REDUCE3_INT-Sycl.cpp @@ -109,7 +109,7 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid) RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { REDUCE3_INT_BODY_RAJA; }); diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp index e8f066c1d..68ff0001c 100644 --- a/src/basic/TRAP_INT-Sycl.cpp +++ b/src/basic/TRAP_INT-Sycl.cpp @@ -81,7 +81,7 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid) RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { TRAP_INT_BODY; }); diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index e32dd99f2..e46e93530 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -60,7 +60,7 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DIFF_PREDICT_BODY; }); diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index 7737aa0e3..cb4e200b9 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -59,7 +59,7 @@ void EOS::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { EOS_BODY; }); diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index 20df37f9d..4d8aa3b99 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -59,7 +59,7 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { FIRST_DIFF_BODY; }); diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index dc6e7ab95..abfad2347 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -71,12 +71,12 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(0, N), [=] (Index_type k) { GEN_LIN_RECUR_BODY1; }); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(1, N+1), [=] (Index_type i) { GEN_LIN_RECUR_BODY2; }); diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index 9ea248536..bbef573d5 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -59,7 +59,7 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { HYDRO_1D_BODY; }); diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 471c13aca..7100f0690 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -102,7 
+102,11 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { using EXECPOL = RAJA::KernelPolicy< +#if 0 RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::Lambda<0> diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index 88e7fb60a..7813b63c8 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -59,7 +59,7 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { INT_PREDICT_BODY; }); diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index 09b294f81..164bc9e69 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -62,7 +62,7 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { PLANCKIAN_BODY; }); diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index 213de1bfd..1ed8918f8 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -59,7 +59,7 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { TRIDIAG_ELIM_BODY; }); diff --git a/src/stream/ADD-Sycl.cpp b/src/stream/ADD-Sycl.cpp index 0e56cee30..483672cb1 100644 --- a/src/stream/ADD-Sycl.cpp +++ b/src/stream/ADD-Sycl.cpp @@ -59,7 +59,7 @@ void ADD::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { ADD_BODY; }); diff --git a/src/stream/COPY-Sycl.cpp b/src/stream/COPY-Sycl.cpp index 2f2e6511b..4f1049a6e 100644 --- a/src/stream/COPY-Sycl.cpp +++ b/src/stream/COPY-Sycl.cpp @@ -59,7 +59,7 @@ void COPY::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { COPY_BODY; }); diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp index 0475dcc70..d42f4d428 100644 --- a/src/stream/DOT-Sycl.cpp +++ b/src/stream/DOT-Sycl.cpp @@ -77,7 +77,7 @@ void DOT::runSyclVariantImpl(VariantID vid) RAJA::ReduceSum dot(m_dot_init); - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { DOT_BODY; }); diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index ccac06b84..f97ea5445 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -58,7 +58,7 @@ void MUL::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { MUL_BODY; }); diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index aaa4011ab..71dfeee1d 100644 --- 
a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -59,7 +59,7 @@ void TRIAD::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::sycl_exec >( + RAJA::forall< RAJA::sycl_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { TRIAD_BODY; }); From ae8373f791b3fd53ddb5e6d2fa8c8219a7316283 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 6 Mar 2024 09:06:50 -0800 Subject: [PATCH 322/454] Generalize MULTI_REDUCE data type --- src/basic/MULTI_REDUCE-Cuda.cpp | 6 +++--- src/basic/MULTI_REDUCE-Hip.cpp | 6 +++--- src/basic/MULTI_REDUCE.cpp | 4 ++-- src/basic/MULTI_REDUCE.hpp | 26 ++++++++++++++------------ 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index b18e38c12..df400342a 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -23,9 +23,9 @@ namespace basic template < size_t block_size, size_t replication > __launch_bounds__(block_size) -__global__ void multi_reduce(Real_ptr values, +__global__ void multi_reduce(Data_ptr values, Index_ptr bins, - Real_ptr data, + Data_ptr data, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -47,7 +47,7 @@ void MULTI_REDUCE::runCudaVariantReplicateGlobal(VariantID vid) MULTI_REDUCE_GPU_DATA_SETUP; - RAJAPERF_CUDA_REDUCER_SETUP(Real_ptr, values, hvalues, num_bins, replication); + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication); if ( vid == Base_CUDA ) { diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 41766ad0e..a05c777c5 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -23,9 +23,9 @@ namespace basic template < size_t block_size, size_t replication > __launch_bounds__(block_size) -__global__ void multi_reduce(Real_ptr values, +__global__ void multi_reduce(Data_ptr values, Index_ptr bins, - Real_ptr data, + Data_ptr data, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; @@ -47,7 +47,7 @@ void MULTI_REDUCE::runHipVariantReplicateGlobal(VariantID vid) MULTI_REDUCE_GPU_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Real_ptr, values, hvalues, num_bins, replication); + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication); if ( vid == Base_HIP ) { diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 9bb26201b..a41b20850 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -30,8 +30,8 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type))*m_num_bins + - (1*sizeof(Real_type) + 0*sizeof(Real_type) + + setBytesPerRep( (1*sizeof(Data_type) + 1*sizeof(Data_type))*m_num_bins + + (1*sizeof(Data_type) + 0*sizeof(Data_type) + 1*sizeof(Index_type) + 0*sizeof(Index_type)) * getActualProblemSize() ); setFLOPsPerRep(1 * getActualProblemSize()); diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index ae3190c52..7d9e0af4f 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -21,10 +21,10 @@ #define MULTI_REDUCE_DATA_SETUP \ Index_type num_bins = m_num_bins; \ Index_ptr bins = m_bins; \ - Real_ptr data = m_data; \ - Real_ptr values_init = m_values_init.data(); \ - Real_ptr values_final = m_values_final.data(); \ - Real_ptr values; \ + Data_ptr data = m_data; \ + Data_ptr values_init = 
m_values_init.data(); \ + Data_ptr values_final = m_values_final.data(); \ + Data_ptr values; \ allocData(getReductionDataSpace(vid), values, num_bins); #define MULTI_REDUCE_DATA_TEARDOWN \ @@ -33,9 +33,9 @@ #define MULTI_REDUCE_GPU_DATA_SETUP \ Index_type num_bins = m_num_bins; \ Index_ptr bins = m_bins; \ - Real_ptr data = m_data; \ - Real_ptr values_init = m_values_init.data(); \ - Real_ptr values_final = m_values_final.data(); + Data_ptr data = m_data; \ + Data_ptr values_init = m_values_init.data(); \ + Data_ptr values_final = m_values_final.data(); #define MULTI_REDUCE_BODY \ values[bins[i]] += data[i]; @@ -58,7 +58,7 @@ #define MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication) \ for (Index_type b = 0; b < (num_bins); ++b) { \ - Real_type val_final = 0.0; \ + Data_type val_final = 0; \ for (size_t r = 0; r < (replication); ++r) { \ val_final += (hvalues)[b*(replication) + r]; \ } \ @@ -78,6 +78,8 @@ namespace basic class MULTI_REDUCE : public KernelBase { public: + using Data_type = Real_type; + using Data_ptr = Real_ptr; MULTI_REDUCE(const RunParams& params); @@ -104,14 +106,14 @@ class MULTI_REDUCE : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_atomic_replication = 4096; + static const size_t default_atomic_replication = 2048; // 512, 512 using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; Index_ptr m_bins; - Real_ptr m_data; - std::vector m_values_init; - std::vector m_values_final; + Data_ptr m_data; + std::vector m_values_init; + std::vector m_values_final; }; } // end namespace basic From ddfca036079fd929c9ddfe4f74e99c491c0f8cae Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 6 Mar 2024 09:09:05 -0800 Subject: [PATCH 323/454] Add some bin list creation options in HISTOGRAM --- src/algorithm/HISTOGRAM.cpp | 54 ++++++++++++++++++++++++++++++------- src/algorithm/HISTOGRAM.hpp | 2 +- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 8de9690e3..8252aaec6 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -12,6 +12,8 @@ #include "common/DataUtils.hpp" +#include + namespace rajaperf { namespace algorithm @@ -68,20 +70,52 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) allocData(m_bins, getActualProblemSize(), vid); { auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); - Real_ptr data; - allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); - for (Index_type i = 0; i < getActualProblemSize(); ++i) { - m_bins[i] = static_cast(data[i] * m_num_bins); - if (m_bins[i] >= m_num_bins) { - m_bins[i] = m_num_bins - 1; + bool init_even_sizes = false; + bool init_random_sizes = true; + bool init_random_per_iterate = false; + if (init_even_sizes || init_random_sizes) { + Real_ptr data = nullptr; + if (init_even_sizes) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(b+1) / m_num_bins; + } + } else if (init_random_sizes) { + allocAndInitDataRandValue(data, m_num_bins, Base_Seq); + std::sort(data, data+m_num_bins); } - if (m_bins[i] < 0) { - m_bins[i] = 0; + + Index_type actual_prob_size = getActualProblemSize(); + Index_type bin = 0; + for (Index_type i = 0; i < actual_prob_size; ++i) { + Real_type pos = static_cast(i) / actual_prob_size; + while (pos >= data[bin]) { 
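          // (editor's note) data[] holds the sorted upper boundaries of the
          // bins in (0,1]; walk forward until pos falls inside the current bin.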
+ bin += 1; + } + m_bins[i] = bin; } - } - deallocData(data, Base_Seq); + deallocData(data, Base_Seq); + + } else if (init_random_per_iterate) { + Real_ptr data; + allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq); + + for (Index_type i = 0; i < getActualProblemSize(); ++i) { + m_bins[i] = static_cast(data[i] * m_num_bins); + if (m_bins[i] >= m_num_bins) { + m_bins[i] = m_num_bins - 1; + } + if (m_bins[i] < 0) { + m_bins[i] = 0; + } + } + + deallocData(data, Base_Seq); + } else { + throw 1; + } } m_counts_init.resize(m_num_bins, 0); diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index bed41658c..4dbf19a74 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -106,7 +106,7 @@ class HISTOGRAM : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_atomic_replication = 4096; + static const size_t default_atomic_replication = 2048; // 512, 512 using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; From 756ffe50b022a7112259c185b251dc205564110d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 6 Mar 2024 09:41:52 -0800 Subject: [PATCH 324/454] Fixup MULTI_REDUCE --- src/basic/MULTI_REDUCE-Cuda.cpp | 4 ++-- src/basic/MULTI_REDUCE-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index df400342a..67df3e53e 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -23,9 +23,9 @@ namespace basic template < size_t block_size, size_t replication > __launch_bounds__(block_size) -__global__ void multi_reduce(Data_ptr values, +__global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, Index_ptr bins, - Data_ptr data, + MULTI_REDUCE::Data_ptr data, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index a05c777c5..9260ead8d 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -23,9 +23,9 @@ namespace basic template < size_t block_size, size_t replication > __launch_bounds__(block_size) -__global__ void multi_reduce(Data_ptr values, +__global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, Index_ptr bins, - Data_ptr data, + MULTI_REDUCE::Data_ptr data, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; From d4b6ecf01af6d4a665b2fa91a71bb638097dbb2a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 13 Mar 2024 16:42:15 -0700 Subject: [PATCH 325/454] fix naming default_gpu_atomic_replication --- src/algorithm/HISTOGRAM.hpp | 4 ++-- src/basic/MULTI_REDUCE.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 4dbf19a74..c2f74bd25 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -106,8 +106,8 @@ class HISTOGRAM : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_atomic_replication = 2048; // 512, 512 - using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + static const size_t default_gpu_atomic_replication = 2048; // 512, 512 + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; 
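//
// [Editor's note -- hedged sketch, not part of the patch] The renamed
// default_gpu_atomic_replication constant above sets how many copies of each
// bin counter the GPU variants keep, so concurrent atomic updates spread over
// replicas instead of all landing on one address; the replicas are summed
// back together afterward (see MULTI_REDUCE_GPU_FINALIZE_VALUES earlier in
// this series, which reads slot b*replication + r). A minimal sketch of the
// device-side update under that bin-major layout; global_thread_id, counts,
// and Counts_type are stand-in names, not suite identifiers:
//
//   size_t r = global_thread_id % replication;        // pick a replica
//   RAJA::atomicAdd<RAJA::auto_atomic>(
//       &counts[bins[i] * replication + r], Counts_type(1));
//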
Index_ptr m_bins; diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 7d9e0af4f..49680ddae 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -106,8 +106,8 @@ class MULTI_REDUCE : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_atomic_replication = 2048; // 512, 512 - using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + static const size_t default_gpu_atomic_replication = 2048; // 512, 512 + using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; Index_ptr m_bins; From a8e3ade969362300be491ea0350fb6dfa81edd1d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 1 May 2024 10:55:16 -0700 Subject: [PATCH 326/454] update raja version --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 784457ea1..c315dddd6 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 784457ea194756bf2a5abaf65e0b9feb863994ff +Subproject commit c315dddd601036f93d2f3db6f06563beb165fe77 From 81f90e68b11846108d98615e3d768c3ca8e86870 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 1 May 2024 11:22:47 -0700 Subject: [PATCH 327/454] address PR comments --- src/apps/CONVECTION3DPA-Sycl.cpp | 8 +++----- src/apps/CONVECTION3DPA.hpp | 2 +- src/apps/DIFFUSION3DPA-Sycl.cpp | 6 +++--- src/apps/DIFFUSION3DPA.hpp | 2 +- src/apps/MASS3DEA-Sycl.cpp | 6 +++--- src/apps/MASS3DEA.hpp | 2 +- src/apps/MASS3DPA-Sycl.cpp | 6 +++--- src/apps/MASS3DPA.hpp | 2 +- src/basic/MAT_MAT_SHARED-Sycl.cpp | 19 ++++++++++++------- src/basic/MAT_MAT_SHARED.hpp | 10 +--------- 10 files changed, 29 insertions(+), 34 deletions(-) diff --git a/src/apps/CONVECTION3DPA-Sycl.cpp b/src/apps/CONVECTION3DPA-Sycl.cpp index 87ad04a35..c01087818 100644 --- a/src/apps/CONVECTION3DPA-Sycl.cpp +++ b/src/apps/CONVECTION3DPA-Sycl.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -template < size_t block_size > +template < size_t work_group_size > void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -28,7 +28,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { CONVECTION3DPA_DATA_SETUP; - const ::sycl::range<3> blockSize(CPA_Q1D, CPA_Q1D, CPA_Q1D); + const ::sycl::range<3> workGroupSize(CPA_Q1D, CPA_Q1D, CPA_Q1D); const ::sycl::range<3> gridSize(CPA_Q1D,CPA_Q1D,CPA_Q1D*NE); constexpr size_t shmem = 0; @@ -40,8 +40,6 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - //constexpr size_t shmem = 0; - qu->submit([&](cl::sycl::handler& h) { constexpr int max_D1D = CPA_D1D; @@ -56,7 +54,7 @@ void CONVECTION3DPA::runSyclVariantImpl(VariantID vid) { auto sm5_vec = ::sycl::local_accessor(::sycl::range<1>(max_DQ*max_DQ*max_DQ), h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (cl::sycl::nd_range<3>(gridSize, workGroupSize), [=] (cl::sycl::nd_item<3> itm) { const Index_type e = itm.get_group(2); diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index c0044ecdc..38629b28c 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -388,7 +388,7 @@ class CONVECTION3DPA : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t 
work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/apps/DIFFUSION3DPA-Sycl.cpp b/src/apps/DIFFUSION3DPA-Sycl.cpp index 2a10c9370..fccc14260 100644 --- a/src/apps/DIFFUSION3DPA-Sycl.cpp +++ b/src/apps/DIFFUSION3DPA-Sycl.cpp @@ -22,7 +22,7 @@ namespace rajaperf { namespace apps { -template < size_t block_size > +template < size_t work_group_size > void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -35,7 +35,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { case Base_SYCL: { - const ::sycl::range<3> blockSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); + const ::sycl::range<3> workGroupSize(DPA_Q1D, DPA_Q1D, DPA_Q1D); const ::sycl::range<3> gridSize(DPA_Q1D,DPA_Q1D,DPA_Q1D*NE); startTimer(); @@ -57,7 +57,7 @@ void DIFFUSION3DPA::runSyclVariantImpl(VariantID vid) { auto sm1_2_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ*MDQ*MDQ), h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (cl::sycl::nd_range<3>(gridSize, workGroupSize), [=] (cl::sycl::nd_item<3> itm) { const Index_type e = itm.get_group(2); diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index 2d6c8362f..5b587279c 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -491,7 +491,7 @@ class DIFFUSION3DPA : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/apps/MASS3DEA-Sycl.cpp b/src/apps/MASS3DEA-Sycl.cpp index 8fccffdc3..a2dfd87f3 100644 --- a/src/apps/MASS3DEA-Sycl.cpp +++ b/src/apps/MASS3DEA-Sycl.cpp @@ -19,7 +19,7 @@ namespace rajaperf { namespace apps { -template < size_t block_size > +template < size_t work_group_size > void MASS3DEA::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -32,7 +32,7 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { case Base_SYCL: { - const ::sycl::range<3> blockSize(MEA_Q1D, MEA_Q1D, MEA_Q1D); + const ::sycl::range<3> workGroupSize(MEA_Q1D, MEA_Q1D, MEA_Q1D); const ::sycl::range<3> gridSize(MEA_Q1D,MEA_Q1D,MEA_Q1D*NE); startTimer(); @@ -45,7 +45,7 @@ void MASS3DEA::runSyclVariantImpl(VariantID vid) { ::sycl::local_accessor s_D(::sycl::range<3>(MEA_Q1D,MEA_Q1D,MEA_Q1D),h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (cl::sycl::nd_range<3>(gridSize, workGroupSize), [=] (cl::sycl::nd_item<3> itm) { const Index_type e = itm.get_group(2); diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index e5a3a5165..7c0ea6e02 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -162,7 +162,7 @@ class MASS3DEA : public KernelBase { void runCudaVariantImpl(VariantID vid); template void runHipVariantImpl(VariantID vid); - template + template void runSyclVariantImpl(VariantID vid); private: diff --git a/src/apps/MASS3DPA-Sycl.cpp b/src/apps/MASS3DPA-Sycl.cpp index 1b49ef02c..7d65034a7 100644 --- a/src/apps/MASS3DPA-Sycl.cpp +++ b/src/apps/MASS3DPA-Sycl.cpp @@ -22,7 +22,7 @@ namespace rajaperf { namespace apps { -template < size_t block_size > +template < size_t work_group_size > void MASS3DPA::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -31,7 +31,7 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { MASS3DPA_DATA_SETUP; - const ::sycl::range<3> blockSize(1, MPA_Q1D, MPA_Q1D); + const ::sycl::range<3> workGroupSize(1, MPA_Q1D, MPA_Q1D); const 
::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE); switch (vid) { @@ -52,7 +52,7 @@ void MASS3DPA::runSyclVariantImpl(VariantID vid) { auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (cl::sycl::nd_range<3>(gridSize, workGroupSize), [=] (cl::sycl::nd_item<3> itm) { const Index_type e = itm.get_group(2); diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 2d6f38006..0e11e234b 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -373,7 +373,7 @@ class MASS3DPA : public KernelBase void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp index 927a0bebc..174ac0952 100644 --- a/src/basic/MAT_MAT_SHARED-Sycl.cpp +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -19,11 +19,11 @@ namespace rajaperf { namespace basic { -template < size_t block_size > +template < size_t work_group_size > void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) { - constexpr Index_type tile_size = integer::sqrt(block_size); - static_assert(tile_size*tile_size == block_size, "Invalid block_size"); + constexpr Index_type tile_size = integer::sqrt(work_group_size); + static_assert(tile_size*tile_size == work_group_size, "Invalid block_size"); const Index_type run_reps = getRunReps(); const Index_type N = m_N; @@ -32,7 +32,7 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, tile_size); //Right most is the fastest index - const ::sycl::range<3> blockSize(1, tile_size, tile_size); + const ::sycl::range<3> workGroupSize(1, tile_size, tile_size); const ::sycl::range<3> gridSize(1, Ny*tile_size, Nx*tile_size); constexpr size_t shmem = tile_size * tile_size; @@ -54,7 +54,7 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); h.parallel_for - (cl::sycl::nd_range<3>(gridSize, blockSize), + (cl::sycl::nd_range<3>(gridSize, workGroupSize), [=] (cl::sycl::nd_item<3> itm) { Index_type tx = itm.get_local_id(2); @@ -118,13 +118,18 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) //We only support dynamic shared memory in Sycl //Thus requiring a different setup than other backends //which use static shared memory - MAT_MAT_SHARED_BODY_SYCL_0(tile_size) + double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; + double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; + double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_1(tile_size) + MAT_MAT_SHARED_BODY_1(tile_size) } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index d2708a0c4..b543dd4f7 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -94,14 +94,6 @@ constexpr rajaperf::Index_type TL_SZ = 16; RAJA_TEAM_SHARED double Bs[tile_size][tile_size]; \ RAJA_TEAM_SHARED double Cs[tile_size][tile_size]; -#define MAT_MAT_SHARED_BODY_SYCL_0(tile_size) \ - double * 
As_ptr = ctx.getSharedMemory(tile_size * tile_size); \ - double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); \ - double * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); \ - double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; \ - double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; \ - double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; \ - #define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; @@ -157,7 +149,7 @@ class MAT_MAT_SHARED : public KernelBase { void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); - template < size_t block_size > + template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); private: From 302acdee53894d5497daa21fd1c0becb7f538d1b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 1 May 2024 12:29:41 -0700 Subject: [PATCH 328/454] Empty-Commit From b44c18cc62261a2d2a797bd097bbd76da8678cc0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 1 May 2024 14:46:16 -0700 Subject: [PATCH 329/454] Empty-Commit From dbc22182a9a84a0b043f53193a2cac0ceeb6f282 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Wed, 1 May 2024 16:03:42 -0700 Subject: [PATCH 330/454] Change to 3D workgroup --- src/lcals/HYDRO_2D-Sycl.cpp | 42 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 7100f0690..4b317d6ce 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -43,20 +43,21 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { if ( vid == Base_SYCL ) { - sycl::range<2> global_dim(k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz), + sycl::range<3> global_dim(1, + k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz), j_wg_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_wg_sz)); - sycl::range<2> wkgroup_dim(k_wg_sz, j_wg_sz); + sycl::range<3> wkgroup_dim(1, k_wg_sz, j_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - int j = item.get_global_id(1) + 1; - int k = item.get_global_id(0) + 1; + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; if (j < jn-1 && k < kn-1) { HYDRO_2D_BODY1 @@ -66,11 +67,11 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - int j = item.get_global_id(1) + 1; - int k = item.get_global_id(0) + 1; + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; if (j < jn-1 && k < kn-1) { HYDRO_2D_BODY2 @@ -80,11 +81,11 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - int j = item.get_global_id(1) + 1; - int k = item.get_global_id(0) + 1; + int j = item.get_global_id(2) + 1; + int k = item.get_global_id(1) + 1; if (j < jn-1 && k < kn-1) { HYDRO_2D_BODY3 @@ -107,8 +108,8 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { #else RAJA::statement::SyclKernel< #endif - RAJA::statement::For<0, 
RAJA::sycl_global_0, - RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > @@ -118,23 +119,26 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel<EXEC_POL>( + RAJA::kernel_resource<EXEC_POL>( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), + res, [=] (Index_type k, Index_type j) { HYDRO_2D_BODY1_RAJA; }); - RAJA::kernel<EXEC_POL>( + RAJA::kernel_resource<EXEC_POL>( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), + res, [=] (Index_type k, Index_type j) { HYDRO_2D_BODY2_RAJA; }); - RAJA::kernel<EXEC_POL>( + RAJA::kernel_resource<EXEC_POL>( RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), RAJA::RangeSegment(jbeg, jend)), + res, [=] (Index_type k, Index_type j) { HYDRO_2D_BODY3_RAJA; });
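Note on the hunks above: SYCL's right-most nd_range dimension is the unit-stride one, so the 2-D launches become 3-D shapes with a leading extent of 1, k in dimension 1, and the fastest index j in dimension 2; only the get_global_id() dimension numbers change, not the iteration space. A compact sketch of the resulting launch shape, mirroring the Base_SYCL hunks (the queue q is illustrative):

    // (1, K, J): dimension 2 varies fastest, matching the j-fastest layout.
    sycl::range<3> global_dim(1, k_wg_sz * RAJA_DIVIDE_CEILING_INT(kn-2, k_wg_sz),
                                 j_wg_sz * RAJA_DIVIDE_CEILING_INT(jn-2, j_wg_sz));
    sycl::range<3> wkgroup_dim(1, k_wg_sz, j_wg_sz);
    q->submit([&] (sycl::handler& h) {
      h.parallel_for(sycl::nd_range<3>(global_dim, wkgroup_dim),
                     [=] (sycl::nd_item<3> item) {
        int j = item.get_global_id(2) + 1;   // fastest-varying index
        int k = item.get_global_id(1) + 1;
        // bounds guard and HYDRO_2D body as in the hunks above
      });
    });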
From e51f6975b22a05c4ebd2ad34e954518d3bcc601f Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Wed, 1 May 2024 16:25:33 -0700 Subject: [PATCH 331/454] Add SYCL variant for FIRST_SUM kernel and some formatting cleanup --- src/apps/ENERGY-Sycl.cpp | 1 + src/apps/FIR-Sycl.cpp | 1 + src/apps/LTIMES-Sycl.cpp | 1 + src/apps/LTIMES_NOVIEW-Sycl.cpp | 1 + src/apps/VOL3D-Sycl.cpp | 1 + src/basic/DAXPY-Sycl.cpp | 1 + src/lcals/CMakeLists.txt | 1 + src/lcals/DIFF_PREDICT-Sycl.cpp | 1 + src/lcals/EOS-Sycl.cpp | 1 + src/lcals/FIRST_DIFF-Sycl.cpp | 1 + src/lcals/FIRST_SUM-Sycl.cpp | 82 ++++++++++++++++++++++++++++ src/lcals/FIRST_SUM.cpp | 3 ++ src/lcals/FIRST_SUM.hpp | 4 ++ src/lcals/GEN_LIN_RECUR-Sycl.cpp | 1 + src/lcals/HYDRO_1D-Sycl.cpp | 1 + src/lcals/INT_PREDICT-Sycl.cpp | 1 + src/lcals/PLANCKIAN-Sycl.cpp | 1 + src/lcals/TRIDIAG_ELIM-Sycl.cpp | 1 + 18 files changed, 104 insertions(+) create mode 100644 src/lcals/FIRST_SUM-Sycl.cpp diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index dd65f4f7e..c68f3fc06 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -113,6 +113,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index eb1c5faef..2f941f039 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -71,6 +71,7 @@ void FIR::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 84f6b1e2d..02c2b1181 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -64,6 +64,7 @@ void LTIMES::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index 9a8744f2b..9c961d266 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -64,6 +64,7 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index 13a66d15c..1681ad486 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -54,6 +54,7 @@ void VOL3D::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/basic/DAXPY-Sycl.cpp b/src/basic/DAXPY-Sycl.cpp index 11441107c..15642a08b 100644 --- a/src/basic/DAXPY-Sycl.cpp +++ b/src/basic/DAXPY-Sycl.cpp @@ -51,6 +51,7 @@ void DAXPY::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index f8f7af2cb..1c0695a53 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -41,6 +41,7 @@ blt_add_library( FIRST_SUM-Cuda.cpp FIRST_SUM-OMP.cpp FIRST_SUM-OMPTarget.cpp + FIRST_SUM-Sycl.cpp GEN_LIN_RECUR.cpp GEN_LIN_RECUR-Seq.cpp GEN_LIN_RECUR-Hip.cpp diff --git a/src/lcals/DIFF_PREDICT-Sycl.cpp b/src/lcals/DIFF_PREDICT-Sycl.cpp index e46e93530..5ac815671 100644 --- a/src/lcals/DIFF_PREDICT-Sycl.cpp +++ b/src/lcals/DIFF_PREDICT-Sycl.cpp @@ -52,6 +52,7 @@ void DIFF_PREDICT::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/EOS-Sycl.cpp b/src/lcals/EOS-Sycl.cpp index cb4e200b9..898d25bc8 100644 --- a/src/lcals/EOS-Sycl.cpp +++ b/src/lcals/EOS-Sycl.cpp @@ -51,6 +51,7 @@ void EOS::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/FIRST_DIFF-Sycl.cpp b/src/lcals/FIRST_DIFF-Sycl.cpp index 4d8aa3b99..41bacafe3 100644 --- a/src/lcals/FIRST_DIFF-Sycl.cpp +++ b/src/lcals/FIRST_DIFF-Sycl.cpp @@ -51,6 +51,7 @@ void FIRST_DIFF::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/FIRST_SUM-Sycl.cpp b/src/lcals/FIRST_SUM-Sycl.cpp new file mode 100644 index 000000000..3d63fdcbc --- /dev/null +++ b/src/lcals/FIRST_SUM-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace lcals +{ + + +template < size_t work_group_size > +void FIRST_SUM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + FIRST_SUM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i > 0 && i < iend) { + FIRST_SUM_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + FIRST_SUM_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_SUM : Unknown Sycl variant id = " << vid << std::endl; + } } + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_SUM, Sycl) + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL
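The global size in FIRST_SUM-Sycl.cpp is rounded up to a whole number of work-groups, so the kernel guards against the padding work-items; with iend = 1000 and work_group_size = 256 the launch covers 256 * 4 = 1024 work-items and the last 24 fall through the guard. A minimal sketch of the idiom outside the suite's macros (q, x, y are illustrative, and the body is only a stand-in for FIRST_SUM_BODY):

    // Round iend up to a whole number of work-groups; the guard masks the
    // padding work-items (e.g. iend = 1000, wg = 256 -> global = 1024).
    const size_t wg = 256;
    const size_t global = wg * ((iend + wg - 1) / wg);
    q.parallel_for(sycl::nd_range<1>(global, wg), [=] (sycl::nd_item<1> it) {
      const size_t i = it.get_global_id(0);
      if (i > 0 && i < iend) {
        x[i] = y[i-1] + y[i];   // stand-in for FIRST_SUM_BODY
      }
    });

diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index 3c2c0e03b..a4765bfc6 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -53,6 +53,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 52a3841a1..1fc9c48cd 100644 --- a/src/lcals/FIRST_SUM.hpp +++ 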
b/src/lcals/FIRST_SUM.hpp @@ -55,16 +55,20 @@ class FIRST_SUM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/lcals/GEN_LIN_RECUR-Sycl.cpp b/src/lcals/GEN_LIN_RECUR-Sycl.cpp index abfad2347..06ca45e7b 100644 --- a/src/lcals/GEN_LIN_RECUR-Sycl.cpp +++ b/src/lcals/GEN_LIN_RECUR-Sycl.cpp @@ -63,6 +63,7 @@ void GEN_LIN_RECUR::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/HYDRO_1D-Sycl.cpp b/src/lcals/HYDRO_1D-Sycl.cpp index bbef573d5..3ccbad9a7 100644 --- a/src/lcals/HYDRO_1D-Sycl.cpp +++ b/src/lcals/HYDRO_1D-Sycl.cpp @@ -51,6 +51,7 @@ void HYDRO_1D::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/INT_PREDICT-Sycl.cpp b/src/lcals/INT_PREDICT-Sycl.cpp index 7813b63c8..992dbcba1 100644 --- a/src/lcals/INT_PREDICT-Sycl.cpp +++ b/src/lcals/INT_PREDICT-Sycl.cpp @@ -51,6 +51,7 @@ void INT_PREDICT::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/PLANCKIAN-Sycl.cpp b/src/lcals/PLANCKIAN-Sycl.cpp index 164bc9e69..31b43c2f7 100644 --- a/src/lcals/PLANCKIAN-Sycl.cpp +++ b/src/lcals/PLANCKIAN-Sycl.cpp @@ -54,6 +54,7 @@ void PLANCKIAN::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/lcals/TRIDIAG_ELIM-Sycl.cpp b/src/lcals/TRIDIAG_ELIM-Sycl.cpp index 1ed8918f8..74e23665f 100644 --- a/src/lcals/TRIDIAG_ELIM-Sycl.cpp +++ b/src/lcals/TRIDIAG_ELIM-Sycl.cpp @@ -51,6 +51,7 @@ void TRIDIAG_ELIM::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); From f3b464ec54efd026c6e2a128ece2599f42a8fdd1 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 2 May 2024 09:00:58 -0700 Subject: [PATCH 332/454] Minor formatting mods --- src/basic/IF_QUAD-Sycl.cpp | 1 + src/basic/INIT_VIEW1D-Sycl.cpp | 1 + src/stream/MUL-Sycl.cpp | 1 + src/stream/TRIAD-Sycl.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/src/basic/IF_QUAD-Sycl.cpp b/src/basic/IF_QUAD-Sycl.cpp index 3c447bf9c..17e569c6f 100644 --- a/src/basic/IF_QUAD-Sycl.cpp +++ b/src/basic/IF_QUAD-Sycl.cpp @@ -51,6 +51,7 @@ void IF_QUAD::runSyclVariantImpl(VariantID vid) } }); }); + } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-Sycl.cpp b/src/basic/INIT_VIEW1D-Sycl.cpp index f8ce78434..ff06d2203 100644 --- a/src/basic/INIT_VIEW1D-Sycl.cpp +++ b/src/basic/INIT_VIEW1D-Sycl.cpp @@ -51,6 +51,7 @@ void INIT_VIEW1D::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); diff --git a/src/stream/MUL-Sycl.cpp b/src/stream/MUL-Sycl.cpp index f97ea5445..01be5d872 100644 --- a/src/stream/MUL-Sycl.cpp +++ b/src/stream/MUL-Sycl.cpp @@ -50,6 +50,7 @@ void MUL::runSyclVariantImpl(VariantID vid) } }); }); + } stopTimer(); diff --git a/src/stream/TRIAD-Sycl.cpp b/src/stream/TRIAD-Sycl.cpp index 71dfeee1d..c8ecafdf7 100644 --- a/src/stream/TRIAD-Sycl.cpp +++ b/src/stream/TRIAD-Sycl.cpp @@ -51,6 +51,7 @@ void TRIAD::runSyclVariantImpl(VariantID vid) }); }); + } stopTimer(); From 6d54bef7502e05e7bdf974c717a84fa12a4a2f84 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 2 May 2024 12:43:49 -0700 Subject: [PATCH 333/454] Change variable names to more closely align with SYCL concepts --- src/apps/DEL_DOT_VEC_2D-Sycl.cpp | 4 ++-- src/apps/ENERGY-Sycl.cpp | 14 +++++++------- src/apps/FIR-Sycl.cpp | 4 ++-- src/apps/PRESSURE-Sycl.cpp | 6 +++--- src/apps/VOL3D-Sycl.cpp | 4 ++-- src/polybench/POLYBENCH_ADI-Sycl.cpp | 6 +++--- src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp | 6 +++--- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp index f1c199337..13f59e29d 100644 --- a/src/apps/DEL_DOT_VEC_2D-Sycl.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Sycl.cpp @@ -40,10 +40,10 @@ void DEL_DOT_VEC_2D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type ii = item.get_global_id(0); diff --git a/src/apps/ENERGY-Sycl.cpp b/src/apps/ENERGY-Sycl.cpp index c68f3fc06..7ebc7f3c7 100644 --- a/src/apps/ENERGY-Sycl.cpp +++ b/src/apps/ENERGY-Sycl.cpp @@ -41,10 +41,10 @@ void ENERGY::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -56,7 +56,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + 
h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -68,7 +68,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -79,7 +79,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -91,7 +91,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -103,7 +103,7 @@ void ENERGY::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp index 2f941f039..74b23f8d0 100644 --- a/src/apps/FIR-Sycl.cpp +++ b/src/apps/FIR-Sycl.cpp @@ -58,10 +58,10 @@ void FIR::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); diff --git a/src/apps/PRESSURE-Sycl.cpp b/src/apps/PRESSURE-Sycl.cpp index 1d1778d6d..2b0e3b4dd 100644 --- a/src/apps/PRESSURE-Sycl.cpp +++ b/src/apps/PRESSURE-Sycl.cpp @@ -40,10 +40,10 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -55,7 +55,7 @@ void PRESSURE::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); diff --git a/src/apps/VOL3D-Sycl.cpp b/src/apps/VOL3D-Sycl.cpp index 1681ad486..37c7bc90f 100644 --- a/src/apps/VOL3D-Sycl.cpp +++ b/src/apps/VOL3D-Sycl.cpp @@ -40,10 +40,10 @@ void VOL3D::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + const size_t global_size = work_group_size * 
RAJA_DIVIDE_CEILING_INT(iend, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type ii = item.get_global_id(0); diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp index d5711f106..0c42167b3 100644 --- a/src/polybench/POLYBENCH_ADI-Sycl.cpp +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -38,10 +38,10 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) for (Index_type t = 1; t <= tsteps; ++t) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(n-2, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(n-2, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0) + 1; @@ -61,7 +61,7 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0) + 1; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp index 95d8e9e91..8a13f6567 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Sycl.cpp @@ -38,10 +38,10 @@ void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid) for (Index_type t = 0; t < tsteps; ++t) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -53,7 +53,7 @@ void POLYBENCH_JACOBI_1D::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); From ea67f66aa44a5970fa84e18fe8d09cd8d8f28320 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Fri, 3 May 2024 09:59:52 -0700 Subject: [PATCH 334/454] Add GEMVER SYCL variants, swicth to 3d workgroup --- src/polybench/CMakeLists.txt | 2 + src/polybench/POLYBENCH_GEMM-Sycl.cpp | 139 +++++++++++++++ src/polybench/POLYBENCH_GEMM.cpp | 3 + src/polybench/POLYBENCH_GEMM.hpp | 5 + src/polybench/POLYBENCH_GEMVER-Sycl.cpp | 218 ++++++++++++++++++++++++ src/polybench/POLYBENCH_GEMVER.cpp | 3 + src/polybench/POLYBENCH_GEMVER.hpp | 5 + 7 files changed, 375 insertions(+) create mode 100644 src/polybench/POLYBENCH_GEMM-Sycl.cpp create mode 100644 src/polybench/POLYBENCH_GEMVER-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index f610dd9d1..6ae9174d2 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -56,12 +56,14 @@ blt_add_library( POLYBENCH_GEMM-Cuda.cpp POLYBENCH_GEMM-OMP.cpp POLYBENCH_GEMM-OMPTarget.cpp + POLYBENCH_GEMM-Sycl.cpp POLYBENCH_GEMVER.cpp POLYBENCH_GEMVER-Seq.cpp POLYBENCH_GEMVER-Hip.cpp POLYBENCH_GEMVER-Cuda.cpp POLYBENCH_GEMVER-OMP.cpp POLYBENCH_GEMVER-OMPTarget.cpp + POLYBENCH_GEMVER-Sycl.cpp POLYBENCH_GESUMMV.cpp POLYBENCH_GESUMMV-Seq.cpp POLYBENCH_GESUMMV-Hip.cpp diff --git a/src/polybench/POLYBENCH_GEMM-Sycl.cpp b/src/polybench/POLYBENCH_GEMM-Sycl.cpp new file mode 100644 index 000000000..1ad050402 --- /dev/null +++ b/src/polybench/POLYBENCH_GEMM-Sycl.cpp @@ -0,0 +1,139 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < ni && j < nj) { + POLYBENCH_GEMM_BODY1; + POLYBENCH_GEMM_BODY2; + for (Index_type k = 0; k < nk; ++k) { + POLYBENCH_GEMM_BODY3; + } + POLYBENCH_GEMM_BODY4; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMM_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>>, + RAJA::statement::For<2, RAJA::seq_exec, + RAJA::statement::Lambda<2, RAJA::Segs<0,1,2>, RAJA::Params<0>> + >, + 
RAJA::statement::Lambda<3, RAJA::Segs<0,1>, RAJA::Params<0>> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource<EXEC_POL>( + + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nj}, + RAJA::RangeSegment{0, nk} ), + RAJA::tuple<Real_type>{0.0}, // variable for dot + res, + + [=] (Real_type& dot) { + POLYBENCH_GEMM_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j) { + POLYBENCH_GEMM_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j, Index_type k, + Real_type& dot) { + POLYBENCH_GEMM_BODY3_RAJA; + }, + [=] (Index_type i, Index_type j, + Real_type& dot) { + POLYBENCH_GEMM_BODY4_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GEMM : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMM, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + 
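In POLYBENCH_GEMM-Sycl.cpp above, the flat work_group_size tuning value is factored into a fixed fast extent of 32 and a derived slow extent. A small worked example of that shape computation, assuming a work_group_size of 256 (values illustrative):

    // Factor a flat work-group size into (slow, fast) extents; the fast
    // extent of 32 keeps dimension 2 aligned with a typical sub-group width.
    constexpr size_t work_group_size = 256;          // illustrative tuning value
    constexpr size_t j_wg = 32;                      // fastest, dimension 2
    constexpr size_t i_wg = work_group_size / j_wg;  // 256 / 32 = 8, dimension 1
    static_assert(i_wg * j_wg == work_group_size, "size must be a multiple of 32");
    sycl::range<3> wkgroup_dim(1, i_wg, j_wg);       // total work-items: 1*8*32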
diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 63897bd37..1462eaf66 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -70,6 +70,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GEMM::~POLYBENCH_GEMM() diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index 1d788154c..14c590596 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -99,13 +99,18 @@ class POLYBENCH_GEMM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_GEMVER-Sycl.cpp b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp new file mode 100644 index 000000000..7c19c882d --- /dev/null +++ b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp @@ -0,0 +1,218 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMVER.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace polybench
{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_GEMVER::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GEMVER_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + sycl::range<3> global_dim1(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(n, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(n, j_wg_sz)); + sycl::range<3> wkgroup_dim1(1, i_wg_sz, j_wg_sz); + + const size_t global_size234 = work_group_size * RAJA_DIVIDE_CEILING_INT(n, work_group_size); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim1), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); + + if (i < n && j < n) { + POLYBENCH_GEMVER_BODY1; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY2; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY3; + } + POLYBENCH_GEMVER_BODY4; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY5; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size234, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < n) { + POLYBENCH_GEMVER_BODY6; + for (Index_type j = 0; j < n; ++j) { + POLYBENCH_GEMVER_BODY7; + } + POLYBENCH_GEMVER_BODY8; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GEMVER_VIEWS_RAJA; + + using EXEC_POL1 = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + using EXEC_POL24 = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + using EXEC_POL3 = RAJA::sycl_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_resource<EXEC_POL1>( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_GEMVER_BODY1_RAJA; + } + ); + + RAJA::kernel_param_resource<EXEC_POL24>( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple<Real_type>{0.0}, + res, + + [=] (Index_type /* i */, Real_type 
&dot) { + POLYBENCH_GEMVER_BODY2_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_GEMVER_BODY3_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY4_RAJA; + } + ); + + RAJA::forall<EXEC_POL3> ( res, RAJA::RangeSegment{0, n}, + [=] (Index_type i) { + POLYBENCH_GEMVER_BODY5_RAJA; + } + ); + + RAJA::kernel_param_resource<EXEC_POL24>( + RAJA::make_tuple(RAJA::RangeSegment{0, n}, + RAJA::RangeSegment{0, n}), + RAJA::tuple<Real_type>{0.0}, + res, + + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY6_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_GEMVER_BODY7_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_GEMVER_BODY8_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GEMVER : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GEMVER, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + 
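In the RAJA_SYCL path above, the per-row dot accumulator travels through the kernel as a RAJA::kernel_param tuple parameter rather than a captured variable: one lambda initializes it, one accumulates inside the sequential j loop, and one writes the result back, with Segs/Params selecting what each lambda receives. A stripped-down, host-side sketch of the same pattern; the seq_exec policy and the A/x/y data are illustrative, not the suite's code:

    #include <RAJA/RAJA.hpp>
    #include <vector>

    const RAJA::Index_type n = 100;                  // illustrative sizes/data
    std::vector<double> A(n*n, 1.0), x(n, 1.0), y(n, 0.0);
    double* Ap = A.data(); double* xp = x.data(); double* yp = y.data();

    using POL = RAJA::KernelPolicy<
      RAJA::statement::For<0, RAJA::seq_exec,
        RAJA::statement::Lambda<0, RAJA::Params<0>>,              // dot = 0
        RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
        >,
        RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> // y[i] = dot
      >
    >;

    RAJA::kernel_param<POL>(
      RAJA::make_tuple(RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n)),
      RAJA::tuple<double>{0.0},
      [=] (double& dot) { dot = 0.0; },
      [=] (RAJA::Index_type i, RAJA::Index_type j, double& dot) {
        dot += Ap[i*n + j] * xp[j];                  // accumulate along the row
      },
      [=] (RAJA::Index_type i, double& dot) { yp[i] = dot; });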
Hornung" Date: Fri, 3 May 2024 10:00:27 -0700 Subject: [PATCH 335/454] Change to 3d work-gorups --- src/polybench/POLYBENCH_2MM-Sycl.cpp | 28 ++++++------- src/polybench/POLYBENCH_3MM-Sycl.cpp | 39 ++++++++++--------- src/polybench/POLYBENCH_ATAX-Sycl.cpp | 6 +-- src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp | 37 +++++++++--------- .../POLYBENCH_FLOYD_WARSHALL-Sycl.cpp | 17 ++++---- 5 files changed, 67 insertions(+), 60 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index 81aea76e4..f78ad3386 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -41,23 +41,25 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<2> global_dim1(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + sycl::range<3> global_dim1(1, + out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz)); - sycl::range<2> global_dim2(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + sycl::range<3> global_dim2(1, + out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz)); - sycl::range<2> wkgroup_dim(out_wg_sz, in_wg_sz); + sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim1, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if (i < ni && j < nj) { POLYBENCH_2MM_BODY1; @@ -71,11 +73,11 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim2, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type l = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type l = item.get_global_id(2); if (i < ni && l < nl) { POLYBENCH_2MM_BODY4; @@ -102,8 +104,8 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) #else RAJA::statement::SyclKernel< #endif - RAJA::statement::For<0, RAJA::sycl_global_0, - RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp index 3f72d13cf..4df0cc66c 100644 --- a/src/polybench/POLYBENCH_3MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp @@ -43,23 +43,26 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - sycl::range<2> global_dim1(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + sycl::range<3> global_dim1(1, + out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), in_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, in_wg_sz)); - sycl::range<2> global_dim2(out_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, out_wg_sz), + sycl::range<3> global_dim2(1, + out_wg_sz * RAJA_DIVIDE_CEILING_INT(nj, out_wg_sz), in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz)); - sycl::range<2> 
global_dim3(out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), + sycl::range<3> global_dim3(1, + out_wg_sz * RAJA_DIVIDE_CEILING_INT(ni, out_wg_sz), in_wg_sz * RAJA_DIVIDE_CEILING_INT(nl, in_wg_sz)); - sycl::range<2> wkgroup_dim(out_wg_sz, in_wg_sz); + sycl::range<3> wkgroup_dim(1, out_wg_sz, in_wg_sz); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim1, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim1, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if (i < ni && j < nj) { POLYBENCH_3MM_BODY1; @@ -73,11 +76,11 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim2, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type j = item.get_global_id(0); - Index_type l = item.get_global_id(1); + Index_type j = item.get_global_id(1); + Index_type l = item.get_global_id(2); if (j < nj && l < nl) { POLYBENCH_3MM_BODY4; @@ -91,11 +94,11 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim2, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type l = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type l = item.get_global_id(2); if (i < ni && l < nl) { POLYBENCH_3MM_BODY7; @@ -122,8 +125,8 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) #else RAJA::statement::SyclKernel< #endif - RAJA::statement::For<0, RAJA::sycl_global_0, - RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<2, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp index 23759f95c..b2d68e3a0 100644 --- a/src/polybench/POLYBENCH_ATAX-Sycl.cpp +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -36,10 +36,10 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type i = item.get_global_id(0); @@ -56,7 +56,7 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), [=] (sycl::nd_item<1> item) { Index_type j = item.get_global_id(0); diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp index 6a9b41ffb..a3b4e8690 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -45,10 +45,10 @@ void 
POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) for (t = 0; t < tsteps; ++t) { - const size_t grid_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(ny, work_group_size); + const size_t global_size1 = work_group_size * RAJA_DIVIDE_CEILING_INT(ny, work_group_size); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1> (grid_size1, work_group_size), + h.parallel_for(sycl::nd_range<1> (global_size1, work_group_size), [=] (sycl::nd_item<1> item) { Index_type j = item.get_global_id(0); @@ -59,17 +59,18 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) }); }); - sycl::range<2> global_dim234(i_wg_sz * RAJA_DIVIDE_CEILING_INT(nx, i_wg_sz), + sycl::range<3> global_dim234(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(nx, i_wg_sz), j_wg_sz * RAJA_DIVIDE_CEILING_INT(ny, j_wg_sz)); - sycl::range<2> wkgroup_dim234(i_wg_sz, j_wg_sz); + sycl::range<3> wkgroup_dim234(1, i_wg_sz, j_wg_sz); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if (i > 0 && i < nx && j < ny) { POLYBENCH_FDTD_2D_BODY2; @@ -79,11 +80,11 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if (i < nx && j > 0 && j < ny) { POLYBENCH_FDTD_2D_BODY3; @@ -93,11 +94,11 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim234, wkgroup_dim234), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim234, wkgroup_dim234), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if (i < nx-1 && j < ny-1) { POLYBENCH_FDTD_2D_BODY4; @@ -124,8 +125,8 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) #else RAJA::statement::SyclKernel< #endif - RAJA::statement::For<0, RAJA::sycl_global_0, - RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp index 401d1baed..a0256d207 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp @@ -38,10 +38,11 @@ void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<2> global_dim(i_wg_sz * RAJA_DIVIDE_CEILING_INT(N, i_wg_sz), + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(N, i_wg_sz), j_wg_sz * RAJA_DIVIDE_CEILING_INT(N, j_wg_sz)); - sycl::range<2> wkgroup_dim(i_wg_sz, j_wg_sz); + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -49,11 +50,11 @@ void 
POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) for (Index_type k = 0; k < N; ++k) { qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<2>( global_dim, wkgroup_dim), - [=] (sycl::nd_item<2> item) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { - Index_type i = item.get_global_id(0); - Index_type j = item.get_global_id(1); + Index_type i = item.get_global_id(1); + Index_type j = item.get_global_id(2); if ( i < N && j < N ) { POLYBENCH_FLOYD_WARSHALL_BODY; @@ -79,8 +80,8 @@ void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) #else RAJA::statement::SyclKernel< #endif - RAJA::statement::For<1, RAJA::sycl_global_0, - RAJA::statement::For<2, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_1, + RAJA::statement::For<2, RAJA::sycl_global_2, RAJA::statement::Lambda<0> > > From 625429b82947015fd8a1a3c195f3a6edb83d1da6 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 3 May 2024 10:57:44 -0700 Subject: [PATCH 336/454] Add SYCL kernel variants. --- src/polybench/CMakeLists.txt | 2 + src/polybench/POLYBENCH_GESUMMV-Sycl.cpp | 121 +++++++++++++++++ src/polybench/POLYBENCH_GESUMMV.cpp | 3 + src/polybench/POLYBENCH_GESUMMV.hpp | 5 + src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp | 145 +++++++++++++++++++++ src/polybench/POLYBENCH_JACOBI_2D.cpp | 3 + src/polybench/POLYBENCH_JACOBI_2D.hpp | 5 + 7 files changed, 284 insertions(+) create mode 100644 src/polybench/POLYBENCH_GESUMMV-Sycl.cpp create mode 100644 src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 6ae9174d2..c41a0468d 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -70,6 +70,7 @@ blt_add_library( POLYBENCH_GESUMMV-Cuda.cpp POLYBENCH_GESUMMV-OMP.cpp POLYBENCH_GESUMMV-OMPTarget.cpp + POLYBENCH_GESUMMV-Sycl.cpp POLYBENCH_HEAT_3D.cpp POLYBENCH_HEAT_3D-Seq.cpp POLYBENCH_HEAT_3D-Hip.cpp @@ -90,6 +91,7 @@ blt_add_library( POLYBENCH_JACOBI_2D-Cuda.cpp POLYBENCH_JACOBI_2D-OMP.cpp POLYBENCH_JACOBI_2D-OMPTarget.cpp + POLYBENCH_JACOBI_2D-Sycl.cpp POLYBENCH_MVT.cpp POLYBENCH_MVT-Seq.cpp POLYBENCH_MVT-Hip.cpp diff --git a/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp new file mode 100644 index 000000000..ed43ff9eb --- /dev/null +++ b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GESUMMV.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace polybench +{ + + +template < size_t work_group_size > +void POLYBENCH_GESUMMV::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_GESUMMV_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_GESUMMV_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_GESUMMV_BODY2; + } + POLYBENCH_GESUMMV_BODY3; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_GESUMMV_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Params<0,1>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0,1>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0,1>> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param_resource<EXEC_POL>( + RAJA::make_tuple( RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N} ), + RAJA::make_tuple(static_cast<Real_type>(0.0), + static_cast<Real_type>(0.0)), + res, + + [=] (Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY2_RAJA; + }, + [=] (Index_type i, Real_type& tmpdot, + Real_type& ydot) { + POLYBENCH_GESUMMV_BODY3_RAJA; + } + ); + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_GESUMMV : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_GESUMMV, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + 
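In the Base_SYCL kernel above, each work-item owns one row i and walks the j loop serially, so the two running sums live in registers and no cross-work-item reduction is needed. The GESUMMV body macros are not shown in this patch; the sketch below uses the textbook GESUMMV form y = alpha*A*x + beta*B*x, which is assumed rather than taken from the suite (q, A, B, x, y, alpha, beta are illustrative):

    // One work-item per row: the j loop stays inside the work-item.
    q.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
                   [=] (sycl::nd_item<1> item) {
      const size_t i = item.get_global_id(0);
      if (i < N) {
        double tmp = 0.0, yv = 0.0;
        for (size_t j = 0; j < N; ++j) {
          tmp += A[i*N + j] * x[j];      // alpha-term accumulator
          yv  += B[i*N + j] * x[j];      // beta-term accumulator
        }
        y[i] = alpha * tmp + beta * yv;  // assumed textbook GESUMMV form
      }
    });

diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index d227756da..99efcc0b2 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -59,6 +59,9 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 75d4aa8c9..3d80155ed 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -98,13 +98,18 @@ class POLYBENCH_GESUMMV : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); 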
+ template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp new file mode 100644 index 000000000..d7db021e9 --- /dev/null +++ b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp @@ -0,0 +1,145 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_JACOBI_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace polybench +{ + + // + // Define work-group shape for SYCL execution + // +#define j_wg_sz (32) +#define i_wg_sz (work_group_size / j_wg_sz) + + +template < size_t work_group_size > +void POLYBENCH_JACOBI_2D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_JACOBI_2D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + sycl::range<3> global_dim(1, + i_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, i_wg_sz), + j_wg_sz * RAJA_DIVIDE_CEILING_INT(N-2, j_wg_sz)); + + sycl::range<3> wkgroup_dim(1, i_wg_sz, j_wg_sz); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1) + 1; + Index_type j = item.get_global_id(2) + 1; + + if ( i < N-1 && j < N-1 ) { + POLYBENCH_JACOBI_2D_BODY1; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<3>( global_dim, wkgroup_dim), + [=] (sycl::nd_item<3> item) { + + Index_type i = item.get_global_id(1) + 1; + Index_type j = item.get_global_id(2) + 1; + + if ( i < N-1 && j < N-1 ) { + POLYBENCH_JACOBI_2D_BODY2; + } + + }); + }); + + } + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_JACOBI_2D_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_1, + RAJA::statement::For<1, RAJA::sycl_global_2, + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::kernel_resource<EXEC_POL>( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY1_RAJA; + } + ); + + RAJA::kernel_resource<EXEC_POL>( + RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + res, + [=] (Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY2_RAJA; + } + ); + + } + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_JACOBI_2D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_JACOBI_2D, Sycl) + +} // end namespace polybench +} // end 
namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 4216c115a..f1db49000 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -69,6 +69,9 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 7e8819bf1..df170306e 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -87,13 +87,18 @@ class POLYBENCH_JACOBI_2D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From efcb03f0e5fe5e93bc2f9ef22eea1a5b6c575bfd Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 3 May 2024 11:23:10 -0700 Subject: [PATCH 337/454] Add SYCL kernel variants --- src/polybench/CMakeLists.txt | 1 + src/polybench/POLYBENCH_MVT-Sycl.cpp | 157 +++++++++++++++++++++++++++ src/polybench/POLYBENCH_MVT.cpp | 3 + src/polybench/POLYBENCH_MVT.hpp | 5 + 4 files changed, 166 insertions(+) create mode 100644 src/polybench/POLYBENCH_MVT-Sycl.cpp diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index c41a0468d..2722a1fac 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -98,5 +98,6 @@ blt_add_library( POLYBENCH_MVT-Cuda.cpp POLYBENCH_MVT-OMP.cpp POLYBENCH_MVT-OMPTarget.cpp + POLYBENCH_MVT-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/polybench/POLYBENCH_MVT-Sycl.cpp b/src/polybench/POLYBENCH_MVT-Sycl.cpp new file mode 100644 index 000000000..306cece38 --- /dev/null +++ b/src/polybench/POLYBENCH_MVT-Sycl.cpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_MVT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace polybench +{ + + +template < size_t work_group_size > +void POLYBENCH_MVT::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + POLYBENCH_MVT_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(N, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_MVT_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_MVT_BODY2; + } + POLYBENCH_MVT_BODY3; + } + + }); + }); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (global_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type i = item.get_global_id(0); + + if (i < N) { + POLYBENCH_MVT_BODY4; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_MVT_BODY5; + } + POLYBENCH_MVT_BODY6; + } + + }); + }); + + } + stopTimer(); + + } else if (vid == RAJA_SYCL) { + + POLYBENCH_MVT_VIEWS_RAJA; + + using EXEC_POL = + RAJA::KernelPolicy< +#if 0 + RAJA::statement::SyclKernelAsync< +#else + RAJA::statement::SyclKernel< +#endif + RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region<RAJA::seq_region>( [=]() { + + RAJA::kernel_param_resource<EXEC_POL>( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple<Real_type>{0.0}, + res, + + [=] (Real_type &dot) { + POLYBENCH_MVT_BODY1_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_MVT_BODY2_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_MVT_BODY3_RAJA; + } + + ); + + RAJA::kernel_param_resource<EXEC_POL>( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple<Real_type>{0.0}, + res, + + [=] (Real_type &dot) { + POLYBENCH_MVT_BODY4_RAJA; + }, + [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_MVT_BODY5_RAJA; + }, + [=] (Index_type i, Real_type &dot) { + POLYBENCH_MVT_BODY6_RAJA; + } + + ); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + } else { + getCout() << "\n POLYBENCH_MVT : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(POLYBENCH_MVT, Sycl) + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL + diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 41caad0d4..2ae85367a 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -62,6 +62,9 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } POLYBENCH_MVT::~POLYBENCH_MVT() diff --git a/src/polybench/POLYBENCH_MVT.hpp 
b/src/polybench/POLYBENCH_MVT.hpp index 809c3e624..a54181833 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -112,13 +112,18 @@ class POLYBENCH_MVT : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 419bb402e8e8f8a7ec184149bbc5fce55966e4d6 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 3 May 2024 14:38:11 -0700 Subject: [PATCH 338/454] Add SYCL variants --- src/apps/CMakeLists.txt | 1 + src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp | 89 +++++++++++++++++++++++++ src/apps/ZONAL_ACCUMULATION_3D.cpp | 3 + src/apps/ZONAL_ACCUMULATION_3D.hpp | 6 ++ 4 files changed, 99 insertions(+) create mode 100644 src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 0ed6e9b81..c66f81992 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -104,5 +104,6 @@ blt_add_library( ZONAL_ACCUMULATION_3D-Cuda.cpp ZONAL_ACCUMULATION_3D-OMP.cpp ZONAL_ACCUMULATION_3D-OMPTarget.cpp + ZONAL_ACCUMULATION_3D-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp new file mode 100644 index 000000000..67b25086e --- /dev/null +++ b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
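All of the Base_SYCL bodies added in this series share one launch idiom: round the iteration count up to a multiple of the work-group size, launch the padded nd_range, and guard the tail work-items with a bounds check. A stripped-down, self-contained version of the idiom, assuming a SYCL 2020 compiler; the axpy-style names are illustrative, not the suite's, and the pointers are assumed USM device-accessible:

    #include <sycl/sycl.hpp>

    void axpy(sycl::queue& q, double* y, const double* x, double a,
              size_t len, size_t wg_size)
    {
      // the same rounding RAJA_DIVIDE_CEILING_INT performs in the kernels
      const size_t global_size = wg_size * ((len + wg_size - 1) / wg_size);

      q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::nd_range<1>(global_size, wg_size),
                       [=](sycl::nd_item<1> item) {
          const size_t i = item.get_global_id(0);
          if (i < len) {   // tail guard for the padded range
            y[i] += a * x[i];
          }
        });
      });
      q.wait();   // synchronize before results are read back
    }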
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ZONAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void ZONAL_ACCUMULATION_3D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ZONAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type ii = item.get_global_id(0); + Index_type i = ii + ibegin; + if (i < iend) { + ZONAL_ACCUMULATION_3D_BODY_INDEX; + ZONAL_ACCUMULATION_3D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + zones, [=] (Index_type i) { + ZONAL_ACCUMULATION_3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n ZONAL_ACCUMULATION_3D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ZONAL_ACCUMULATION_3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index be5c93000..5b0f4c20d 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -66,6 +66,9 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } ZONAL_ACCUMULATION_3D::~ZONAL_ACCUMULATION_3D() diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 2e15e3d60..5427dcd52 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -81,14 +81,20 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); + + void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 0dfedf13e2789b221973db8a6bac0bdd0d7149e1 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Fri, 3 May 2024 14:38:34 -0700 Subject: [PATCH 339/454] Code formatting --- src/basic/DAXPY.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index c63683844..db9edba60 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -59,6 +59,7 @@ class DAXPY : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > From 3c64fc4ae3629361c38093aee5a823115ab65778 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 3 May 2024 15:26:12 -0700 Subject: [PATCH 340/454] Add SYCL variants --- src/apps/CMakeLists.txt | 1 + src/apps/EDGE3D-Sycl.cpp | 84 ++++++++++++++++++++++++++++++++++++++++ src/apps/EDGE3D.cpp | 3 ++ src/apps/EDGE3D.hpp | 4 ++ 4 files changed, 92 insertions(+) create mode 100644 src/apps/EDGE3D-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index c66f81992..dc8c2048a 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -36,6 +36,7 @@ blt_add_library( EDGE3D-Seq.cpp EDGE3D-OMP.cpp EDGE3D-OMPTarget.cpp + EDGE3D-Sycl.cpp ENERGY.cpp ENERGY-Seq.cpp ENERGY-Hip.cpp diff --git a/src/apps/EDGE3D-Sycl.cpp b/src/apps/EDGE3D-Sycl.cpp new file mode 100644 index 000000000..6b60bbc3c --- /dev/null +++ b/src/apps/EDGE3D-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EDGE3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void EDGE3D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + EDGE3D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0) + ibegin; + if (i < iend) { + EDGE3D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + EDGE3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n EDGE3D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(EDGE3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index a9335f727..0e8b7c3ea 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -83,6 +83,9 @@ EDGE3D::EDGE3D(const RunParams& params) setVariantDefined( 
Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } EDGE3D::~EDGE3D() diff --git a/src/apps/EDGE3D.hpp b/src/apps/EDGE3D.hpp index 6a3e1f903..a5ae54cfa 100644 --- a/src/apps/EDGE3D.hpp +++ b/src/apps/EDGE3D.hpp @@ -417,14 +417,18 @@ class EDGE3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From e75228d86807fbc57d0ee488bbe3f7e781918060 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Mon, 20 May 2024 15:23:19 -0700 Subject: [PATCH 341/454] Update RAJA tpl to develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index c315dddd6..c5ed86ef0 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c315dddd601036f93d2f3db6f06563beb165fe77 +Subproject commit c5ed86ef0bea57e47573d7f512e203ac6ecc39c4 From 2231ae912b5d71f22badde5dad6473ac56648c63 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 21 May 2024 12:55:13 -0700 Subject: [PATCH 342/454] Update RAJA and blt submodules to latest releases --- blt | 2 +- tpl/RAJA | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/blt b/blt index 148c53ecc..9ff77344f 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 148c53ecc8bcaad5eaa4c1e39cb8144b8f1388ae +Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35 diff --git a/tpl/RAJA b/tpl/RAJA index 82d1b926a..5295aee68 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 82d1b926ada0fbb15a4a6e0adadc30c715cfda7b +Subproject commit 5295aee682f6f71bf7c3ff98f2dabce9f9529abf From 676a9b5cdf41db03e802895297b303455fb7bdc0 Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Wed, 29 May 2024 09:25:18 -0700 Subject: [PATCH 343/454] Pull in recent SYCL back-end internals cleanup --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index c5ed86ef0..4a8ab8b53 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c5ed86ef0bea57e47573d7f512e203ac6ecc39c4 +Subproject commit 4a8ab8b536201ea9fecad2df485016dc606b3fc2 From 25085198478435a81b478d6ab30c63623431dccf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 May 2024 10:30:30 -0700 Subject: [PATCH 344/454] Rename some functions --- src/algorithm/HISTOGRAM-Cuda.cpp | 17 +++++++++-------- src/algorithm/HISTOGRAM-Hip.cpp | 14 +++++++------- src/algorithm/HISTOGRAM.hpp | 8 ++++---- src/basic/MULTI_REDUCE-Cuda.cpp | 4 ++-- src/basic/MULTI_REDUCE-Hip.cpp | 4 ++-- src/basic/MULTI_REDUCE.hpp | 4 ++-- 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 9dc0c25d1..5df2d3640 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -107,7 +107,7 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid) } template < size_t block_size, size_t replication > -void HISTOGRAM::runCudaVariantReplicateGlobal(VariantID vid) +void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -217,15 +217,15 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantReplicateGlobal(vid); + runCudaVariantAtomicGlobal(vid); } @@ -262,13 +262,14 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { + + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">_"+std::to_string(block_size)); - addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ - "_global_"+std::to_string(block_size)); } diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 63475e79b..199bd121c 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -136,7 +136,7 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid) } template < size_t block_size, size_t replication > -void HISTOGRAM::runHipVariantReplicateGlobal(VariantID vid) +void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -246,15 +246,15 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto 
global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { if (tune_idx == t) { setBlockSize(block_size); - runHipVariantReplicateGlobal(vid); + runHipVariantAtomicGlobal(vid); } @@ -291,12 +291,12 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ + addVariantTuningName(vid, "replicate_"+std::to_string(global_replication)+ "_global_"+std::to_string(block_size)); } diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index c2f74bd25..7d33eae75 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -98,10 +98,10 @@ class HISTOGRAM : public KernelBase void setHipTuningDefinitions(VariantID vid); void runCudaVariantLibrary(VariantID vid); void runHipVariantLibrary(VariantID vid); - template < size_t block_size, size_t replication > - void runCudaVariantReplicateGlobal(VariantID vid); - template < size_t block_size, size_t replication > - void runHipVariantReplicateGlobal(VariantID vid); + template < size_t block_size, size_t global_replication > + void runCudaVariantAtomicGlobal(VariantID vid); + template < size_t block_size, size_t global_replication > + void runHipVariantAtomicGlobal(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 67df3e53e..435cd783e 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -37,7 +37,7 @@ __global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, template < size_t block_size, size_t replication > -void MULTI_REDUCE::runCudaVariantReplicateGlobal(VariantID vid) +void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -144,7 +144,7 @@ void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantReplicateGlobal(vid); + runCudaVariantAtomicGlobal(vid); } diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 9260ead8d..b64c6fc72 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -37,7 +37,7 @@ __global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, template < size_t block_size, size_t replication > -void MULTI_REDUCE::runHipVariantReplicateGlobal(VariantID vid) +void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -144,7 +144,7 @@ void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantReplicateGlobal(vid); + runHipVariantAtomicGlobal(vid); } diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 49680ddae..4123f483b 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -99,9 +99,9 @@ class MULTI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void 
setHipTuningDefinitions(VariantID vid); template < size_t block_size, size_t replication > - void runCudaVariantReplicateGlobal(VariantID vid); + void runCudaVariantAtomicGlobal(VariantID vid); template < size_t block_size, size_t replication > - void runHipVariantReplicateGlobal(VariantID vid); + void runHipVariantAtomicGlobal(VariantID vid); private: static const size_t default_gpu_block_size = 256; From af2f80f7f124ba73fa7264f061f0d4dd4c36b245 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 May 2024 10:32:04 -0700 Subject: [PATCH 345/454] Generalize histogram indexing --- src/algorithm/HISTOGRAM-Cuda.cpp | 38 ++++++++++++++++---------------- src/algorithm/HISTOGRAM-Hip.cpp | 34 ++++++++++++++-------------- src/algorithm/HISTOGRAM.hpp | 9 +++++--- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 5df2d3640..03ee492a1 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -24,15 +24,15 @@ namespace rajaperf namespace algorithm { -template < size_t block_size, size_t replication > +template < Index_type block_size, Index_type global_replication > __launch_bounds__(block_size) -__global__ void histogram(HISTOGRAM::Data_ptr counts, - Index_ptr bins, - Index_type iend) +__global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, + Index_ptr bins, + Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); } } @@ -106,7 +106,7 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid) } -template < size_t block_size, size_t replication > +template < size_t block_size, size_t global_replication > void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -117,27 +117,27 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) HISTOGRAM_GPU_DATA_SETUP; - RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); if ( vid == Base_CUDA ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - RPlaunchCudaKernel( (histogram), + RPlaunchCudaKernel( (histogram_atomic_global), grid_size, block_size, shmem, res.get_stream(), counts, bins, iend ); - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); - HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); @@ -147,10 +147,10 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); auto histogram_lambda = [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + 
HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); @@ -162,8 +162,8 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) shmem, res.get_stream(), ibegin, iend, histogram_lambda ); - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); - HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); @@ -173,15 +173,15 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic); + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); }); - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); - HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 199bd121c..1c7d69c59 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -29,15 +29,15 @@ namespace rajaperf namespace algorithm { -template < size_t block_size, size_t replication > +template < Index_type block_size, Index_type global_replication > __launch_bounds__(block_size) -__global__ void histogram(HISTOGRAM::Data_ptr counts, +__global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, Index_ptr bins, Index_type iend) { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); } } @@ -135,7 +135,7 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid) } -template < size_t block_size, size_t replication > +template < size_t block_size, size_t global_replication > void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -146,27 +146,27 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) HISTOGRAM_GPU_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); if ( vid == Base_HIP ) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); constexpr size_t shmem = 0; - RPlaunchHipKernel( (histogram), + RPlaunchHipKernel( (histogram_atomic_global), grid_size, block_size, shmem, res.get_stream(), counts, bins, iend ); - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); 
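For reference, the replication scheme whose indexing these hunks generalize: counts holds replication interleaved copies of every bin, each device-side atomic lands on the copy selected by the i % replication term, and HISTOGRAM_GPU_FINALIZE_VALUES sums the copies on the host. A plain-C++ sketch of the layout and the final combine (stand-alone, not the suite's macros):

    #include <cstddef>
    #include <vector>

    constexpr size_t bin_index(size_t bin, size_t offset, size_t replication)
    {
      return bin * replication + (offset % replication);
    }

    std::vector<long> combine(const std::vector<long>& counts,
                              size_t num_bins, size_t replication)
    {
      std::vector<long> final_counts(num_bins, 0);
      for (size_t b = 0; b < num_bins; ++b) {
        for (size_t r = 0; r < replication; ++r) {
          final_counts[b] += counts[bin_index(b, r, replication)];
        }
      }
      return final_counts;
    }

Spreading updates across replication addresses trades a larger counts array for a proportional drop in atomic contention on hot bins.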
- HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); @@ -176,10 +176,10 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); auto histogram_lambda = [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); @@ -191,8 +191,8 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) shmem, res.get_stream(), ibegin, iend, histogram_lambda ); - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); - HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); @@ -202,15 +202,15 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic); + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); }); - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, replication); - HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication); + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); } stopTimer(); diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 7d33eae75..82e720e75 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -41,8 +41,11 @@ #define HISTOGRAM_RAJA_BODY(policy) \ RAJA::atomicAdd(&counts[bins[i]], static_cast(1)); -#define HISTOGRAM_GPU_RAJA_BODY(policy) \ - RAJA::atomicAdd(&counts[bins[i]*replication + (i%replication)], static_cast(1)); +#define HISTOGRAM_GPU_BIN_INDEX(bin, offset, replication) \ + ((bin)*(replication) + ((offset)%(replication))) + +#define HISTOGRAM_GPU_RAJA_BODY(policy, counts, index, value) \ + RAJA::atomicAdd(&(counts)[(index)], (value)); #define HISTOGRAM_INIT_VALUES \ for (Index_type b = 0; b < num_bins; ++b ) { \ @@ -58,7 +61,7 @@ for (Index_type b = 0; b < (num_bins); ++b) { \ Data_type count_final = 0; \ for (size_t r = 0; r < (replication); ++r) { \ - count_final += (hcounts)[b*(replication) + r]; \ + count_final += (hcounts)[HISTOGRAM_GPU_BIN_INDEX(b, r, replication)]; \ } \ counts_final[b] = count_final; \ } From e3e1502e6a3aa4ab2e9b567a32a04163d6833ccb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 May 2024 10:33:18 -0700 Subject: [PATCH 346/454] Add shared and memory atomic tunings of histogram --- src/algorithm/HISTOGRAM-Cuda.cpp | 258 ++++++++++++++++++++++++++++++ 
src/algorithm/HISTOGRAM-Hip.cpp | 263 +++++++++++++++++++++++++++++++ src/algorithm/HISTOGRAM.hpp | 143 ++++++++++++++++- src/common/GPUUtils.hpp | 66 ++++++++ 4 files changed, 728 insertions(+), 2 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 03ee492a1..6e90caac7 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -24,6 +24,50 @@ namespace rajaperf namespace algorithm { +// for these models the input is block_size and the output is cache lines +using histogram_global_atomic_model = CutoffModel<512, 2, 1>; // v100 + +// for these models the input is block_size and the output is values +using histogram_shared_atomic_model = ConstantModel<4>; // v100 + + +template < size_t t_block_size, typename T, typename FunctionSignature > +struct histogram_info +{ + static constexpr size_t block_size = t_block_size; + + static size_t get_grid_size(size_t problem_size) + { + return RAJA_DIVIDE_CEILING_INT(problem_size, block_size); + } + + static size_t get_max_shmem(FunctionSignature const& func) + { + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + return func_attr.maxDynamicSharedSizeBytes; + } + + FunctionSignature const& func; + const size_t grid_size; + const MultiReduceAtomicCalculator atomic_calc; + + histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins) + : func(a_func) + , grid_size(get_grid_size(problem_size)) + , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), + histogram_global_atomic_model{}, histogram_shared_atomic_model{}) + { } + + std::string get_name() const + { + return "atomic_shared("+std::to_string(atomic_calc.shared_replication())+ + ")_global("+std::to_string(atomic_calc.global_replication())+ + ")_"+std::to_string(block_size); + } +}; + + template < Index_type block_size, Index_type global_replication > __launch_bounds__(block_size) __global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, @@ -36,6 +80,71 @@ __global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, } } +template < Index_type block_size, Index_type shared_replication, Index_type global_replication > +__launch_bounds__(block_size) +__global__ void histogram_atomic_shared_global(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type num_bins, + Index_type iend) +{ + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type i = threadIdx.x; i < num_bins * shared_replication; i += block_size) { + shared_counts[i] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, shared_counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], threadIdx.x, shared_replication), HISTOGRAM::Data_type(1)); + } + } + __syncthreads(); + + for (Index_type b = threadIdx.x; b < num_bins; b += block_size) { + Index_type i = blockIdx.x * num_bins + b; + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[HISTOGRAM_GPU_BIN_INDEX(b, s, shared_replication)]; + } + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, global_counts, HISTOGRAM_GPU_BIN_INDEX(b, i, global_replication), block_sum); + } +} + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + MultiReduceAtomicCalculator atomic_calc) +{ + extern
__shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type i = threadIdx.x; + i < Index_type(atomic_calc.num_bins() * atomic_calc.shared_replication()); + i += block_size) { + shared_counts[i] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, shared_counts, atomic_calc.get_shared_offset(bins[i], threadIdx.x), HISTOGRAM::Data_type(1)); + } + } + __syncthreads(); + + for (Index_type b = threadIdx.x; b < atomic_calc.num_bins(); b += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < atomic_calc.shared_replication(); ++s) { + block_sum += shared_counts[atomic_calc.get_shared_offset(b, s)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, global_counts, atomic_calc.get_global_offset(b, blockIdx.x), block_sum); + } + } +} + void HISTOGRAM::runCudaVariantLibrary(VariantID vid) { @@ -194,6 +303,105 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) } +template < size_t block_size, size_t shared_replication, size_t global_replication > +void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t shmem = num_bins*shared_replication*sizeof(Data_type); + + RPlaunchCudaKernel( (histogram_atomic_shared_global), + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + num_bins, + iend ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < typename MultiReduceInfo > +void HISTOGRAM::runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + static constexpr size_t block_size = info.block_size; + const size_t grid_size = info.grid_size; + const auto atomic_calc = info.atomic_calc; + const size_t global_replication = atomic_calc.global_replication(); + const size_t shmem = atomic_calc.shared_memory_in_bytes(); + + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchCudaKernel( info.func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + atomic_calc ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type b = 0; b < num_bins; ++b) { + Data_type count_final = 0; + for (size_t r = 0; r < global_replication; ++r) { + count_final += 
hcounts[atomic_calc.get_global_offset(b, r)]; + } + counts_final[b] = count_final; + } + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; + } + + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + +} + + void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -231,10 +439,44 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicShared(vid); + + } + + t += 1; + + } + + }); + } }); + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + setBlockSize(block_size); + runCudaVariantAtomicRuntime(info, vid); + + } + + t += 1; + + } + } }); @@ -270,11 +512,27 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ ">_"+std::to_string(block_size)); + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if ( vid == Base_CUDA ) { + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">_"+std::to_string(block_size)); + } + + }); } }); + if ( vid == Base_CUDA ) { + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + auto name = info.get_name(); + addVariantTuningName(vid, name.c_str()); + } + } }); diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 1c7d69c59..32ee0b686 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -29,6 +29,52 @@ namespace rajaperf namespace algorithm { +// for these models the input is block_size and the output is cache lines +using histogram_global_atomic_model = CutoffModel<512, 32, 16>; // gfx942 +// using histogram_global_atomic_model = ConstantModel<1>; // gfx90a + +// for these models the input is block_size and the output is values +using histogram_shared_atomic_model = ConstantModel<4>; // gfx942 +// using histogram_shared_atomic_model = ConstantModel<4>; // gfx90a + + +template < size_t t_block_size, typename T, typename FunctionSignature > +struct histogram_info +{ + static constexpr size_t block_size = t_block_size; + + static size_t get_grid_size(size_t problem_size) + { + return RAJA_DIVIDE_CEILING_INT(problem_size, block_size); + } + + static size_t get_max_shmem(FunctionSignature const& func) + { + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + return func_attr.maxDynamicSharedSizeBytes; + } + + FunctionSignature const& func; + const size_t grid_size; + const MultiReduceAtomicCalculator atomic_calc; + + histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins) + : func(a_func) + , grid_size(get_grid_size(problem_size)) + , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), + histogram_global_atomic_model{}, histogram_shared_atomic_model{}) + { } + + std::string get_name() const + { + return "atomic_shared("+std::to_string(atomic_calc.shared_replication())+ + ")_global("+std::to_string(atomic_calc.global_replication())+ + ")_"+std::to_string(block_size); + } +}; + + template < Index_type block_size, Index_type global_replication > __launch_bounds__(block_size) __global__ void 
histogram_atomic_global(HISTOGRAM::Data_ptr counts, @@ -41,6 +87,73 @@ __global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, } } +template < Index_type block_size, Index_type shared_replication, Index_type global_replication > +__launch_bounds__(block_size) +__global__ void histogram_atomic_shared_global(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type num_bins, + Index_type iend) +{ + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type sb = threadIdx.x; sb < num_bins * shared_replication; sb += block_size) { + shared_counts[sb] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, shared_counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], threadIdx.x, shared_replication), HISTOGRAM::Data_type(1)); + } + } + __syncthreads(); + + for (Index_type b = threadIdx.x; b < num_bins; b += block_size) { + Index_type i = blockIdx.x * num_bins + b; + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[HISTOGRAM_GPU_BIN_INDEX(b, s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, global_counts, HISTOGRAM_GPU_BIN_INDEX(b, i, global_replication), block_sum); + } + } +} + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + MultiReduceAtomicCalculator atomic_calc) +{ + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type i = threadIdx.x; + i < Index_type(atomic_calc.num_bins() * atomic_calc.shared_replication()); + i += block_size) { + shared_counts[i] = HISTOGRAM::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, shared_counts, atomic_calc.get_shared_offset(bins[i], threadIdx.x), HISTOGRAM::Data_type(1)); + } + } + __syncthreads(); + + for (Index_type b = threadIdx.x; b < atomic_calc.num_bins(); b += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < atomic_calc.shared_replication(); ++s) { + block_sum += shared_counts[atomic_calc.get_shared_offset(b, s)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, global_counts, atomic_calc.get_global_offset(b, blockIdx.x), block_sum); + } + } +} + void HISTOGRAM::runHipVariantLibrary(VariantID vid) { @@ -223,6 +336,105 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) } +template < size_t block_size, size_t shared_replication, size_t global_replication > +void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const size_t shmem = num_bins*shared_replication*sizeof(Data_type); + + RPlaunchHipKernel( (histogram_atomic_shared_global), + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + 
num_bins, + iend ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication); + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + + +template < typename MultiReduceInfo > +void HISTOGRAM::runHipVariantAtomicRuntime(MultiReduceInfo info, VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + HISTOGRAM_GPU_DATA_SETUP; + + static constexpr size_t block_size = info.block_size; + const size_t grid_size = info.grid_size; + const auto atomic_calc = info.atomic_calc; + const size_t global_replication = atomic_calc.global_replication(); + const size_t shmem = atomic_calc.shared_memory_in_bytes(); + + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + + RPlaunchHipKernel( info.func, + grid_size, block_size, + shmem, res.get_stream(), + counts, + bins, + iend, + atomic_calc ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); + for (Index_type b = 0; b < num_bins; ++b) { + Data_type count_final = 0; + for (size_t r = 0; r < global_replication; ++r) { + count_final += hcounts[atomic_calc.get_global_offset(b, r)]; + } + counts_final[b] = count_final; + } + + } + stopTimer(); + + } else { + getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; + } + + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + +} + + void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -260,10 +472,44 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) t += 1; + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicShared(vid); + + } + + t += 1; + + } + + }); + } }); + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + setBlockSize(block_size); + runHipVariantAtomicRuntime(info, vid); + + } + + t += 1; + + } + } }); @@ -299,10 +545,27 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) addVariantTuningName(vid, "replicate_"+std::to_string(global_replication)+ "_global_"+std::to_string(block_size)); + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + + if ( vid == Base_HIP ) { + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">_"+std::to_string(block_size)); + } + + }); + } }); + if ( vid == Base_HIP ) { + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + auto name = info.get_name(); + addVariantTuningName(vid, name.c_str()); + } + } }); diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 82e720e75..357346fd9 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -105,12 +105,22 @@ class HISTOGRAM : public KernelBase void runCudaVariantAtomicGlobal(VariantID vid); template < size_t block_size, size_t global_replication > void runHipVariantAtomicGlobal(VariantID vid); + 
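The AtomicShared variants declared next implement the classic two-phase privatized histogram: each block first accumulates into a shared-memory copy of the bins, then flushes one global atomic per bin per block. A condensed CUDA sketch with shared_replication fixed at 1; names and types are simplified and this is not the suite's kernel:

    #include <cuda_runtime.h>

    template < int BLOCK_SIZE >
    __global__ void histogram_shared(unsigned long long* global_counts,
                                     const int* bins, int num_bins, int n)
    {
      extern __shared__ unsigned long long s_counts[];
      for (int b = threadIdx.x; b < num_bins; b += BLOCK_SIZE) {
        s_counts[b] = 0;                      // zero the block-private bins
      }
      __syncthreads();

      int i = blockIdx.x * BLOCK_SIZE + threadIdx.x;
      if (i < n) {
        atomicAdd(&s_counts[bins[i]], 1ull);  // cheap shared-memory atomic
      }
      __syncthreads();

      for (int b = threadIdx.x; b < num_bins; b += BLOCK_SIZE) {
        if (s_counts[b] != 0) {               // skip empty bins on flush
          atomicAdd(&global_counts[b], s_counts[b]);
        }
      }
    }

It would be launched with num_bins * sizeof(unsigned long long) bytes of dynamic shared memory, the same quantity the shmem variable computes in runCudaVariantAtomicShared.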
template < size_t block_size, size_t shared_replication, size_t global_replication > + void runCudaVariantAtomicShared(VariantID vid); + template < size_t block_size, size_t shared_replication, size_t global_replication > + void runHipVariantAtomicShared(VariantID vid); + template < typename MultiReduceInfo > + void runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid); + template < typename MultiReduceInfo > + void runHipVariantAtomicRuntime(MultiReduceInfo info, VariantID vid); private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_gpu_atomic_replication = 2048; // 512, 512 - using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + static const size_t default_gpu_atomic_global_replication = 2048; // 512, 512 + // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using gpu_atomic_global_replications_type = integer::list_type<32, 64, 128, 256, 512, 1024, 2048, 4096>; + using gpu_atomic_shared_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64>; Index_type m_num_bins; Index_ptr m_bins; @@ -118,6 +128,135 @@ class HISTOGRAM : public KernelBase std::vector m_counts_final; }; + +// Compute lhs % rhs between non-negative lhs and positive power of 2 rhs +template < typename L, typename R > +constexpr auto power_of_2_mod(L lhs, R rhs) noexcept +{ + return lhs & (rhs-1); +} + +template < size_t value > +struct ConstantModel +{ + static constexpr size_t get_replication(size_t RAJAPERF_UNUSED_ARG(parallelism)) noexcept + { + return value; + } +}; + +template < size_t cutoff, size_t value_before_cutoff, size_t value_after_cutoff > +struct CutoffModel +{ + static constexpr size_t get_replication(size_t parallelism) noexcept + { + return parallelism <= cutoff ? 
value_before_cutoff : value_after_cutoff; + } +}; + +template < typename T, typename IndexType > +struct MultiReduceAtomicCalculator +{ + template < typename SharedAtomicModel > + static constexpr IndexType calculate_shared_replication(IndexType num_bins, + IndexType threads_per_block, + IndexType max_shmem_per_block_in_bytes, + SharedAtomicModel shared_atomic_model) + { + IndexType shared_replication = shared_atomic_model.get_replication(threads_per_block); + IndexType max_shared_replication = max_shmem_per_block_in_bytes / sizeof(T) / num_bins; + return prev_pow2(std::min(shared_replication, max_shared_replication)); + } + + template < typename GlobalAtomicModel > + static constexpr IndexType calculate_global_replication(IndexType threads_per_block, + IndexType blocks_per_kernel, + GlobalAtomicModel global_atomic_model) + { + IndexType global_replication = global_atomic_model.get_replication(threads_per_block); + return next_pow2(std::min(global_replication, blocks_per_kernel)); + } + + template < typename GlobalAtomicModel, typename SharedAtomicModel > + constexpr MultiReduceAtomicCalculator(IndexType num_bins, + IndexType threads_per_block, + IndexType blocks_per_kernel, + IndexType max_shmem_per_block_in_bytes, + GlobalAtomicModel global_atomic_model, + SharedAtomicModel shared_atomic_model) + : m_num_bins(num_bins) + , m_shared_replication(calculate_shared_replication(num_bins, threads_per_block, max_shmem_per_block_in_bytes, shared_atomic_model)) + , m_global_replication(calculate_global_replication(threads_per_block, blocks_per_kernel, global_atomic_model)) + { } + + // get the shared memory usage in bytes + __host__ __device__ + constexpr IndexType shared_memory_in_bytes() const + { + return m_shared_replication * m_num_bins * sizeof(T); + } + + // get the number of bins + __host__ __device__ + constexpr IndexType num_bins() const + { + return m_num_bins; + } + + // get the shared replication, always a power of 2 + __host__ __device__ + constexpr IndexType shared_replication() const + { + return m_shared_replication; + } + + // get the global replication, always a power of 2 + __host__ __device__ + constexpr IndexType global_replication() const + { + return m_global_replication; + } + + // get the offset into shared memory + __host__ __device__ + constexpr IndexType get_shared_offset(IndexType bin, IndexType rep) const + { + // make rep stride-1 to avoid bank conflicts + return bin * shared_replication() + power_of_2_mod(rep, shared_replication()); + } + + // get the offset into global memory + __host__ __device__ + constexpr IndexType get_global_offset(IndexType bin, IndexType rep) const + { + // make bin stride-1 so atomics from a single block can coalesce + return bin + power_of_2_mod(rep, global_replication()) * num_bins(); + } + + template < typename IterFinal, typename IterGlobal, typename Op > + T combine_global(IndexType bin, IterGlobal counts_global, Op combiner) + { + T count_final = combiner.identity(); + for (IndexType rep = 0; rep < global_replication(); ++rep) { + combiner(count_final, counts_global[get_global_offset(bin, rep)]); + } + return count_final; + } + + template < typename IterFinal, typename IterGlobal, typename Op > + void combine_globals(IterFinal counts_final, IterGlobal counts_global, Op combiner) + { + for (IndexType bin = 0; bin < num_bins(); ++bin) { + counts_final[bin] = combine_global(bin, counts_global, combiner); + } + } + +private: + IndexType m_num_bins; + IndexType m_shared_replication; + IndexType m_global_replication; +}; + } // end namespace algorithm } // end namespace rajaperf
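Two details of the calculator above are worth spelling out. get_shared_offset keeps the replicas of one bin adjacent (rep is stride-1) so that threads updating the same bin hit different shared-memory banks, while get_global_offset keeps bins adjacent (bin is stride-1) so the per-block flush of consecutive bins coalesces. And power_of_2_mod is only a true modulo when the modulus is a power of two, which is exactly why the constructor rounds both replication factors with prev_pow2 and next_pow2 (added to GPUUtils.hpp in this same commit, below). A small self-contained check of both layouts, with hypothetical sizes:

    #include <cstddef>

    constexpr size_t power_of_2_mod(size_t lhs, size_t rhs)
    {
      return lhs & (rhs - 1);   // correct only for power-of-2 rhs
    }

    constexpr size_t shared_offset(size_t bin, size_t rep, size_t shared_rep)
    {
      return bin * shared_rep + power_of_2_mod(rep, shared_rep);
    }

    constexpr size_t global_offset(size_t bin, size_t rep, size_t global_rep,
                                   size_t num_bins)
    {
      return bin + power_of_2_mod(rep, global_rep) * num_bins;
    }

    // replicas of a bin are contiguous in shared memory
    static_assert(shared_offset(2, 5, 4) == 9, "rep 5 wraps to replica 1");
    // bins are contiguous in the global array
    static_assert(global_offset(2, 9, 8, 10) == 12, "rep 9 wraps to replica 1");

    int main() { return 0; }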
diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index a0c55563a..1ef5d249e 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -224,6 +224,72 @@ using reducer_helpers = camp::list< } // closing brace for gpu_mapping namespace + + +/*! + \brief evaluate log base 2 of n + + For positive n calculate log base 2 of n, and round the result down to the + nearest integer. + For zero or negative n return 0 + +*/ +template < typename T, + std::enable_if_t<std::is_integral<T>::value>* = nullptr > +constexpr T log2(T n) noexcept +{ + T result = 0; + if (n > 0) { + while(n >>= 1) { + ++result; + } + } + return result; +} + +/*! + \brief "round up" to the next greatest power of 2 + + For an integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next greater power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t<std::is_integral<T>::value>* = nullptr > +constexpr T next_pow2(T n) noexcept +{ + --n; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + ++n; + return n; +} + +/*! + \brief "round down" to the next smallest power of 2 + + For an integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next smaller power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t<std::is_integral<T>::value>* = nullptr > +constexpr T prev_pow2(T n) noexcept +{ + if ( n < 0 ) return 0; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + return n - (n >> 1); +} + + + } // closing brace for rajaperf namespace // Get the max number of blocks to launch with the given MappingHelper From a46ebc7a250e9b998281daedf887cd5fb499f4c4 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 May 2024 10:33:54 -0700 Subject: [PATCH 347/454] fix bin numbering --- src/algorithm/HISTOGRAM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 8252aaec6..8d8cf2f39 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -90,7 +90,7 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) Index_type bin = 0; for (Index_type i = 0; i < actual_prob_size; ++i) { Real_type pos = static_cast<Real_type>(i) / actual_prob_size; - while (pos >= data[bin]) { + while (bin+1 < m_num_bins && pos >= data[bin]) { bin += 1; } m_bins[i] = bin; From 189d5273fa430c033978c6c188098b15de125d0b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 30 May 2024 10:35:09 -0700 Subject: [PATCH 348/454] Add option to only use one bin in histogram --- src/algorithm/HISTOGRAM.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 8d8cf2f39..116892195 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -73,8 +73,9 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) bool init_even_sizes = false; bool init_random_sizes = true; + bool init_all_one = false; bool init_random_per_iterate = false; - if (init_even_sizes || init_random_sizes) { + if (init_even_sizes || init_random_sizes || init_all_one) { Real_ptr data = nullptr; if (init_even_sizes) { allocData(data, m_num_bins, Base_Seq); @@ -84,6 +85,11 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } else if (init_random_sizes) { allocAndInitDataRandValue(data, m_num_bins, Base_Seq); std::sort(data, data+m_num_bins); + } else if
(init_all_one) { + allocData(data, m_num_bins, Base_Seq); + for (Index_type b = 0; b < m_num_bins; ++b) { + data[b] = static_cast(0); + } } Index_type actual_prob_size = getActualProblemSize(); From 2a87140d6d84a037ecea6f4f096baa265de399d5 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 31 May 2024 15:10:15 -0700 Subject: [PATCH 349/454] Enable asynchronous SYCL execution --- src/apps/LTIMES-Sycl.cpp | 4 ---- src/apps/LTIMES_NOVIEW-Sycl.cpp | 4 ---- src/basic/NESTED_INIT-Sycl.cpp | 4 ---- src/lcals/HYDRO_2D-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_2MM-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_3MM-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_ADI-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_ATAX-Sycl.cpp | 8 -------- src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_GEMM-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_GEMVER-Sycl.cpp | 8 -------- src/polybench/POLYBENCH_GESUMMV-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp | 4 ---- src/polybench/POLYBENCH_MVT-Sycl.cpp | 4 ---- 16 files changed, 72 deletions(-) diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 02c2b1181..541a132f7 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -74,11 +74,7 @@ void LTIMES::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<1, RAJA::sycl_global_2, //z RAJA::statement::For<2, RAJA::sycl_global_1, //g RAJA::statement::For<3, RAJA::sycl_global_0, //m diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index 9c961d266..d9b5cfaf6 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -72,11 +72,7 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<1, RAJA::sycl_global_2, //z RAJA::statement::For<2, RAJA::sycl_global_1, //g RAJA::statement::For<3, RAJA::sycl_global_0, //m diff --git a/src/basic/NESTED_INIT-Sycl.cpp b/src/basic/NESTED_INIT-Sycl.cpp index 5e310fabd..950a6b56b 100644 --- a/src/basic/NESTED_INIT-Sycl.cpp +++ b/src/basic/NESTED_INIT-Sycl.cpp @@ -70,11 +70,7 @@ void NESTED_INIT::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<2, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::For<0, RAJA::sycl_global_2, diff --git a/src/lcals/HYDRO_2D-Sycl.cpp b/src/lcals/HYDRO_2D-Sycl.cpp index 4b317d6ce..975467bc5 100644 --- a/src/lcals/HYDRO_2D-Sycl.cpp +++ b/src/lcals/HYDRO_2D-Sycl.cpp @@ -103,11 +103,7 @@ void HYDRO_2D::runSyclVariantImpl(VariantID vid) { using EXECPOL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> diff --git a/src/polybench/POLYBENCH_2MM-Sycl.cpp b/src/polybench/POLYBENCH_2MM-Sycl.cpp index f78ad3386..867ad780e 100644 --- a/src/polybench/POLYBENCH_2MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_2MM-Sycl.cpp @@ -99,11 +99,7 @@ void POLYBENCH_2MM::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 
RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Params<0>>, diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp index 4df0cc66c..b78c80111 100644 --- a/src/polybench/POLYBENCH_3MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp @@ -120,11 +120,7 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Params<0>>, diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp index 0c42167b3..e0dc5e317 100644 --- a/src/polybench/POLYBENCH_ADI-Sycl.cpp +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -91,11 +91,7 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<0>>, RAJA::statement::For<1, RAJA::seq_exec, diff --git a/src/polybench/POLYBENCH_ATAX-Sycl.cpp b/src/polybench/POLYBENCH_ATAX-Sycl.cpp index b2d68e3a0..110e58cd0 100644 --- a/src/polybench/POLYBENCH_ATAX-Sycl.cpp +++ b/src/polybench/POLYBENCH_ATAX-Sycl.cpp @@ -81,11 +81,7 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) using EXEC_POL1 = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::seq_exec, @@ -98,11 +94,7 @@ void POLYBENCH_ATAX::runSyclVariantImpl(VariantID vid) using EXEC_POL2 = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<1, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>, RAJA::statement::For<0, RAJA::seq_exec, diff --git a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp index a3b4e8690..b409b7569 100644 --- a/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D-Sycl.cpp @@ -120,11 +120,7 @@ void POLYBENCH_FDTD_2D::runSyclVariantImpl(VariantID vid) using EXEC_POL234 = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp index a0256d207..415470801 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL-Sycl.cpp @@ -75,11 +75,7 @@ void POLYBENCH_FLOYD_WARSHALL::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::For<2, RAJA::sycl_global_2, RAJA::statement::Lambda<0> diff --git a/src/polybench/POLYBENCH_GEMM-Sycl.cpp b/src/polybench/POLYBENCH_GEMM-Sycl.cpp index 1ad050402..2f4fc09d3 100644 --- a/src/polybench/POLYBENCH_GEMM-Sycl.cpp +++ 
b/src/polybench/POLYBENCH_GEMM-Sycl.cpp @@ -77,11 +77,7 @@ void POLYBENCH_GEMM::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Params<0>>, diff --git a/src/polybench/POLYBENCH_GEMVER-Sycl.cpp b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp index 7c19c882d..1242de063 100644 --- a/src/polybench/POLYBENCH_GEMVER-Sycl.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Sycl.cpp @@ -117,11 +117,7 @@ void POLYBENCH_GEMVER::runSyclVariantImpl(VariantID vid) using EXEC_POL1 = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> @@ -132,11 +128,7 @@ void POLYBENCH_GEMVER::runSyclVariantImpl(VariantID vid) using EXEC_POL24 = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::seq_exec, diff --git a/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp index ed43ff9eb..83197e995 100644 --- a/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp +++ b/src/polybench/POLYBENCH_GESUMMV-Sycl.cpp @@ -65,11 +65,7 @@ void POLYBENCH_GESUMMV::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Params<0,1>>, RAJA::statement::For<1, RAJA::seq_exec, diff --git a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp index 4acfb7468..27341d447 100644 --- a/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D-Sycl.cpp @@ -93,11 +93,7 @@ void POLYBENCH_HEAT_3D::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::For<1, RAJA::sycl_global_1, RAJA::statement::For<2, RAJA::sycl_global_2, diff --git a/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp index d7db021e9..ff6dab08b 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D-Sycl.cpp @@ -90,11 +90,7 @@ void POLYBENCH_JACOBI_2D::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_1, RAJA::statement::For<1, RAJA::sycl_global_2, RAJA::statement::Lambda<0> diff --git a/src/polybench/POLYBENCH_MVT-Sycl.cpp b/src/polybench/POLYBENCH_MVT-Sycl.cpp index 306cece38..c0a3879ad 100644 --- a/src/polybench/POLYBENCH_MVT-Sycl.cpp +++ b/src/polybench/POLYBENCH_MVT-Sycl.cpp @@ -82,11 +82,7 @@ void POLYBENCH_MVT::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< -#if 0 RAJA::statement::SyclKernelAsync< -#else - RAJA::statement::SyclKernel< -#endif RAJA::statement::For<0, RAJA::sycl_global_0, RAJA::statement::Lambda<0, RAJA::Params<0>>, RAJA::statement::For<1, RAJA::seq_exec, From f6aca1bab9bdb50717af20009609b040725f3f97 Mon Sep 17 00:00:00 2001 From: Jason 
Burmark Date: Mon, 3 Jun 2024 09:11:38 -0700 Subject: [PATCH 350/454] Add Apps_MATVEC_3D kernel --- src/CMakeLists.txt | 3 + src/apps/CMakeLists.txt | 6 + src/apps/MATVEC_3D-Cuda.cpp | 204 +++++++++++++++++++++++++++++++ src/apps/MATVEC_3D-Hip.cpp | 204 +++++++++++++++++++++++++++++++ src/apps/MATVEC_3D-OMP.cpp | 108 ++++++++++++++++ src/apps/MATVEC_3D-OMPTarget.cpp | 81 ++++++++++++ src/apps/MATVEC_3D-Seq.cpp | 101 +++++++++++++++ src/apps/MATVEC_3D.cpp | 185 ++++++++++++++++++++++++++++ src/apps/MATVEC_3D.hpp | 195 +++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 1094 insertions(+) create mode 100644 src/apps/MATVEC_3D-Cuda.cpp create mode 100644 src/apps/MATVEC_3D-Hip.cpp create mode 100644 src/apps/MATVEC_3D-OMP.cpp create mode 100644 src/apps/MATVEC_3D-OMPTarget.cpp create mode 100644 src/apps/MATVEC_3D-Seq.cpp create mode 100644 src/apps/MATVEC_3D.cpp create mode 100644 src/apps/MATVEC_3D.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 70b5b1d4f..4cf80e4f2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -71,6 +71,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp + apps/MATVEC_3D.cpp + apps/MATVEC_3D-Seq.cpp + apps/MATVEC_3D-OMPTarget.cpp apps/NODAL_ACCUMULATION_3D.cpp apps/NODAL_ACCUMULATION_3D-Seq.cpp apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 0ed6e9b81..23a08f665 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -78,6 +78,12 @@ blt_add_library( MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp MASS3DPA-Sycl.cpp + MATVEC_3D.cpp + MATVEC_3D-Seq.cpp + MATVEC_3D-Hip.cpp + MATVEC_3D-Cuda.cpp + MATVEC_3D-OMP.cpp + MATVEC_3D-OMPTarget.cpp NODAL_ACCUMULATION_3D.cpp NODAL_ACCUMULATION_3D-Seq.cpp NODAL_ACCUMULATION_3D-Hip.cpp diff --git a/src/apps/MATVEC_3D-Cuda.cpp b/src/apps/MATVEC_3D-Cuda.cpp new file mode 100644 index 000000000..413cfc3ee --- /dev/null +++ b/src/apps/MATVEC_3D-Cuda.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
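// The CUDA source that follows uses the suite's templated-kernel idiom:
// the block size is a compile-time template parameter so __launch_bounds__
// can be applied, and each thread guards its global index against the
// iteration bound. A minimal sketch of that idiom (the kernel name `axpy`
// and its arguments are illustrative, not part of this patch):

#include <cstddef>

template < size_t block_size >
__launch_bounds__(block_size)
__global__ void axpy(double* y, const double* x, double a, long n)
{
  long i = blockIdx.x * static_cast<long>(blockDim.x) + threadIdx.x;
  if (i < n) {          // threads past the bound do nothing
    y[i] += a * x[i];
  }
}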
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getCudaResource()}; + + MATVEC_3D_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (matvec_3d), + grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, + xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_CUDA ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Cuda) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MATVEC_3D-Hip.cpp b/src/apps/MATVEC_3D-Hip.cpp new file mode 100644 index 000000000..4793d1c98 --- /dev/null +++ b/src/apps/MATVEC_3D-Hip.cpp @@ -0,0 +1,204 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
+// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +template < size_t block_size > +__launch_bounds__(block_size) +__global__ void matvec_3d(Real_ptr b, + Real_ptr xdbl, + Real_ptr xdbc, + Real_ptr xdbr, + Real_ptr xdcl, + Real_ptr xdcc, + Real_ptr xdcr, + Real_ptr xdfl, + Real_ptr xdfc, + Real_ptr xdfr, + Real_ptr xcbl, + Real_ptr xcbc, + Real_ptr xcbr, + Real_ptr xccl, + Real_ptr xccc, + Real_ptr xccr, + Real_ptr xcfl, + Real_ptr xcfc, + Real_ptr xcfr, + Real_ptr xubl, + Real_ptr xubc, + Real_ptr xubr, + Real_ptr xucl, + Real_ptr xucc, + Real_ptr xucr, + Real_ptr xufl, + Real_ptr xufc, + Real_ptr xufr, + Real_ptr dbl, + Real_ptr dbc, + Real_ptr dbr, + Real_ptr dcl, + Real_ptr dcc, + Real_ptr dcr, + Real_ptr dfl, + Real_ptr dfc, + Real_ptr dfr, + Real_ptr cbl, + Real_ptr cbc, + Real_ptr cbr, + Real_ptr ccl, + Real_ptr ccc, + Real_ptr ccr, + Real_ptr cfl, + Real_ptr cfc, + Real_ptr cfr, + Real_ptr ubl, + Real_ptr ubc, + Real_ptr ubr, + Real_ptr ucl, + Real_ptr ucc, + Real_ptr ucr, + Real_ptr ufl, + Real_ptr ufc, + Real_ptr ufr, + Index_ptr real_zones, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i = ii + ibegin; + if (i < iend) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + } +} + + +template < size_t block_size > +void MATVEC_3D::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getHipResource()}; + + MATVEC_3D_DATA_SETUP; + + if ( vid == Base_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (matvec_3d), + grid_size, block_size, + shmem, res.get_stream(), + b, + xdbl, + xdbc, + xdbr, + xdcl, + xdcc, + xdcr, + xdfl, + xdfc, + xdfr, + xcbl, + xcbc, + xcbr, + xccl, + xccc, + xccr, + xcfl, + xcfc, + xcfr, + xubl, + xubc, + xubr, + xucl, + xucc, + xucr, + xufl, + xufc, + xufr, + dbl, + dbc, + dbr, + dcl, + dcc, + dcr, + dfl, + dfc, + dfr, + cbl, + cbc, + cbr, + ccl, + ccc, + ccr, + cfl, + cfc, + cfr, + ubl, + ubc, + ubr, + ucl, + ucc, + ucr, + ufl, + ufc, + ufr, + real_zones, + ibegin, iend ); + + } + stopTimer(); + + } else if ( vid == RAJA_HIP ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( res, + zones, [=] __device__ (Index_type i) { + MATVEC_3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D : Unknown Hip variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Hip) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MATVEC_3D-OMP.cpp b/src/apps/MATVEC_3D-OMP.cpp new file mode 100644 index 000000000..dd2f33916 --- /dev/null +++ b/src/apps/MATVEC_3D-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance 
Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_DATA_SETUP; + + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto matvec_3d_lam = [=](Index_type ii) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + matvec_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + camp::resources::Resource working_res{camp::resources::Host::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + auto matvec_3d_lam = [=](Index_type i) { + MATVEC_3D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + zones, matvec_3d_lam); + + } + stopTimer(); + + break; + } + + default : { + getCout() << "\n MATVEC_3D : Unknown variant id = " << vid << std::endl; + } + + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D-OMPTarget.cpp b/src/apps/MATVEC_3D-OMPTarget.cpp new file mode 100644 index 000000000..5be2e1e22 --- /dev/null +++ b/src/apps/MATVEC_3D-OMPTarget.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +void MATVEC_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x0,x1,x2,x3,x4,x5,x6,x7, \ + vol, real_zones) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + } + + } + stopTimer(); + + } else if ( vid == RAJA_OpenMPTarget ) { + + camp::resources::Resource working_res{camp::resources::Omp::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + zones, [=](Index_type i) { + MATVEC_3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n MATVEC_3D : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/MATVEC_3D-Seq.cpp b/src/apps/MATVEC_3D-Seq.cpp new file mode 100644 index 000000000..fcb6a6f54 --- /dev/null +++ b/src/apps/MATVEC_3D-Seq.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
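// The sequential file below shows the suite's Base/Lambda/RAJA triplet:
// the same kernel body is run as a raw loop, as a captured lambda, and
// through a RAJA policy, so only the loop driver differs between variants.
// A condensed sketch with an illustrative one-line body:

#include "RAJA/RAJA.hpp"

void variant_triplet_sketch(double* b, const double* x, long n)
{
  // Base_Seq: the body inlined in a raw loop
  for (long i = 0; i < n; ++i) { b[i] += x[i]; }

  // Lambda_Seq: the same body captured once and called per iterate
  auto body = [=](long i) { b[i] += x[i]; };
  for (long i = 0; i < n; ++i) { body(i); }

  // RAJA_Seq: the same lambda handed to a sequential execution policy
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n), body);
}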
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto matvec_3d_lam = [=](Index_type ii) { + MATVEC_3D_BODY_INDEX; + MATVEC_3D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + matvec_3d_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + camp::resources::Resource working_res{camp::resources::Host::get_default()}; + RAJA::TypedListSegment zones(real_zones, iend, + working_res, RAJA::Unowned); + + auto matvec_3d_lam = [=](Index_type i) { + MATVEC_3D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall(zones, matvec_3d_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n MATVEC_3D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D.cpp b/src/apps/MATVEC_3D.cpp new file mode 100644 index 000000000..f0872d3e8 --- /dev/null +++ b/src/apps/MATVEC_3D.cpp @@ -0,0 +1,185 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
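// The constructor in the source file below sizes the 3D domain from the
// requested problem size and counts work per repetition: each zone touches
// 27 coefficient/operand pairs, so one matvec costs 27 multiplies and 26
// adds per zone. A sketch of that arithmetic (names are illustrative):

#include <cmath>
#include <cstddef>

inline std::size_t matvec_flops_per_rep(double target_zones,
                                        std::size_t n_real_zones)
{
  long rzmax = static_cast<long>(std::cbrt(target_zones)) + 1;
  (void)rzmax;  // the suite uses this edge length to build its ADomain
  const std::size_t multiplies = 27;  // one per stencil coefficient
  const std::size_t adds = 26;        // summing 27 products takes 26 adds
  return (multiplies + adds) * n_real_zones;
}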
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MATVEC_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace apps +{ + + +MATVEC_3D::MATVEC_3D(const RunParams& params) + : KernelBase(rajaperf::Apps_MATVEC_3D, params) +{ + setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct + setDefaultReps(100); + + Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + m_domain = new ADomain(rzmax, /* ndims = */ 3); + + m_zonal_array_length = m_domain->lpz+1; + + setActualProblemSize( m_domain->n_real_zones ); + + setItsPerRep( getActualProblemSize() ); + setKernelsPerRep(1); + + // touched data size, not actual number of stores and loads + const size_t ilen = m_domain->imax - m_domain->imin; + const size_t jlen = m_domain->jmax - m_domain->jmin; + const size_t klen = m_domain->kmax - m_domain->kmin; + auto get_size_extra = [&](size_t iextra, size_t jextra, size_t kextra) { + return (ilen + iextra) * (jlen + jextra) * (klen + kextra); + }; + auto get_size_matrix = [&](size_t ishift, size_t jshift, size_t kshift) { + // get the used size of matrix coefficient allocations + return get_size_extra(0,0,0) + // real zones + (get_size_extra(0,0,0) - (ilen - ishift) * // plus some extra from the + (jlen - jshift) * // edges based on the shift + (klen - kshift)); + }; + + const size_t b_accessed = get_size_extra(0, 0, 0); + const size_t x_accessed = get_size_extra(2, 2, 2) ; + const size_t m_accessed = get_size_matrix(0, 0, 0) + + get_size_matrix(1, 0, 0) + + get_size_matrix(1, 1, 0) + + get_size_matrix(0, 1, 0) + + get_size_matrix(1, 1, 0) + + get_size_matrix(1, 1, 1) + + get_size_matrix(0, 1, 1) + + get_size_matrix(1, 1, 1) + + get_size_matrix(1, 0, 1) + + get_size_matrix(0, 0, 1) + + get_size_matrix(1, 0, 1) + + get_size_matrix(1, 1, 1) + + get_size_matrix(0, 1, 1) + + get_size_matrix(1, 1, 1) ; + setBytesPerRep( getItsPerRep()*sizeof(Index_type) + + b_accessed*sizeof(Real_type) + + x_accessed*sizeof(Real_type) + + m_accessed*sizeof(Real_type) ); + + const size_t multiplies = 27; + const size_t adds = 26; + setFLOPsPerRep((multiplies + adds) * getItsPerRep()); + + checksum_scale_factor = 1.0 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + setUsesFeature(Forall); + + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); +} + +MATVEC_3D::~MATVEC_3D() +{ + delete m_domain; +} + +void MATVEC_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + allocAndInitDataConst(m_b, m_zonal_array_length, 0.0, vid); + allocAndInitData(m_x, m_zonal_array_length, vid); + + allocAndInitData(m_matrix.dbl, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dbc, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dbr, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dcl, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dcc, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dcr, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dfl, m_zonal_array_length, vid); + 
allocAndInitData(m_matrix.dfc, m_zonal_array_length, vid); + allocAndInitData(m_matrix.dfr, m_zonal_array_length, vid); + allocAndInitData(m_matrix.cbl, m_zonal_array_length, vid); + allocAndInitData(m_matrix.cbc, m_zonal_array_length, vid); + allocAndInitData(m_matrix.cbr, m_zonal_array_length, vid); + allocAndInitData(m_matrix.ccl, m_zonal_array_length, vid); + allocAndInitData(m_matrix.ccc, m_zonal_array_length, vid); + m_matrix.ccr = m_matrix.ccl + 1 ; + m_matrix.cfl = m_matrix.cbr - 1 + m_domain->jp ; + m_matrix.cfc = m_matrix.cbc + m_domain->jp ; + m_matrix.cfr = m_matrix.cbl + 1 + m_domain->jp ; + m_matrix.ubl = m_matrix.dfr - 1 - m_domain->jp + m_domain->kp ; + m_matrix.ubc = m_matrix.dfc - m_domain->jp + m_domain->kp ; + m_matrix.ubr = m_matrix.dfl + 1 - m_domain->jp + m_domain->kp ; + m_matrix.ucl = m_matrix.dcr - 1 + m_domain->kp ; + m_matrix.ucc = m_matrix.dcc + m_domain->kp ; + m_matrix.ucr = m_matrix.dcl + 1 + m_domain->kp ; + m_matrix.ufl = m_matrix.dbr - 1 + m_domain->jp + m_domain->kp ; + m_matrix.ufc = m_matrix.dbc + m_domain->jp + m_domain->kp ; + m_matrix.ufr = m_matrix.dbl + 1 + m_domain->jp + m_domain->kp ; + + allocAndInitDataConst(m_real_zones, m_domain->n_real_zones, + static_cast(-1), vid); + + { + auto reset_rz = scopedMoveData(m_real_zones, m_domain->n_real_zones, vid); + + setRealZones_3d(m_real_zones, *m_domain); + } + +} + +void MATVEC_3D::updateChecksum(VariantID vid, size_t tune_idx) +{ + checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, checksum_scale_factor , vid); +} + +void MATVEC_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + + deallocData(m_b, vid); + deallocData(m_x, vid); + + deallocData(m_matrix.dbl, vid); + deallocData(m_matrix.dbc, vid); + deallocData(m_matrix.dbr, vid); + deallocData(m_matrix.dcl, vid); + deallocData(m_matrix.dcc, vid); + deallocData(m_matrix.dcr, vid); + deallocData(m_matrix.dfl, vid); + deallocData(m_matrix.dfc, vid); + deallocData(m_matrix.dfr, vid); + deallocData(m_matrix.cbl, vid); + deallocData(m_matrix.cbc, vid); + deallocData(m_matrix.cbr, vid); + deallocData(m_matrix.ccl, vid); + deallocData(m_matrix.ccc, vid); + + deallocData(m_real_zones, vid); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MATVEC_3D.hpp b/src/apps/MATVEC_3D.hpp new file mode 100644 index 000000000..650acba93 --- /dev/null +++ b/src/apps/MATVEC_3D.hpp @@ -0,0 +1,195 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
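// In MATVEC_3D::setUp above, only the "down" and "center" coefficient
// bands are allocated; the remaining bands are shifted aliases into that
// same storage (ccr = ccl + 1, ucc = dcc + kp, and so on). This encodes a
// symmetric matrix: the coefficient zone i applies to a neighbor equals
// the coefficient that neighbor applies back to zone i, so each
// off-diagonal band is stored exactly once. A 1D sketch of the same trick
// (names are illustrative):

#include <cstddef>
#include <vector>

void symmetric_band_sketch(std::size_t n)
{
  std::vector<double> storage(n + 1, 1.0);
  double* ccl = storage.data();  // left-neighbor coefficients, rows 0..n-1
  double* ccr = ccl + 1;         // right-neighbor band aliases the same data
  // Now ccr[i] == ccl[i+1]: row i's coefficient for its right neighbor is
  // row i+1's coefficient for its left neighbor, as symmetry requires.
  (void)ccr;
}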
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// MATVEC_3D kernel reference implementation: +/// +/// for (Index_type ii = ibegin; ii < iend; ++ii ) { +/// Index_type i = real_zones[ii]; +/// +/// b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + +/// dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + +/// dfl[i] * xdfl[i] + dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + +/// +/// cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + +/// ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + +/// cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + +/// +/// ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + +/// ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + +/// ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; +/// +/// } +/// + +#ifndef RAJAPerf_Apps_MATVEC_3D_HPP +#define RAJAPerf_Apps_MATVEC_3D_HPP + +#define MATVEC_3D_DATA_SETUP \ + Real_ptr b = m_b; \ + \ + Real_ptr xdbl = m_x - m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xdbc = m_x - m_domain->kp - m_domain->jp ; \ + Real_ptr xdbr = m_x - m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xdcl = m_x - m_domain->kp - 1 ; \ + Real_ptr xdcc = m_x - m_domain->kp ; \ + Real_ptr xdcr = m_x - m_domain->kp + 1 ; \ + Real_ptr xdfl = m_x - m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xdfc = m_x - m_domain->kp + m_domain->jp ; \ + Real_ptr xdfr = m_x - m_domain->kp + m_domain->jp + 1 ; \ + Real_ptr xcbl = m_x - m_domain->jp - 1 ; \ + Real_ptr xcbc = m_x - m_domain->jp ; \ + Real_ptr xcbr = m_x - m_domain->jp + 1 ; \ + Real_ptr xccl = m_x - 1 ; \ + Real_ptr xccc = m_x ; \ + Real_ptr xccr = m_x + 1 ; \ + Real_ptr xcfl = m_x + m_domain->jp - 1 ; \ + Real_ptr xcfc = m_x + m_domain->jp ; \ + Real_ptr xcfr = m_x + m_domain->jp + 1 ; \ + Real_ptr xubl = m_x + m_domain->kp - m_domain->jp - 1 ; \ + Real_ptr xubc = m_x + m_domain->kp - m_domain->jp ; \ + Real_ptr xubr = m_x + m_domain->kp - m_domain->jp + 1 ; \ + Real_ptr xucl = m_x + m_domain->kp - 1 ; \ + Real_ptr xucc = m_x + m_domain->kp ; \ + Real_ptr xucr = m_x + m_domain->kp + 1 ; \ + Real_ptr xufl = m_x + m_domain->kp + m_domain->jp - 1 ; \ + Real_ptr xufc = m_x + m_domain->kp + m_domain->jp ; \ + Real_ptr xufr = m_x + m_domain->kp + m_domain->jp + 1 ; \ + \ + Real_ptr dbl = m_matrix.dbl; \ + Real_ptr dbc = m_matrix.dbc; \ + Real_ptr dbr = m_matrix.dbr; \ + Real_ptr dcl = m_matrix.dcl; \ + Real_ptr dcc = m_matrix.dcc; \ + Real_ptr dcr = m_matrix.dcr; \ + Real_ptr dfl = m_matrix.dfl; \ + Real_ptr dfc = m_matrix.dfc; \ + Real_ptr dfr = m_matrix.dfr; \ + Real_ptr cbl = m_matrix.cbl; \ + Real_ptr cbc = m_matrix.cbc; \ + Real_ptr cbr = m_matrix.cbr; \ + Real_ptr ccl = m_matrix.ccl; \ + Real_ptr ccc = m_matrix.ccc; \ + Real_ptr ccr = m_matrix.ccr; \ + Real_ptr cfl = m_matrix.cfl; \ + Real_ptr cfc = m_matrix.cfc; \ + Real_ptr cfr = m_matrix.cfr; \ + Real_ptr ubl = m_matrix.ubl; \ + Real_ptr ubc = m_matrix.ubc; \ + Real_ptr ubr = m_matrix.ubr; \ + Real_ptr ucl = m_matrix.ucl; \ + Real_ptr ucc = m_matrix.ucc; \ + Real_ptr ucr = m_matrix.ucr; \ + Real_ptr ufl = m_matrix.ufl; \ + Real_ptr ufc = m_matrix.ufc; \ + Real_ptr ufr = m_matrix.ufr; \ + \ + Index_ptr real_zones = m_real_zones; + +#define MATVEC_3D_BODY_INDEX \ + Index_type i = real_zones[ii]; + +#define MATVEC_3D_BODY \ + b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + \ + dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + \ + dfl[i] * xdfl[i] + dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + \ + \ + 
cbl[i] * xcbl[i] + cbc[i] * xcbc[i] + cbr[i] * xcbr[i] + \ + ccl[i] * xccl[i] + ccc[i] * xccc[i] + ccr[i] * xccr[i] + \ + cfl[i] * xcfl[i] + cfc[i] * xcfc[i] + cfr[i] * xcfr[i] + \ + \ + ubl[i] * xubl[i] + ubc[i] * xubc[i] + ubr[i] * xubr[i] + \ + ucl[i] * xucl[i] + ucc[i] * xucc[i] + ucr[i] * xucr[i] + \ + ufl[i] * xufl[i] + ufc[i] * xufc[i] + ufr[i] * xufr[i] ; \ + + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ +class ADomain; + +class MATVEC_3D : public KernelBase +{ +public: + + MATVEC_3D(const RunParams& params); + + ~MATVEC_3D(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; + + struct Matrix + { + Real_ptr dbl; + Real_ptr dbc; + Real_ptr dbr; + Real_ptr dcl; + Real_ptr dcc; + Real_ptr dcr; + Real_ptr dfl; + Real_ptr dfc; + Real_ptr dfr; + Real_ptr cbl; + Real_ptr cbc; + Real_ptr cbr; + Real_ptr ccl; + Real_ptr ccc; + Real_ptr ccr; + Real_ptr cfl; + Real_ptr cfc; + Real_ptr cfr; + Real_ptr ubl; + Real_ptr ubc; + Real_ptr ubr; + Real_ptr ucl; + Real_ptr ucc; + Real_ptr ucr; + Real_ptr ufl; + Real_ptr ufc; + Real_ptr ufr; + }; + + Real_ptr m_b; + Real_ptr m_x; + Matrix m_matrix; + + ADomain* m_domain; + Index_type* m_real_zones; + Index_type m_zonal_array_length; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index f89ea35e7..8316050d6 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -90,6 +90,7 @@ #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DEA.hpp" #include "apps/MASS3DPA.hpp" +#include "apps/MATVEC_3D.hpp" #include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" @@ -241,6 +242,7 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), + std::string("Apps_MATVEC_3D"), std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), @@ -994,6 +996,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_MATVEC_3D : { + kernel = new apps::MATVEC_3D(run_params); + break; + } case Apps_NODAL_ACCUMULATION_3D : { kernel = new apps::NODAL_ACCUMULATION_3D(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 7954328ad..f3d8a5041 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -150,6 +150,7 @@ enum KernelID { Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, + Apps_MATVEC_3D, Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, From 123e285a4cd2ee7ce8a670520b5325671e62e362 Mon Sep 17 00:00:00 2001 From: Rich 
Hornung Date: Mon, 3 Jun 2024 10:50:40 -0700 Subject: [PATCH 351/454] Pull in latest RAJA develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index c315dddd6..b00291142 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c315dddd601036f93d2f3db6f06563beb165fe77 +Subproject commit b0029114273973b77a23bf70f3390540452f3bdd From 9f93a0595d3d65ba13f8c7667c79fb42d2e33cfb Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Mon, 3 Jun 2024 15:19:11 -0700 Subject: [PATCH 352/454] Resolve merge conflicts to prepare for merging. --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 4a8ab8b53..b00291142 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 4a8ab8b536201ea9fecad2df485016dc606b3fc2 +Subproject commit b0029114273973b77a23bf70f3390540452f3bdd From 0a18f792064e2dde4c829b921e38bc1959ba9641 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Wed, 5 Jun 2024 10:59:14 -0700 Subject: [PATCH 353/454] Add SYCL variants of ARRAY_OF_PTRS kernel --- src/basic/ARRAY_OF_PTRS-Sycl.cpp | 84 ++++++++++++++++++++++++++++++++ src/basic/ARRAY_OF_PTRS.cpp | 3 ++ src/basic/ARRAY_OF_PTRS.hpp | 4 ++ src/basic/CMakeLists.txt | 1 + 4 files changed, 92 insertions(+) create mode 100644 src/basic/ARRAY_OF_PTRS-Sycl.cpp diff --git a/src/basic/ARRAY_OF_PTRS-Sycl.cpp b/src/basic/ARRAY_OF_PTRS-Sycl.cpp new file mode 100644 index 000000000..254d9573e --- /dev/null +++ b/src/basic/ARRAY_OF_PTRS-Sycl.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
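// The SYCL sources added from this patch onward share one launch idiom:
// round the iteration count up to a multiple of the work-group size,
// launch an nd_range, and guard the tail items. A minimal standalone
// sketch (the queue, arrays, and axpy body are illustrative):

#include <sycl/sycl.hpp>
#include <cstddef>

void axpy_sycl_sketch(sycl::queue& q, double* y, const double* x,
                      double a, std::size_t n, std::size_t wg_size)
{
  const std::size_t global_size = wg_size * ((n + wg_size - 1) / wg_size);
  q.submit([&](sycl::handler& h) {
    h.parallel_for(sycl::nd_range<1>(global_size, wg_size),
                   [=](sycl::nd_item<1> item) {
      std::size_t i = item.get_global_id(0);
      if (i < n) {   // guard items past the rounded-up bound
        y[i] += a * x[i];
      }
    });
  });
  q.wait();  // the suite instead synchronizes around its timers
}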
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ARRAY_OF_PTRS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void ARRAY_OF_PTRS::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ARRAY_OF_PTRS_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + ARRAY_OF_PTRS_Array x_array = x; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + ARRAY_OF_PTRS_BODY(x_array.array); + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + ARRAY_OF_PTRS_BODY(x); + }); + + } + stopTimer(); + + } else { + getCout() << "\n ARRAY_OF_PTRS : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ARRAY_OF_PTRS, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 095e52781..17f6e5c1c 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -54,6 +54,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); + setVariantDefined( Kokkos_Lambda ); } diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index b32e10254..26ac4c78e 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -72,16 +72,20 @@ class ARRAY_OF_PTRS : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index c5b5f8f57..3fe5f1d9a 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -14,6 +14,7 @@ blt_add_library( ARRAY_OF_PTRS-Cuda.cpp ARRAY_OF_PTRS-OMP.cpp ARRAY_OF_PTRS-OMPTarget.cpp + ARRAY_OF_PTRS-Sycl.cpp COPY8.cpp COPY8-Seq.cpp COPY8-Hip.cpp From d03bba23847bbc55cfc841a235e9a60805ab5dcf Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Wed, 5 Jun 2024 11:00:13 -0700 Subject: [PATCH 354/454] Add SYCL variants of COPY8 kernel --- src/basic/CMakeLists.txt | 1 + src/basic/COPY8-Sycl.cpp | 82 ++++++++++++++++++++++++++++++++++++++++ src/basic/COPY8.cpp | 3 ++ src/basic/COPY8.hpp | 4 ++ 4 files changed, 90 insertions(+) create mode 100644 src/basic/COPY8-Sycl.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 3fe5f1d9a..7439c46c9 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -21,6 +21,7 @@ blt_add_library( COPY8-Cuda.cpp COPY8-OMP.cpp COPY8-OMPTarget.cpp + COPY8-Sycl.cpp DAXPY.cpp DAXPY-Seq.cpp DAXPY-Hip.cpp diff --git a/src/basic/COPY8-Sycl.cpp b/src/basic/COPY8-Sycl.cpp new file mode 100644 index 000000000..8ce2a8e24 --- /dev/null +++ b/src/basic/COPY8-Sycl.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY8.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +template < size_t work_group_size > +void COPY8::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + COPY8_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + [=] (sycl::nd_item<1> item ) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + COPY8_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) { + COPY8_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n COPY8 : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(COPY8, Sycl) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 4d22c4336..3f6e49044 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -51,6 +51,9 @@ COPY8::COPY8(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } COPY8::~COPY8() diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index 572754540..61945eed3 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -79,16 +79,20 @@ class COPY8 : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void runKokkosVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void 
setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 33dcca6573ae8bdbd2eb826379b82e35efd4f481 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Wed, 5 Jun 2024 13:14:58 -0700 Subject: [PATCH 355/454] Add SYCL variants of ZONAL_ACCUMULATION_3D kernel --- src/apps/CMakeLists.txt | 1 + src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp | 89 +++++++++++++++++++++++++ src/apps/ZONAL_ACCUMULATION_3D.cpp | 3 + src/apps/ZONAL_ACCUMULATION_3D.hpp | 4 ++ 4 files changed, 97 insertions(+) create mode 100644 src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 0ed6e9b81..c66f81992 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -104,5 +104,6 @@ blt_add_library( ZONAL_ACCUMULATION_3D-Cuda.cpp ZONAL_ACCUMULATION_3D-OMP.cpp ZONAL_ACCUMULATION_3D-OMPTarget.cpp + ZONAL_ACCUMULATION_3D-Sycl.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp new file mode 100644 index 000000000..d3f097f2b --- /dev/null +++ b/src/apps/ZONAL_ACCUMULATION_3D-Sycl.cpp @@ -0,0 +1,89 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ZONAL_ACCUMULATION_3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +template < size_t work_group_size > +void ZONAL_ACCUMULATION_3D::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + ZONAL_ACCUMULATION_3D_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + qu->submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1> (grid_size, work_group_size), + [=] (sycl::nd_item<1> item) { + + Index_type ii = item.get_global_id(0); + Index_type i = ii + ibegin; + if (i < iend) { + ZONAL_ACCUMULATION_3D_BODY_INDEX; + ZONAL_ACCUMULATION_3D_BODY; + } + + }); + }); + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + RAJA::TypedListSegment zones(real_zones, iend, + res, RAJA::Unowned); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::sycl_exec >( res, + zones, [=] (Index_type i) { + ZONAL_ACCUMULATION_3D_BODY; + }); + + } + stopTimer(); + + } else { + getCout() << "\n ZONAL_ACCUMULATION_3D : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(ZONAL_ACCUMULATION_3D, Sycl) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git 
a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index be5c93000..5b0f4c20d 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -66,6 +66,9 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Base_SYCL ); + setVariantDefined( RAJA_SYCL ); } ZONAL_ACCUMULATION_3D::~ZONAL_ACCUMULATION_3D() diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 2e15e3d60..572490caa 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -81,14 +81,18 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); template < size_t block_size > void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 577e03e6db39a2d4c802394ff9f53d4837ca47bb Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 5 Jun 2024 22:44:33 +0200 Subject: [PATCH 356/454] Use rocm@6.1.1 and update Spack (#447) * Use rocm@6.1.1 for local and shared tioga CI jobs * Update RAJA to develop with rocm 6.1.1 support merged --- .gitlab/jobs/tioga.yml | 8 ++++---- .uberenv_config.json | 2 +- tpl/RAJA | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index decc3baa4..bcf9eccb8 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -29,13 +29,13 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
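# A job pinned to a particular ROCm stack follows the shape below; the
# version fields are the only pieces that change when the toolchain is
# bumped (X.Y.Z here is a placeholder, not part of this change):
#
#   rocmcc_X_Y_Z_hip_openmp:
#     variables:
#       SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=X.Y.Z ^hip@X.Y.Z ${PROJECT_TIOGA_DEPS}"
#     extends: .job_on_tioga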
-rocmcc_5_7_1_hip_openmp: +rocmcc_6_1_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" + SPEC: "~shared +rocm +openmp amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga -rocmcc_5_7_1_hip_openmp_mpi: +rocmcc_6_1_1_hip_openmp_mpi: variables: - SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ${PROJECT_TIOGA_DEPS}" + SPEC: "~shared +rocm +openmp +mpi amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ${PROJECT_TIOGA_DEPS}" extends: .job_on_tioga allow_failure: true diff --git a/.uberenv_config.json b/.uberenv_config.json index cb9f9a5dd..fda595d3a 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "develop-2024-02-18", +"spack_branch": "develop-2024-05-26", "spack_activate" : {}, "spack_configs_path": "tpl/RAJA/scripts/radiuss-spack-configs", "spack_packages_path": "tpl/RAJA/scripts/radiuss-spack-configs/packages", diff --git a/tpl/RAJA b/tpl/RAJA index b00291142..dfaab80f7 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit b0029114273973b77a23bf70f3390540452f3bdd +Subproject commit dfaab80f75dc8f9fc872de233ffa13b104ebac55 From c4a41a42b2925cf4f9b7eca2d1746e1d7bd7ee49 Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 6 Jun 2024 08:21:17 -0700 Subject: [PATCH 357/454] Fix global dim --- src/polybench/POLYBENCH_3MM-Sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/polybench/POLYBENCH_3MM-Sycl.cpp b/src/polybench/POLYBENCH_3MM-Sycl.cpp index b78c80111..b6abea7b9 100644 --- a/src/polybench/POLYBENCH_3MM-Sycl.cpp +++ b/src/polybench/POLYBENCH_3MM-Sycl.cpp @@ -94,7 +94,7 @@ void POLYBENCH_3MM::runSyclVariantImpl(VariantID vid) }); qu->submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<3>( global_dim2, wkgroup_dim), + h.parallel_for(sycl::nd_range<3>( global_dim3, wkgroup_dim), [=] (sycl::nd_item<3> item) { Index_type i = item.get_global_id(1); From 108d96a7966657f8f689ec498dd617e24c47e8de Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Thu, 6 Jun 2024 08:22:00 -0700 Subject: [PATCH 358/454] Change thread index dimension for consistency. --- src/polybench/POLYBENCH_ADI-Sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/polybench/POLYBENCH_ADI-Sycl.cpp b/src/polybench/POLYBENCH_ADI-Sycl.cpp index e0dc5e317..0fb3dfb4c 100644 --- a/src/polybench/POLYBENCH_ADI-Sycl.cpp +++ b/src/polybench/POLYBENCH_ADI-Sycl.cpp @@ -92,7 +92,7 @@ void POLYBENCH_ADI::runSyclVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::SyclKernelAsync< - RAJA::statement::For<0, RAJA::sycl_global_0, + RAJA::statement::For<0, RAJA::sycl_global_2, RAJA::statement::Lambda<0, RAJA::Segs<0>>, RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::Lambda<1, RAJA::Segs<0,1>> From 4c9410eab2dadc3c595c0a082e5521a2f251c8fb Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Thu, 6 Jun 2024 10:13:51 -0700 Subject: [PATCH 359/454] Remove unnecessary code not needed for lambda capture --- src/basic/ARRAY_OF_PTRS-Sycl.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/basic/ARRAY_OF_PTRS-Sycl.cpp b/src/basic/ARRAY_OF_PTRS-Sycl.cpp index 254d9573e..c3d987987 100644 --- a/src/basic/ARRAY_OF_PTRS-Sycl.cpp +++ b/src/basic/ARRAY_OF_PTRS-Sycl.cpp @@ -36,8 +36,6 @@ void ARRAY_OF_PTRS::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - ARRAY_OF_PTRS_Array x_array = x; - startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -49,7 +47,7 @@ void ARRAY_OF_PTRS::runSyclVariantImpl(VariantID vid) Index_type i = item.get_global_id(0); if (i < iend) { - ARRAY_OF_PTRS_BODY(x_array.array); + ARRAY_OF_PTRS_BODY(x); } }); From ff00d13cc1d443ac2cf8509f6ff6ad1a21cae029 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 6 Jun 2024 13:11:04 -0700 Subject: [PATCH 360/454] Bumping poodle shared allocation time in attempt to prevent timeouts --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 2735341b4..931d1961b 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -26,7 +26,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=20 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=40 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From c7ef954d8594834d9c5585b14a0bea4cbfe368fe Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:16:11 +0200 Subject: [PATCH 361/454] Update RADIUSS Shared CI to fix allocation release job --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9f7020a98..ef751105d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -72,7 +72,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2024.04.0' + ref: 'v2024.06.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -85,7 +85,7 @@ include: file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test #- project: 'radiuss/radiuss-shared-ci' - # ref: 'v2024.04.0' + # ref: 'v2024.06.0' # file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' From 48d4a171447eedaac288b4ccabef2d6ef088353d Mon Sep 17 00:00:00 2001 From: "Richard D. 
Hornung" Date: Fri, 7 Jun 2024 07:57:24 -0700 Subject: [PATCH 362/454] bump poodle ci shared allocation time in attempt to avoid timeout --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 2735341b4..931d1961b 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -26,7 +26,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=20 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=40 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From 0d166be7a2527d7c07fd2b156f182ec99813368d Mon Sep 17 00:00:00 2001 From: "Richard D. Hornung" Date: Fri, 7 Jun 2024 15:42:40 -0700 Subject: [PATCH 363/454] Add PI_REDUCE kernel --- src/basic/CMakeLists.txt | 1 + src/basic/PI_REDUCE-Sycl.cpp | 109 +++++++++++++++++++++++++++++++++++ src/basic/PI_REDUCE.cpp | 3 + src/basic/PI_REDUCE.hpp | 5 ++ 4 files changed, 118 insertions(+) create mode 100644 src/basic/PI_REDUCE-Sycl.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 7439c46c9..2b10f2bee 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -108,6 +108,7 @@ blt_add_library( PI_REDUCE-Cuda.cpp PI_REDUCE-OMP.cpp PI_REDUCE-OMPTarget.cpp + PI_REDUCE-Sycl.cpp REDUCE3_INT.cpp REDUCE3_INT-Seq.cpp REDUCE3_INT-Hip.cpp diff --git a/src/basic/PI_REDUCE-Sycl.cpp b/src/basic/PI_REDUCE-Sycl.cpp new file mode 100644 index 000000000..0b7a6b256 --- /dev/null +++ b/src/basic/PI_REDUCE-Sycl.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "PI_REDUCE.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+#include
+#include
+#include
+
+
+namespace rajaperf
+{
+namespace basic
+{
+
+
+template < size_t work_group_size >
+void PI_REDUCE::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  PI_REDUCE_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    Real_ptr pi;
+    allocAndInitSyclDeviceData(pi, &m_pi_init, 1, qu);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      initSyclDeviceData(pi, &m_pi_init, 1, qu);
+
+      qu->submit([&] (sycl::handler& hdl) {
+
+        auto sum_reduction = sycl::reduction(pi, sycl::plus<>());
+
+        hdl.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                         sum_reduction,
+                         [=] (sycl::nd_item<1> item, auto& pi) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            PI_REDUCE_BODY;
+          }
+
+        });
+      });
+
+      Real_type lpi;
+      Real_ptr plpi = &lpi;
+      getSyclDeviceData(plpi, pi, 1, qu);
+      m_pi = 4.0 * lpi;
+
+    }
+    stopTimer();
+
+    deallocSyclDeviceData(pi, qu);
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tpi = m_pi_init;
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+        [=] (Index_type i, Real_type& pi) {
+          PI_REDUCE_BODY;
+        }
+      );
+
+      m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  PI_REDUCE : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(PI_REDUCE, Sycl)
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_SYCL
diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index 998ecf5f1..2ae27e762 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -51,6 +51,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 PI_REDUCE::~PI_REDUCE()
diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp
index 3a00fc638..4dc3104d3 100644
--- a/src/basic/PI_REDUCE.hpp
+++ b/src/basic/PI_REDUCE.hpp
@@ -56,9 +56,11 @@ class PI_REDUCE : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
 
   template < size_t block_size, typename MappingHelper >
   void runCudaVariantBase(VariantID vid);
@@ -70,6 +72,9 @@ class PI_REDUCE : public KernelBase
   template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
   void runHipVariantRAJA(VariantID vid);
 
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
+
 private:
   static const size_t default_gpu_block_size = 256;
   using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;

From 4c88b3a1aad72b22cb5f57237fd7c9870357b9e9 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Fri, 7 Jun 2024 15:43:16 -0700
Subject: [PATCH 364/454] Convert SYCL reductions to new interface

---
 src/basic/REDUCE3_INT-Sycl.cpp | 48 +++++++++++++++++-----------------
 src/basic/TRAP_INT-Sycl.cpp    | 15 ++++++-----
 2 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/src/basic/REDUCE3_INT-Sycl.cpp b/src/basic/REDUCE3_INT-Sycl.cpp
index f90a39d81..bb99e7afc 100644
--- a/src/basic/REDUCE3_INT-Sycl.cpp
+++ b/src/basic/REDUCE3_INT-Sycl.cpp
@@ -21,18 +21,6 @@ namespace rajaperf
 namespace basic
 {
 
-#define REDUCE3_INT_DATA_SETUP_SYCL \
-  Int_ptr hsum; \
-  allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu); \
-  Int_ptr hmin; \
-  allocAndInitSyclDeviceData(hmin, &m_vmin_init, 1, qu); \
-  Int_ptr hmax; \
-  allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu);
-
-#define REDUCE3_INT_DATA_TEARDOWN_SYCL \
-  deallocSyclDeviceData(hsum, qu); \
-  deallocSyclDeviceData(hmin, qu); \
-  deallocSyclDeviceData(hmax, qu);
 
 template <size_t work_group_size >
 void REDUCE3_INT::runSyclVariantImpl(VariantID vid)
@@ -48,7 +36,12 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid)
 
   if ( vid == Base_SYCL ) {
 
-    REDUCE3_INT_DATA_SETUP_SYCL;
+    Int_ptr hsum;
+    allocAndInitSyclDeviceData(hsum, &m_vsum_init, 1, qu);
+    Int_ptr hmin;
+    allocAndInitSyclDeviceData(hmin, &m_vmin_init, 1, qu);
+    Int_ptr hmax;
+    allocAndInitSyclDeviceData(hmax, &m_vmax_init, 1, qu);
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -98,25 +91,32 @@ void REDUCE3_INT::runSyclVariantImpl(VariantID vid)
     } // for (RepIndex_type irep = ...
     stopTimer();
 
-    REDUCE3_INT_DATA_TEARDOWN_SYCL;
+    deallocSyclDeviceData(hsum, qu);
+    deallocSyclDeviceData(hmin, qu);
+    deallocSyclDeviceData(hmax, qu);
 
   } else if ( vid == RAJA_SYCL ) {
 
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::sycl_reduce, Int_type> vsum(m_vsum_init);
-      RAJA::ReduceMin<RAJA::sycl_reduce, Int_type> vmin(m_vmin_init);
-      RAJA::ReduceMax<RAJA::sycl_reduce, Int_type> vmax(m_vmax_init);
+      Int_type tvsum = m_vsum_init;
+      Int_type tvmin = m_vmin_init;
+      Int_type tvmax = m_vmax_init;
 
       RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
-        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
-        REDUCE3_INT_BODY_RAJA;
-      });
-
-      m_vsum += static_cast<Int_type>(vsum.get());
-      m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(vmin.get()));
-      m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(vmax.get()));
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tvsum),
+        RAJA::expt::Reduce<RAJA::operators::minimum>(&tvmin),
+        RAJA::expt::Reduce<RAJA::operators::maximum>(&tvmax),
+        [=] (Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) {
+          REDUCE3_INT_BODY;
+        }
+      );
+
+      m_vsum += static_cast<Int_type>(tvsum);
+      m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(tvmin));
+      m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(tvmax));
 
     }
     stopTimer();
diff --git a/src/basic/TRAP_INT-Sycl.cpp b/src/basic/TRAP_INT-Sycl.cpp
index e8f066c1d..7443d9604 100644
--- a/src/basic/TRAP_INT-Sycl.cpp
+++ b/src/basic/TRAP_INT-Sycl.cpp
@@ -79,14 +79,17 @@ void TRAP_INT::runSyclVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::ReduceSum<RAJA::sycl_reduce, Real_type> sumx(m_sumx_init);
+      Real_type tsumx = m_sumx_init;
 
       RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
-        RAJA::RangeSegment(ibegin, iend), [=] (Index_type i) {
-        TRAP_INT_BODY;
-      });
-
-      m_sumx += static_cast<Real_type>(sumx.get()) * h;
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tsumx),
+        [=] (Index_type i, Real_type& sumx) {
+          TRAP_INT_BODY;
+        }
+      );
+
+      m_sumx += static_cast<Real_type>(tsumx) * h;
 
     }
     stopTimer();

From 7aa7d35483a63eb839b00a07bcc1bd8d5114cb64 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Fri, 7 Jun 2024 15:43:52 -0700
Subject: [PATCH 365/454] Convert SYCL reductions to new interface

---
 src/stream/DOT-Sycl.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/stream/DOT-Sycl.cpp b/src/stream/DOT-Sycl.cpp
index 0475dcc70..8b7ff2314 100644
--- a/src/stream/DOT-Sycl.cpp
+++ b/src/stream/DOT-Sycl.cpp
@@ -75,6 +75,7 @@ void DOT::runSyclVariantImpl(VariantID vid)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
+#if 0 // RDH
       RAJA::ReduceSum<RAJA::sycl_reduce, Real_type> dot(m_dot_init);
 
       RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
@@ -83,6 +84,18 @@ void DOT::runSyclVariantImpl(VariantID vid)
       });
 
       m_dot += static_cast<Real_type>(dot.get());
+#else
+      Real_type tdot = m_dot_init;
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
+        [=] (Index_type i, Real_type& dot) {
+          DOT_BODY;
+        }
+      );
+
+      m_dot += static_cast<Real_type>(tdot);
+#endif
 
     }
     stopTimer();

From 3975b8de8d3f8130d871c678efa6300eed9ac2d0 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 11 Jun 2024 13:25:03 -0700
Subject: [PATCH 366/454] Updating sphinx version

---
 docs/requirements.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 6b1d35172..0a42ee80c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1 +1,3 @@
-docutils<0.20
+docutils
+sphinx==6.2.1
+sphinx-rtd-theme==1.2.2

From 7a1bc40afb9121a3ae32c796eaaf4744628b46c7 Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Thu, 13 Jun 2024 15:29:48 -0700
Subject: [PATCH 367/454] Added RAJA-SYCL variant of FIRST_MIN kernel. Need to
 figure out how to do a "loc" reduction in native SYCL.

---
 src/lcals/CMakeLists.txt     |   1 +
 src/lcals/FIRST_MIN-Sycl.cpp | 100 +++++++++++++++++++++++++++++++++++
 src/lcals/FIRST_MIN.cpp      |   3 ++
 src/lcals/FIRST_MIN.hpp      |   5 ++
 4 files changed, 109 insertions(+)
 create mode 100644 src/lcals/FIRST_MIN-Sycl.cpp

diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt
index 1c0695a53..6fc819b2b 100644
--- a/src/lcals/CMakeLists.txt
+++ b/src/lcals/CMakeLists.txt
@@ -35,6 +35,7 @@ blt_add_library(
   FIRST_MIN-Cuda.cpp
   FIRST_MIN-OMP.cpp
   FIRST_MIN-OMPTarget.cpp
+  FIRST_MIN-Sycl.cpp
   FIRST_SUM.cpp
   FIRST_SUM-Seq.cpp
   FIRST_SUM-Hip.cpp
diff --git a/src/lcals/FIRST_MIN-Sycl.cpp b/src/lcals/FIRST_MIN-Sycl.cpp
new file mode 100644
index 000000000..a79e6f10c
--- /dev/null
+++ b/src/lcals/FIRST_MIN-Sycl.cpp
@@ -0,0 +1,100 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "FIRST_MIN.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "common/SyclDataUtils.hpp"
+
+#include
+
+
+namespace rajaperf
+{
+namespace lcals
+{
+
+template <size_t work_group_size >
+void FIRST_MIN::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  FIRST_MIN_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+#if 0 // RDH
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      define and init reduction...
+
+      qu->submit([&] (sycl::handler& h) {
+
+        h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
+                       pass reduction...,
+                       [=] (sycl::nd_item<1> item, auto& dot) {
+
+          Index_type i = item.get_global_id(0);
+          if (i < iend) {
+            // body
+          }
+
+        });
+      });
+
+      m_minloc = get loc value...
+
+    }
+    stopTimer();
+#endif
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      VL_TYPE tloc(m_xmin_init, m_initloc);
+
+      RAJA::forall< RAJA::sycl_exec<work_group_size, true /*async*/> >(
+        res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
+        [=] (Index_type i, VL_TYPE& loc) {
+          loc.min(x[i], i);
+        }
+      );
+
+      m_minloc = static_cast<Index_type>(tloc.getLoc());
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  FIRST_MIN : Unknown Sycl variant id = " << vid << std::endl;
+  }
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(FIRST_MIN, Sycl)
+
+} // end namespace lcals
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp
index 89b741d88..4cc3e276c 100644
--- a/src/lcals/FIRST_MIN.cpp
+++ b/src/lcals/FIRST_MIN.cpp
@@ -58,6 +58,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params)
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
 
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
+
   setVariantDefined( Kokkos_Lambda );
 }
 
diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp
index ed2ddc286..c6161447f 100644
--- a/src/lcals/FIRST_MIN.hpp
+++ b/src/lcals/FIRST_MIN.hpp
@@ -79,11 +79,13 @@ class FIRST_MIN : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void runKokkosVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
 
   template < size_t block_size, typename MappingHelper >
   void runCudaVariantBase(VariantID vid);
@@ -95,6 +97,9 @@ class FIRST_MIN : public KernelBase
   template < size_t block_size, typename MappingHelper >
   void runHipVariantRAJA(VariantID vid);
 
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
+
 private:
   static const size_t default_gpu_block_size = 256;
   using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;

From bca48a2fe8006c5881d73aaa5d295101edb6e04f Mon Sep 17 00:00:00 2001
From: "Richard D. Hornung"
Date: Fri, 14 Jun 2024 13:25:47 -0700
Subject: [PATCH 368/454] Add base SYCL variant of FIRST_MIN kernel

---
 src/lcals/FIRST_MIN-Sycl.cpp | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/lcals/FIRST_MIN-Sycl.cpp b/src/lcals/FIRST_MIN-Sycl.cpp
index a79e6f10c..616c84dcb 100644
--- a/src/lcals/FIRST_MIN-Sycl.cpp
+++ b/src/lcals/FIRST_MIN-Sycl.cpp
@@ -22,6 +22,15 @@ namespace rajaperf
 namespace lcals
 {
 
+template <typename VAL_TYPE, typename IDX_TYPE>
+struct reduce_pair {
+  bool operator<(const reduce_pair& o) const {
+    return (val < o.val);
+  }
+  VAL_TYPE val;
+  IDX_TYPE idx;
+};
+
 template <size_t work_group_size >
 void FIRST_MIN::runSyclVariantImpl(VariantID vid)
 {
@@ -36,33 +45,42 @@ void FIRST_MIN::runSyclVariantImpl(VariantID vid)
 
   if ( vid == Base_SYCL ) {
 
-#if 0 // RDH
+    using result_type = reduce_pair<Real_type, Index_type>;
+
+    auto result = sycl::malloc_shared< result_type >(1, *qu);
+
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
       const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
 
-      define and init reduction...
+      result_type result_init = { m_xmin_init, m_initloc };
+      *result = result_init;
+      auto reduction_obj = sycl::reduction( result, result_init, sycl::minimum<result_type>() );
 
       qu->submit([&] (sycl::handler& h) {
 
         h.parallel_for(sycl::nd_range<1>(global_size, work_group_size),
-                       pass reduction...,
-                       [=] (sycl::nd_item<1> item, auto& dot) {
+                       reduction_obj,
+                       [=] (sycl::nd_item<1> item, auto& loc) {
 
           Index_type i = item.get_global_id(0);
           if (i < iend) {
-            // body
+            loc.combine( {x[i], i} );
           }
 
         });
+
       });
 
-      m_minloc = get loc value...
+      qu->wait();
+
+      m_minloc = static_cast<Index_type>(result->idx);
 
     }
     stopTimer();
-#endif
+
+    sycl::free(result, *qu);
 
   } else if ( vid == RAJA_SYCL ) {

From fd387c66385df260076a376318707da4c381ff30 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Mon, 17 Jun 2024 10:10:26 -0700
Subject: [PATCH 369/454] Add numhosts metadata

---
 src/common/Executor.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index bd9d43392..19d4f954f 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -130,6 +130,7 @@ Executor::Executor(int argc, char** argv)
   adiak::libraries();
   adiak::cmdline();
   adiak::clustername();
+  adiak::numhosts();
   adiak::value("perfsuite_version", cc.adiak_perfsuite_version);
   adiak::value("raja_version", cc.adiak_raja_version);
   adiak::value("cmake_build_type", cc.adiak_cmake_build_type);

From a79e53e94adce6c7d10ce3dd12dde105e0acc4e6 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Mon, 17 Jun 2024 15:13:14 -0700
Subject: [PATCH 370/454] Add features to Caliper

---
 src/common/KernelBase.cpp | 78 ++++++++++++++++++++++++++++++++++++++-
 src/common/KernelBase.hpp | 12 ++++++
 2 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index a7d70bf84..ac7fcfb1f 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -77,6 +77,48 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                      CALI_ATTR_ASVALUE |
                                      CALI_ATTR_AGGREGATABLE |
                                      CALI_ATTR_SKIP_EVENTS);
+  Forall_attr = cali_create_attribute("Forall", CALI_TYPE_INT,
+                                      CALI_ATTR_ASVALUE |
+                                      CALI_ATTR_AGGREGATABLE |
+                                      CALI_ATTR_SKIP_EVENTS);
+  Kernel_attr = cali_create_attribute("Kernel", CALI_TYPE_INT,
+                                      CALI_ATTR_ASVALUE |
+                                      CALI_ATTR_AGGREGATABLE |
+                                      CALI_ATTR_SKIP_EVENTS);
+  Launch_attr = cali_create_attribute("Launch", CALI_TYPE_INT,
+                                      CALI_ATTR_ASVALUE |
+                                      CALI_ATTR_AGGREGATABLE |
+                                      CALI_ATTR_SKIP_EVENTS);
+
Sort_attr = cali_create_attribute("Sort", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Scan_attr = cali_create_attribute("Scan", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Workgroup_attr = cali_create_attribute("Workgroup", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Reduction_attr = cali_create_attribute("Reduction", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Atomic_attr = cali_create_attribute("Atomic", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + View_attr = cali_create_attribute("View", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + #if defined(RAJA_PERFSUITE_ENABLE_MPI) + MPI_attr = cali_create_attribute("MPI", CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + #endif #endif } @@ -542,6 +584,18 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx) cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep()); cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep()); cali_set_double(BlockSize_attr, getBlockSize()); + cali_set_int(Forall_attr, usesFeature(static_cast(0))); + cali_set_int(Kernel_attr, usesFeature(static_cast(1))); + cali_set_int(Launch_attr, usesFeature(static_cast(2))); + cali_set_int(Sort_attr, usesFeature(static_cast(3))); + cali_set_int(Scan_attr, usesFeature(static_cast(4))); + cali_set_int(Workgroup_attr, usesFeature(static_cast(5))); + cali_set_int(Reduction_attr, usesFeature(static_cast(6))); + cali_set_int(Atomic_attr, usesFeature(static_cast(7))); + cali_set_int(View_attr, usesFeature(static_cast(8))); + #if defined(RAJA_PERFSUITE_ENABLE_MPI) + cali_set_int(MPI_attr, usesFeature(static_cast(9))); + #endif } } @@ -578,7 +632,17 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" }, { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(max#BlockSize)", "as": "BlockSize" } + { "expr": "any(max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(max#Forall)", "as": "FeatureForall" }, + { "expr": "any(max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(max#Sort)", "as": "FeatureSort" }, + { "expr": "any(max#Scan)", "as": "FeatureScan" }, + { "expr": "any(max#Workgroup)", "as": "FeatureWorkgroup" }, + { "expr": "any(max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(max#View)", "as": "FeatureView" }, + { "expr": "any(max#MPI)", "as": "FeatureMPI" } ] }, { @@ -591,7 +655,17 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" }, { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" }, - { "expr": "any(any#max#BlockSize)", "as": "BlockSize" } + { "expr": "any(any#max#BlockSize)", "as": "BlockSize" }, + { "expr": "any(any#max#Forall)", "as": "FeatureForall" }, + { "expr": "any(any#max#Kernel)", "as": "FeatureKernel" }, + { "expr": "any(any#max#Launch)", "as": "FeatureLaunch" }, + { "expr": "any(any#max#Sort)", "as": "FeatureSort" }, + { "expr": "any(any#max#Scan)", "as": "FeatureScan" }, + { "expr": "any(any#max#Workgroup)", "as": "FeatureWorkgroup" }, + { 
"expr": "any(any#max#Reduction)", "as": "FeatureReduction" }, + { "expr": "any(any#max#Atomic)", "as": "FeatureAtomic" }, + { "expr": "any(any#max#View)", "as": "FeatureView" }, + { "expr": "any(any#max#MPI)", "as": "FeatureMPI" } ] } ] diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index c41d33c07..8446d3f04 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -570,6 +570,18 @@ class KernelBase cali_id_t Bytes_Rep_attr; cali_id_t Flops_Rep_attr; cali_id_t BlockSize_attr; + cali_id_t Forall_attr, + Kernel_attr, + Launch_attr, + Sort_attr, + Scan_attr, + Workgroup_attr, + Reduction_attr, + Atomic_attr, + View_attr; + #if defined(RAJA_PERFSUITE_ENABLE_MPI) + cali_id_t MPI_attr; + #endif // we need a Caliper Manager object per variant From b0528ff99652d0f906467510975a9953c10d771a Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 17 Jun 2024 15:18:56 -0700 Subject: [PATCH 371/454] Refactor into map and loops --- src/common/KernelBase.cpp | 65 +++++++-------------------------------- src/common/KernelBase.hpp | 13 +------- 2 files changed, 12 insertions(+), 66 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index ac7fcfb1f..142a07f35 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -77,48 +77,13 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS); - Forall_attr = cali_create_attribute("Forall", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Kernel_attr = cali_create_attribute("Kernel", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Launch_attr = cali_create_attribute("Launch", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Sort_attr = cali_create_attribute("Sort", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Scan_attr = cali_create_attribute("Scan", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Workgroup_attr = cali_create_attribute("Workgroup", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Reduction_attr = cali_create_attribute("Reduction", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - Atomic_attr = cali_create_attribute("Atomic", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - View_attr = cali_create_attribute("View", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - #if defined(RAJA_PERFSUITE_ENABLE_MPI) - MPI_attr = cali_create_attribute("MPI", CALI_TYPE_INT, - CALI_ATTR_ASVALUE | - CALI_ATTR_AGGREGATABLE | - CALI_ATTR_SKIP_EVENTS); - #endif + for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { + std::string feature = getFeatureName(static_cast(i)); + Features[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + } #endif } @@ -584,18 +549,10 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx) cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep()); cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep()); cali_set_double(BlockSize_attr, getBlockSize()); - cali_set_int(Forall_attr, usesFeature(static_cast(0))); - cali_set_int(Kernel_attr, usesFeature(static_cast(1))); - 
cali_set_int(Launch_attr, usesFeature(static_cast(2))); - cali_set_int(Sort_attr, usesFeature(static_cast(3))); - cali_set_int(Scan_attr, usesFeature(static_cast(4))); - cali_set_int(Workgroup_attr, usesFeature(static_cast(5))); - cali_set_int(Reduction_attr, usesFeature(static_cast(6))); - cali_set_int(Atomic_attr, usesFeature(static_cast(7))); - cali_set_int(View_attr, usesFeature(static_cast(8))); - #if defined(RAJA_PERFSUITE_ENABLE_MPI) - cali_set_int(MPI_attr, usesFeature(static_cast(9))); - #endif + for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { + std::string feature = getFeatureName(static_cast(i)); + cali_set_int(Features[feature], usesFeature(static_cast(i))); + } } } diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 8446d3f04..03c75de7f 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -570,18 +570,7 @@ class KernelBase cali_id_t Bytes_Rep_attr; cali_id_t Flops_Rep_attr; cali_id_t BlockSize_attr; - cali_id_t Forall_attr, - Kernel_attr, - Launch_attr, - Sort_attr, - Scan_attr, - Workgroup_attr, - Reduction_attr, - Atomic_attr, - View_attr; - #if defined(RAJA_PERFSUITE_ENABLE_MPI) - cali_id_t MPI_attr; - #endif + std::map Features; // we need a Caliper Manager object per variant From 991f1d7166a69626a0e45d583a44265687e53e9e Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Mon, 17 Jun 2024 15:27:20 -0700 Subject: [PATCH 372/454] Optimize cast --- src/common/KernelBase.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 142a07f35..793fc2274 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -550,8 +550,9 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx) cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep()); cali_set_double(BlockSize_attr, getBlockSize()); for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { - std::string feature = getFeatureName(static_cast(i)); - cali_set_int(Features[feature], usesFeature(static_cast(i))); + FeatureID fid = static_cast(i); + std::string feature = getFeatureName(fid); + cali_set_int(Features[feature], usesFeature(fid)); } } } From cabcbb3afedad19725e63014b4ed4b8be686f521 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Tue, 18 Jun 2024 10:44:32 -0700 Subject: [PATCH 373/454] Refactor into variable --- src/common/KernelBase.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 793fc2274..0c49c83ec 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -78,7 +78,8 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS); for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { - std::string feature = getFeatureName(static_cast(i)); + FeatureID fid = static_cast(i); + std::string feature = getFeatureName(fid); Features[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT, CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | From 07baca41feeaf4040b9c047e1523d02ac075d1cb Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Tue, 18 Jun 2024 10:48:57 -0700 Subject: [PATCH 374/454] Rename map to match other vars --- src/common/KernelBase.cpp | 4 ++-- src/common/KernelBase.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 0c49c83ec..3d0e55302 100644 --- a/src/common/KernelBase.cpp +++ 
b/src/common/KernelBase.cpp
@@ -80,7 +80,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
   for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
     FeatureID fid = static_cast<FeatureID>(i);
     std::string feature = getFeatureName(fid);
-    Features[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT,
+    Feature_attrs[feature] = cali_create_attribute(feature.c_str(), CALI_TYPE_INT,
                                               CALI_ATTR_ASVALUE |
                                               CALI_ATTR_AGGREGATABLE |
                                               CALI_ATTR_SKIP_EVENTS);
@@ -553,7 +553,7 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) {
       FeatureID fid = static_cast<FeatureID>(i);
       std::string feature = getFeatureName(fid);
-      cali_set_int(Features[feature], usesFeature(fid));
+      cali_set_int(Feature_attrs[feature], usesFeature(fid));
     }
   }
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 03c75de7f..4feda83d9 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -570,7 +570,7 @@ class KernelBase
   cali_id_t Bytes_Rep_attr;
   cali_id_t Flops_Rep_attr;
   cali_id_t BlockSize_attr;
-  std::map<std::string, cali_id_t> Features;
+  std::map<std::string, cali_id_t> Feature_attrs;
 
   // we need a Caliper Manager object per variant

From 9d0f5da6c7e78d10dee045248ae96b9db374171b Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Tue, 18 Jun 2024 11:18:15 -0700
Subject: [PATCH 375/454] Undo add numhosts

---
 src/common/Executor.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 19d4f954f..bd9d43392 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -130,7 +130,6 @@ Executor::Executor(int argc, char** argv)
   adiak::libraries();
   adiak::cmdline();
   adiak::clustername();
-  adiak::numhosts();
   adiak::value("perfsuite_version", cc.adiak_perfsuite_version);
   adiak::value("raja_version", cc.adiak_raja_version);
   adiak::value("cmake_build_type", cc.adiak_cmake_build_type);

From d793b3c4b036f66140aec3242167ea9e5c66a95e Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Wed, 19 Jun 2024 11:05:21 -0700
Subject: [PATCH 376/454] Add install of RAJA Perf libs so install process
 works for shared library builds

---
 src/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 70b5b1d4f..2b0cda414 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,7 +20,7 @@ add_subdirectory(stream-kokkos)
 add_subdirectory(algorithm)
 add_subdirectory(comm)
 
-set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS
+set(RAJA_PERFSUITE_LIBS
   common
   apps
   basic
@@ -32,6 +32,7 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS
   stream-kokkos
   algorithm
   comm)
+set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_LIBS})
 list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS})
 
 if(RAJA_ENABLE_TARGET_OPENMP)
@@ -279,4 +280,7 @@ blt_add_executable(
 install( TARGETS raja-perf.exe
          RUNTIME DESTINATION bin
          )
+install( TARGETS ${RAJA_PERFSUITE_LIBS}
+         LIBRARY DESTINATION lib
+         )
 endif()

From dd869a2cbae0ac9a9e1065aeca0f03977b902f52 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 20 Jun 2024 13:56:15 -0700
Subject: [PATCH 377/454] Add Sycl variants of MATVEC_3D

---
 src/apps/CMakeLists.txt     |  1 +
 src/apps/MATVEC_3D-Sycl.cpp | 89 +++++++++++++++++++++++++++++++++++++
 src/apps/MATVEC_3D.cpp      |  3 ++
 src/apps/MATVEC_3D.hpp      |  4 ++
 4 files changed, 97 insertions(+)
 create mode 100644 src/apps/MATVEC_3D-Sycl.cpp

diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
index 022f6971d..09329a5d5 100644
--- a/src/apps/CMakeLists.txt
+++ b/src/apps/CMakeLists.txt
@@ -85,6 +85,7 @@ blt_add_library(
   MATVEC_3D-Cuda.cpp
   MATVEC_3D-OMP.cpp
   MATVEC_3D-OMPTarget.cpp
+  MATVEC_3D-Sycl.cpp
   NODAL_ACCUMULATION_3D.cpp
   NODAL_ACCUMULATION_3D-Seq.cpp
   NODAL_ACCUMULATION_3D-Hip.cpp
diff --git a/src/apps/MATVEC_3D-Sycl.cpp b/src/apps/MATVEC_3D-Sycl.cpp
new file mode 100644
index 000000000..8748cd23a
--- /dev/null
+++ b/src/apps/MATVEC_3D-Sycl.cpp
@@ -0,0 +1,89 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "MATVEC_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_SYCL)
+
+#include "AppsData.hpp"
+
+#include
+
+#include "common/SyclDataUtils.hpp"
+
+namespace rajaperf
+{
+namespace apps
+{
+
+template <size_t work_group_size >
+void MATVEC_3D::runSyclVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = m_domain->n_real_zones;
+
+  auto res{getSyclResource()};
+  auto qu = res.get_queue();
+
+  MATVEC_3D_DATA_SETUP;
+
+  if ( vid == Base_SYCL ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size);
+
+      qu->submit([&] (sycl::handler& h) {
+        h.parallel_for(sycl::nd_range<1> (global_size, work_group_size),
+                       [=] (sycl::nd_item<1> item) {
+
+          Index_type ii = item.get_global_id(0);
+          Index_type i = ii + ibegin;
+          if (i < iend) {
+            MATVEC_3D_BODY_INDEX;
+            MATVEC_3D_BODY;
+          }
+
+        });
+      });
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_SYCL ) {
+
+    RAJA::TypedListSegment<Index_type> zones(real_zones, iend,
+                                             res, RAJA::Unowned);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall<RAJA::sycl_exec<work_group_size, true /*async*/>>(res,
+        zones, [=](Index_type i) {
+        MATVEC_3D_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    std::cout << "\n  MATVEC_3D : Unknown Sycl variant id = " << vid << std::endl;
+  }
+
+}
+
+RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Sycl)
+
+} // end namespace apps
+} // end namespace rajaperf
+
+#endif // RAJA_ENABLE_SYCL
diff --git a/src/apps/MATVEC_3D.cpp b/src/apps/MATVEC_3D.cpp
index f0872d3e8..6e1ab8a0e 100644
--- a/src/apps/MATVEC_3D.cpp
+++ b/src/apps/MATVEC_3D.cpp
@@ -100,6 +100,9 @@ MATVEC_3D::MATVEC_3D(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 MATVEC_3D::~MATVEC_3D()
diff --git a/src/apps/MATVEC_3D.hpp b/src/apps/MATVEC_3D.hpp
index 650acba93..65908d0dd 100644
--- a/src/apps/MATVEC_3D.hpp
+++ b/src/apps/MATVEC_3D.hpp
@@ -136,14 +136,18 @@ class MATVEC_3D : public KernelBase
   void runCudaVariant(VariantID vid, size_t tune_idx);
   void runHipVariant(VariantID vid, size_t tune_idx);
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
+  void runSyclVariant(VariantID vid, size_t tune_idx);
 
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
+  void setSyclTuningDefinitions(VariantID vid);
 
   template < size_t block_size >
   void runCudaVariantImpl(VariantID vid);
   template < size_t block_size >
   void runHipVariantImpl(VariantID vid);
+  template < size_t work_group_size >
+  void runSyclVariantImpl(VariantID vid);
 
 private:
   static const size_t default_gpu_block_size = 256;
   using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;

From
91d9838bb2d50fe2de825d423956c36fbb27cc63 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 21 Jun 2024 12:51:19 -0700 Subject: [PATCH 378/454] Rename MATVEC_3D to MATVEC_3D_STENCIL --- src/CMakeLists.txt | 6 +++--- src/apps/CMakeLists.txt | 14 +++++++------- ..._3D-Cuda.cpp => MATVEC_3D_STENCIL-Cuda.cpp} | 16 ++++++++-------- ...EC_3D-Hip.cpp => MATVEC_3D_STENCIL-Hip.cpp} | 16 ++++++++-------- ...EC_3D-OMP.cpp => MATVEC_3D_STENCIL-OMP.cpp} | 18 +++++++++--------- ...get.cpp => MATVEC_3D_STENCIL-OMPTarget.cpp} | 14 +++++++------- ...EC_3D-Seq.cpp => MATVEC_3D_STENCIL-Seq.cpp} | 18 +++++++++--------- ..._3D-Sycl.cpp => MATVEC_3D_STENCIL-Sycl.cpp} | 16 ++++++++-------- .../{MATVEC_3D.cpp => MATVEC_3D_STENCIL.cpp} | 14 +++++++------- .../{MATVEC_3D.hpp => MATVEC_3D_STENCIL.hpp} | 18 +++++++++--------- src/common/RAJAPerfSuite.cpp | 8 ++++---- src/common/RAJAPerfSuite.hpp | 2 +- 12 files changed, 80 insertions(+), 80 deletions(-) rename src/apps/{MATVEC_3D-Cuda.cpp => MATVEC_3D_STENCIL-Cuda.cpp} (93%) rename src/apps/{MATVEC_3D-Hip.cpp => MATVEC_3D_STENCIL-Hip.cpp} (93%) rename src/apps/{MATVEC_3D-OMP.cpp => MATVEC_3D_STENCIL-OMP.cpp} (81%) rename src/apps/{MATVEC_3D-OMPTarget.cpp => MATVEC_3D_STENCIL-OMPTarget.cpp} (83%) rename src/apps/{MATVEC_3D-Seq.cpp => MATVEC_3D_STENCIL-Seq.cpp} (79%) rename src/apps/{MATVEC_3D-Sycl.cpp => MATVEC_3D_STENCIL-Sycl.cpp} (81%) rename src/apps/{MATVEC_3D.cpp => MATVEC_3D_STENCIL.cpp} (93%) rename src/apps/{MATVEC_3D.hpp => MATVEC_3D_STENCIL.hpp} (94%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bd1bba26f..19e4a11f5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,9 +72,9 @@ blt_add_executable( apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp apps/MASS3DPA-OMPTarget.cpp - apps/MATVEC_3D.cpp - apps/MATVEC_3D-Seq.cpp - apps/MATVEC_3D-OMPTarget.cpp + apps/MATVEC_3D_STENCIL.cpp + apps/MATVEC_3D_STENCIL-Seq.cpp + apps/MATVEC_3D_STENCIL-OMPTarget.cpp apps/NODAL_ACCUMULATION_3D.cpp apps/NODAL_ACCUMULATION_3D-Seq.cpp apps/NODAL_ACCUMULATION_3D-OMPTarget.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 09329a5d5..4a0584e96 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -79,13 +79,13 @@ blt_add_library( MASS3DPA-OMP.cpp MASS3DPA-OMPTarget.cpp MASS3DPA-Sycl.cpp - MATVEC_3D.cpp - MATVEC_3D-Seq.cpp - MATVEC_3D-Hip.cpp - MATVEC_3D-Cuda.cpp - MATVEC_3D-OMP.cpp - MATVEC_3D-OMPTarget.cpp - MATVEC_3D-Sycl.cpp + MATVEC_3D_STENCIL.cpp + MATVEC_3D_STENCIL-Seq.cpp + MATVEC_3D_STENCIL-Hip.cpp + MATVEC_3D_STENCIL-Cuda.cpp + MATVEC_3D_STENCIL-OMP.cpp + MATVEC_3D_STENCIL-OMPTarget.cpp + MATVEC_3D_STENCIL-Sycl.cpp NODAL_ACCUMULATION_3D.cpp NODAL_ACCUMULATION_3D-Seq.cpp NODAL_ACCUMULATION_3D-Hip.cpp diff --git a/src/apps/MATVEC_3D-Cuda.cpp b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp similarity index 93% rename from src/apps/MATVEC_3D-Cuda.cpp rename to src/apps/MATVEC_3D_STENCIL-Cuda.cpp index 413cfc3ee..e5ef75f08 100644 --- a/src/apps/MATVEC_3D-Cuda.cpp +++ b/src/apps/MATVEC_3D_STENCIL-Cuda.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -86,14 +86,14 @@ __global__ void matvec_3d(Real_ptr b, Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } } template < size_t block_size > -void 
MATVEC_3D::runCudaVariantImpl(VariantID vid) +void MATVEC_3D_STENCIL::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -101,7 +101,7 @@ void MATVEC_3D::runCudaVariantImpl(VariantID vid) auto res{getCudaResource()}; - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; if ( vid == Base_CUDA ) { @@ -185,18 +185,18 @@ void MATVEC_3D::runCudaVariantImpl(VariantID vid) RAJA::forall< RAJA::cuda_exec >( res, zones, [=] __device__ (Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }); } stopTimer(); } else { - getCout() << "\n MATVEC_3D : Unknown Cuda variant id = " << vid << std::endl; + getCout() << "\n MATVEC_3D_STENCIL : Unknown Cuda variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Cuda) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Cuda) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D-Hip.cpp b/src/apps/MATVEC_3D_STENCIL-Hip.cpp similarity index 93% rename from src/apps/MATVEC_3D-Hip.cpp rename to src/apps/MATVEC_3D_STENCIL-Hip.cpp index 4793d1c98..a24757cb8 100644 --- a/src/apps/MATVEC_3D-Hip.cpp +++ b/src/apps/MATVEC_3D_STENCIL-Hip.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -86,14 +86,14 @@ __global__ void matvec_3d(Real_ptr b, Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; Index_type i = ii + ibegin; if (i < iend) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } } template < size_t block_size > -void MATVEC_3D::runHipVariantImpl(VariantID vid) +void MATVEC_3D_STENCIL::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -101,7 +101,7 @@ void MATVEC_3D::runHipVariantImpl(VariantID vid) auto res{getHipResource()}; - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; if ( vid == Base_HIP ) { @@ -185,18 +185,18 @@ void MATVEC_3D::runHipVariantImpl(VariantID vid) RAJA::forall< RAJA::hip_exec >( res, zones, [=] __device__ (Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }); } stopTimer(); } else { - getCout() << "\n MATVEC_3D : Unknown Hip variant id = " << vid << std::endl; + getCout() << "\n MATVEC_3D_STENCIL : Unknown Hip variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Hip) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Hip) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D-OMP.cpp b/src/apps/MATVEC_3D_STENCIL-OMP.cpp similarity index 81% rename from src/apps/MATVEC_3D-OMP.cpp rename to src/apps/MATVEC_3D_STENCIL-OMP.cpp index dd2f33916..6365aed70 100644 --- a/src/apps/MATVEC_3D-OMP.cpp +++ b/src/apps/MATVEC_3D_STENCIL-OMP.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -20,7 +20,7 @@ namespace apps { -void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void MATVEC_3D_STENCIL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -28,7 +28,7 @@ void MATVEC_3D::runOpenMPVariant(VariantID vid, 
size_t RAJAPERF_UNUSED_ARG(tune_ const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; switch ( vid ) { @@ -40,8 +40,8 @@ void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #pragma omp parallel for for (Index_type ii = ibegin ; ii < iend ; ++ii ) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } } @@ -53,8 +53,8 @@ void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case Lambda_OpenMP : { auto matvec_3d_lam = [=](Index_type ii) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; }; startTimer(); @@ -78,7 +78,7 @@ void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ working_res, RAJA::Unowned); auto matvec_3d_lam = [=](Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }; startTimer(); @@ -94,7 +94,7 @@ void MATVEC_3D::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ } default : { - getCout() << "\n MATVEC_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/MATVEC_3D-OMPTarget.cpp b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp similarity index 83% rename from src/apps/MATVEC_3D-OMPTarget.cpp rename to src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp index 5be2e1e22..1c10d27f0 100644 --- a/src/apps/MATVEC_3D-OMPTarget.cpp +++ b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -29,13 +29,13 @@ namespace apps const size_t threads_per_team = 256; -void MATVEC_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void MATVEC_3D_STENCIL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; if ( vid == Base_OpenMPTarget ) { @@ -46,8 +46,8 @@ void MATVEC_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG vol, real_zones) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type ii = ibegin ; ii < iend ; ++ii ) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } } @@ -64,14 +64,14 @@ void MATVEC_3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG RAJA::forall>( zones, [=](Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }); } stopTimer(); } else { - getCout() << "\n MATVEC_3D : Unknown OMP Target variant id = " << vid << std::endl; + getCout() << "\n MATVEC_3D_STENCIL : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/apps/MATVEC_3D-Seq.cpp b/src/apps/MATVEC_3D_STENCIL-Seq.cpp similarity index 79% rename from src/apps/MATVEC_3D-Seq.cpp rename to src/apps/MATVEC_3D_STENCIL-Seq.cpp index fcb6a6f54..795a01e19 100644 --- a/src/apps/MATVEC_3D-Seq.cpp +++ b/src/apps/MATVEC_3D_STENCIL-Seq.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include 
"RAJA/RAJA.hpp" @@ -20,13 +20,13 @@ namespace apps { -void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void MATVEC_3D_STENCIL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = m_domain->n_real_zones; - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; switch ( vid ) { @@ -36,8 +36,8 @@ void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type ii = ibegin ; ii < iend ; ++ii ) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } } @@ -50,8 +50,8 @@ void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx case Lambda_Seq : { auto matvec_3d_lam = [=](Index_type ii) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; }; startTimer(); @@ -74,7 +74,7 @@ void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx working_res, RAJA::Unowned); auto matvec_3d_lam = [=](Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }; startTimer(); @@ -90,7 +90,7 @@ void MATVEC_3D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx #endif // RUN_RAJA_SEQ default : { - getCout() << "\n MATVEC_3D : Unknown variant id = " << vid << std::endl; + getCout() << "\n MATVEC_3D_STENCIL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/MATVEC_3D-Sycl.cpp b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp similarity index 81% rename from src/apps/MATVEC_3D-Sycl.cpp rename to src/apps/MATVEC_3D_STENCIL-Sycl.cpp index 8748cd23a..c6110f2d8 100644 --- a/src/apps/MATVEC_3D-Sycl.cpp +++ b/src/apps/MATVEC_3D_STENCIL-Sycl.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -24,7 +24,7 @@ namespace apps { template -void MATVEC_3D::runSyclVariantImpl(VariantID vid) +void MATVEC_3D_STENCIL::runSyclVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -33,7 +33,7 @@ void MATVEC_3D::runSyclVariantImpl(VariantID vid) auto res{getSyclResource()}; auto qu = res.get_queue(); - MATVEC_3D_DATA_SETUP; + MATVEC_3D_STENCIL_DATA_SETUP; if ( vid == Base_SYCL ) { @@ -49,8 +49,8 @@ void MATVEC_3D::runSyclVariantImpl(VariantID vid) Index_type ii = item.get_global_id(0); Index_type i = ii + ibegin; if (i < iend) { - MATVEC_3D_BODY_INDEX; - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY_INDEX; + MATVEC_3D_STENCIL_BODY; } }); @@ -69,19 +69,19 @@ void MATVEC_3D::runSyclVariantImpl(VariantID vid) RAJA::forall>(res, zones, [=](Index_type i) { - MATVEC_3D_BODY; + MATVEC_3D_STENCIL_BODY; }); } stopTimer(); } else { - std::cout << "\n MATVEC_3D : Unknown Sycl variant id = " << vid << std::endl; + std::cout << "\n MATVEC_3D_STENCIL : Unknown Sycl variant id = " << vid << std::endl; } } -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D, Sycl) +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MATVEC_3D_STENCIL, Sycl) } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D.cpp b/src/apps/MATVEC_3D_STENCIL.cpp similarity index 93% rename from src/apps/MATVEC_3D.cpp rename to src/apps/MATVEC_3D_STENCIL.cpp index 6e1ab8a0e..95215e90d 100644 --- a/src/apps/MATVEC_3D.cpp +++ 
b/src/apps/MATVEC_3D_STENCIL.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "MATVEC_3D.hpp" +#include "MATVEC_3D_STENCIL.hpp" #include "RAJA/RAJA.hpp" @@ -22,8 +22,8 @@ namespace apps { -MATVEC_3D::MATVEC_3D(const RunParams& params) - : KernelBase(rajaperf::Apps_MATVEC_3D, params) +MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) + : KernelBase(rajaperf::Apps_MATVEC_3D_STENCIL, params) { setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); @@ -105,12 +105,12 @@ MATVEC_3D::MATVEC_3D(const RunParams& params) setVariantDefined( RAJA_SYCL ); } -MATVEC_3D::~MATVEC_3D() +MATVEC_3D_STENCIL::~MATVEC_3D_STENCIL() { delete m_domain; } -void MATVEC_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void MATVEC_3D_STENCIL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { allocAndInitDataConst(m_b, m_zonal_array_length, 0.0, vid); allocAndInitData(m_x, m_zonal_array_length, vid); @@ -154,12 +154,12 @@ void MATVEC_3D::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } -void MATVEC_3D::updateChecksum(VariantID vid, size_t tune_idx) +void MATVEC_3D_STENCIL::updateChecksum(VariantID vid, size_t tune_idx) { checksum[vid].at(tune_idx) += calcChecksum(m_b, m_zonal_array_length, checksum_scale_factor , vid); } -void MATVEC_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { (void) vid; diff --git a/src/apps/MATVEC_3D.hpp b/src/apps/MATVEC_3D_STENCIL.hpp similarity index 94% rename from src/apps/MATVEC_3D.hpp rename to src/apps/MATVEC_3D_STENCIL.hpp index 65908d0dd..a537e7149 100644 --- a/src/apps/MATVEC_3D.hpp +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -7,7 +7,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// -/// MATVEC_3D kernel reference implementation: +/// MATVEC_3D_STENCIL kernel reference implementation: /// /// for (Index_type ii = ibegin; ii < iend; ++ii ) { /// Index_type i = real_zones[ii]; @@ -27,10 +27,10 @@ /// } /// -#ifndef RAJAPerf_Apps_MATVEC_3D_HPP -#define RAJAPerf_Apps_MATVEC_3D_HPP +#ifndef RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP +#define RAJAPerf_Apps_MATVEC_3D_STENCIL_HPP -#define MATVEC_3D_DATA_SETUP \ +#define MATVEC_3D_STENCIL_DATA_SETUP \ Real_ptr b = m_b; \ \ Real_ptr xdbl = m_x - m_domain->kp - m_domain->jp - 1 ; \ @@ -91,10 +91,10 @@ \ Index_ptr real_zones = m_real_zones; -#define MATVEC_3D_BODY_INDEX \ +#define MATVEC_3D_STENCIL_BODY_INDEX \ Index_type i = real_zones[ii]; -#define MATVEC_3D_BODY \ +#define MATVEC_3D_STENCIL_BODY \ b[i] = dbl[i] * xdbl[i] + dbc[i] * xdbc[i] + dbr[i] * xdbr[i] + \ dcl[i] * xdcl[i] + dcc[i] * xdcc[i] + dcr[i] * xdcr[i] + \ dfl[i] * xdfl[i] + dfc[i] * xdfc[i] + dfr[i] * xdfr[i] + \ @@ -119,13 +119,13 @@ namespace apps { class ADomain; -class MATVEC_3D : public KernelBase +class MATVEC_3D_STENCIL : public KernelBase { public: - MATVEC_3D(const RunParams& params); + MATVEC_3D_STENCIL(const RunParams& params); - ~MATVEC_3D(); + ~MATVEC_3D_STENCIL(); void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 8316050d6..3a79da87f 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -90,7 +90,7 @@ #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DEA.hpp" #include "apps/MASS3DPA.hpp" 
-#include "apps/MATVEC_3D.hpp" +#include "apps/MATVEC_3D_STENCIL.hpp" #include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" @@ -242,7 +242,7 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), - std::string("Apps_MATVEC_3D"), + std::string("Apps_MATVEC_3D_STENCIL"), std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), @@ -996,8 +996,8 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } - case Apps_MATVEC_3D : { - kernel = new apps::MATVEC_3D(run_params); + case Apps_MATVEC_3D_STENCIL : { + kernel = new apps::MATVEC_3D_STENCIL(run_params); break; } case Apps_NODAL_ACCUMULATION_3D : { diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index f3d8a5041..20380d08e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -150,7 +150,7 @@ enum KernelID { Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, - Apps_MATVEC_3D, + Apps_MATVEC_3D_STENCIL, Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_VOL3D, From 4503638e159df76d9106d684b76b0272a85bde6c Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Thu, 27 Jun 2024 11:28:15 -0700 Subject: [PATCH 379/454] Correct usage of arg to reflect name --- src/common/KernelBase.cpp | 15 ++++++++------- src/common/KernelBase.hpp | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 3d0e55302..132552aab 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -568,7 +568,7 @@ void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx) void KernelBase::setCaliperMgrVariantTuning(VariantID vid, std::string tstr, const std::string& outdir, - const std::string& addToConfig) + const std::string& addToSpotConfig) { static bool ran_spot_config_check = false; bool config_ok = true; @@ -631,13 +631,13 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, } )json"; - if(!ran_spot_config_check && (!addToConfig.empty())) { + if(!ran_spot_config_check && (!addToSpotConfig.empty())) { cali::ConfigManager cm; - std::string check_profile = "spot()," + addToConfig; + std::string check_profile = "spot(" + addToSpotConfig + ")"; std::string msg = cm.check(check_profile.c_str()); if(!msg.empty()) { std::cerr << "Problem with Cali Config: " << check_profile << "\n"; - std::cerr << "Check your command line argument: " << addToConfig << "\n"; + std::cerr << "Check your command line argument: " << addToSpotConfig << "\n"; config_ok = false; exit(-1); } @@ -653,10 +653,11 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, od = outdir + "/"; } std::string vstr = getVariantName(vid); - std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali)"; - if(!addToConfig.empty()) { - profile += "," + addToConfig; + std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali"; + if(!addToSpotConfig.empty()) { + profile += "," + addToSpotConfig; } + profile += ")"; std::cout << "Profile: " << profile << std::endl; mgr[vid][tstr].add_option_spec(kernel_info_spec); mgr[vid][tstr].set_default_parameter("rajaperf_kernel_info", "true"); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 4feda83d9..53954a78a 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -488,7 +488,7 @@ class KernelBase static void setCaliperMgrVariantTuning(VariantID vid, 
                                          std::string tstr,
                                          const std::string& outdir,
-                                         const std::string& addToConfig);
+                                         const std::string& addToSpotConfig);

   static void setCaliperMgrStart(VariantID vid, std::string tstr) { mgr[vid][tstr].start(); }
   static void setCaliperMgrStop(VariantID vid, std::string tstr) { mgr[vid][tstr].stop(); }

From eb31ebc221dd270f9df0ff74db36738048b1f9d5 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Thu, 27 Jun 2024 12:12:47 -0700
Subject: [PATCH 380/454] Add new argument for adjusting the Caliper config

---
 src/common/Executor.cpp   |  3 ++-
 src/common/KernelBase.cpp | 13 ++++++++++---
 src/common/KernelBase.hpp |  3 ++-
 src/common/RunParams.cpp  | 17 ++++++++++++++++-
 src/common/RunParams.hpp  |  2 ++
 5 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index bd9d43392..78a3b83a5 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -331,7 +331,8 @@ void Executor::setupSuite()
       KernelBase::setCaliperMgrVariantTuning(vid, tstr,
                                              run_params.getOutputDirName(),
-                                             run_params.getAddToSpotConfig());
+                                             run_params.getAddToSpotConfig(),
+                                             run_params.getAddToCaliperConfig());
 #endif
       }

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 132552aab..248200e71 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -568,7 +568,8 @@ void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
 void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
                                             std::string tstr,
                                             const std::string& outdir,
-                                            const std::string& addToSpotConfig)
+                                            const std::string& addToSpotConfig,
+                                            const std::string& addToCaliConfig)
 {
   static bool ran_spot_config_check = false;
   bool config_ok = true;
@@ -631,13 +632,16 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
 }
 )json";

-  if(!ran_spot_config_check && (!addToSpotConfig.empty())) {
+  if(!ran_spot_config_check && ((!addToSpotConfig.empty()) || (!addToCaliConfig.empty()))) {
     cali::ConfigManager cm;
     std::string check_profile = "spot(" + addToSpotConfig + ")";
+    if (!addToCaliConfig.empty()) {
+      check_profile += "," + addToCaliConfig;
+    }
     std::string msg = cm.check(check_profile.c_str());
     if(!msg.empty()) {
       std::cerr << "Problem with Cali Config: " << check_profile << "\n";
-      std::cerr << "Check your command line argument: " << addToSpotConfig << "\n";
+      std::cerr << msg << "\n";
       config_ok = false;
       exit(-1);
     }
@@ -658,6 +662,9 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid,
     profile += "," + addToSpotConfig;
   }
   profile += ")";
+  if (!addToCaliConfig.empty()) {
+    profile += "," + addToCaliConfig;
+  }
   std::cout << "Profile: " << profile << std::endl;
   mgr[vid][tstr].add_option_spec(kernel_info_spec);
   mgr[vid][tstr].set_default_parameter("rajaperf_kernel_info", "true");
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 53954a78a..e40334290 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -488,7 +488,8 @@ class KernelBase
   static void setCaliperMgrVariantTuning(VariantID vid,
                                          std::string tstr,
                                          const std::string& outdir,
-                                         const std::string& addToSpotConfig);
+                                         const std::string& addToSpotConfig,
+                                         const std::string& addToCaliConfig);

   static void setCaliperMgrStart(VariantID vid, std::string tstr) { mgr[vid][tstr].start(); }
   static void setCaliperMgrStop(VariantID vid, std::string tstr) { mgr[vid][tstr].stop(); }
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp
index 3b27c5685..6376b765f 100644
--- a/src/common/RunParams.cpp
+++ b/src/common/RunParams.cpp
@@ -915,6 +915,17 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
           add_to_spot_config = std::string( argv[i] );
         }
       }
+    } else if ( std::string(argv[i]) == std::string("--add-to-cali-config") ||
+                std::string(argv[i]) == std::string("-atcc") ) {
+      i++;
+      if ( i < argc ) {
+        opt = std::string(argv[i]);
+        if ( opt.at(0) == '-' ) {
+          i--;
+        } else {
+          add_to_cali_config = std::string( argv[i] );
+        }
+      }
 #endif

     } else {
@@ -1337,9 +1348,13 @@ void RunParams::printHelpMessage(std::ostream& str) const
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   str << "\t --add-to-spot-config, -atsc [Default is none]\n"
-      << "\t\t appends additional parameters to the built-in Caliper spot config\n";
+      << "\t\t appends additional parameters to the built-in Caliper spot config (CALI_CONFIG=spot(...))\n";
   str << "\t\t Example to include some PAPI counters (Intel arch)\n"
      << "\t\t -atsc topdown.all\n\n";
+  str << "\t --add-to-cali-config, -atcc [Default is none]\n"
+      << "\t\t include parameters in the Caliper config (same as CALI_CONFIG=...)\n";
+  str << "\t\t Example to include time spent in MPI functions\n"
+     << "\t\t -atcc mpi-report\n\n";
 #endif

   str << std::endl;
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp
index 85e6c8a65..8c24bea1c 100644
--- a/src/common/RunParams.hpp
+++ b/src/common/RunParams.hpp
@@ -200,6 +200,7 @@ class RunParams {

 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   const std::string& getAddToSpotConfig() const { return add_to_spot_config; }
+  const std::string& getAddToCaliperConfig() const { return add_to_cali_config; }
 #endif

   bool getDisableWarmup() const { return disable_warmup; }
@@ -324,6 +325,7 @@ class RunParams {

 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   std::string add_to_spot_config;
+  std::string add_to_cali_config;
 #endif

   bool disable_warmup;

From ab33f9ac0e56fa3579197358f95ed99ba0b0f32e Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Thu, 27 Jun 2024 14:08:26 -0700
Subject: [PATCH 381/454] Set release version number

---
 CMakeLists.txt | 2 +-
 docs/conf.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2ffc3850..b9d0bd3c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ if (ENABLE_OPENMP)
 endif ()

 set(RAJA_PERFSUITE_VERSION_MAJOR 2024)
-set(RAJA_PERFSUITE_VERSION_MINOR 02)
+set(RAJA_PERFSUITE_VERSION_MINOR 07)
 set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0)

 set(RAJA_PERFSUITE_DEPENDS RAJA)
diff --git a/docs/conf.py b/docs/conf.py
index 072637997..ee3729c8f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -86,9 +86,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'2024.02'
+version = u'2024.07'
 # The full version, including alpha/beta/rc tags.
-release = u'2024.02.0'
+release = u'2024.07.0'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
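Aside on the two Caliper options above: they compose differently. The -atsc
argument is spliced inside the spot(...) spec, while the -atcc argument is
appended after it, exactly as a raw CALI_CONFIG string would be. A minimal,
self-contained sketch of the resulting profile string, mirroring the logic in
KernelBase::setCaliperMgrVariantTuning (the output file name here is
illustrative, not from the patches):

    #include <iostream>
    #include <string>

    int main()
    {
      // Values as they would arrive from --add-to-spot-config / -atsc and
      // --add-to-cali-config / -atcc (example arguments from the help text).
      std::string addToSpotConfig = "topdown.all";
      std::string addToCaliConfig = "mpi-report";

      // Same composition order as setCaliperMgrVariantTuning():
      // spot options go inside spot(...), cali options go after it.
      std::string profile = "spot(output=RAJA_CUDA-default.cali";  // illustrative name
      if (!addToSpotConfig.empty()) { profile += "," + addToSpotConfig; }
      profile += ")";
      if (!addToCaliConfig.empty()) { profile += "," + addToCaliConfig; }

      // Prints: spot(output=RAJA_CUDA-default.cali,topdown.all),mpi-report
      std::cout << profile << std::endl;
      return 0;
    }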
From 09797ba44b1cd33ccf0367f52aa28a434f0c3b39 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Thu, 27 Jun 2024 14:47:39 -0700
Subject: [PATCH 382/454] Pass MPI_COMM_WORLD to adiak if MPI enabled and
 collect all implicit adiak routines

---
 src/common/Executor.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 78a3b83a5..56491b2e8 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -124,12 +124,13 @@ Executor::Executor(int argc, char** argv)
 {
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   configuration cc;
-  adiak::init(NULL);
-  adiak::user();
-  adiak::launchdate();
-  adiak::libraries();
-  adiak::cmdline();
-  adiak::clustername();
+  #if defined(RAJA_PERFSUITE_ENABLE_MPI)
+  MPI_Comm adiak_comm = MPI_COMM_WORLD;
+  adiak::init(&adiak_comm);
+  #else
+  adiak::init(nullptr);
+  #endif
+  adiak::collect_all();
   adiak::value("perfsuite_version", cc.adiak_perfsuite_version);
   adiak::value("raja_version", cc.adiak_raja_version);
   adiak::value("cmake_build_type", cc.adiak_cmake_build_type);

From 8904d689b948625b95aec861b8e8224944f54853 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Thu, 27 Jun 2024 14:49:12 -0700
Subject: [PATCH 383/454] Add mpi caliper build scripts

---
 scripts/lc-builds/toss4_clang-mpi_caliper.sh | 60 +++++++++++++++++++
 scripts/lc-builds/toss4_gcc-mpi_caliper.sh   | 61 ++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100755 scripts/lc-builds/toss4_clang-mpi_caliper.sh
 create mode 100755 scripts/lc-builds/toss4_gcc-mpi_caliper.sh

diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh
new file mode 100755
index 000000000..d3f4eb4bf
--- /dev/null
+++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 3 ]]; then
+  echo
+  echo "You must pass 3 arguments to the script (in this order): "
+  echo "   1) compiler version number"
+  echo "   2) path to caliper cmake directory"
+  echo "   3) path to adiak cmake directory"
+  echo
+  echo "For example: "
+  echo "    toss4_clang-mpi_caliper.sh 14.0.6 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak"
+  exit
+fi
+
+COMP_VER=$1
+CALI_DIR=$2
+ADIAK_DIR=$3
+shift 3
+
+BUILD_SUFFIX=lc_toss4-clang-mpi-${COMP_VER}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.23.1
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++14 \
+  -C ${RAJA_HOSTCONFIG} \
+  -DENABLE_MPI=ON \
+  -DENABLE_OPENMP=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  -DRAJA_PERFSUITE_USE_CALIPER=ON \
+  -Dcaliper_DIR=${CALI_DIR} \
+  -Dadiak_DIR=${ADIAK_DIR} \
+  -DCMAKE_C_FLAGS="-g -O0" \
+  -DCMAKE_CXX_FLAGS="-g -O0" \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
+echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh
new file mode 100755
index 000000000..62389ea73
--- /dev/null
+++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 3 ]]; then
+  echo
+  echo "You must pass 3 arguments to the script (in this order): "
+  echo "   1) compiler version number"
+  echo "   2) path to caliper cmake directory"
+  echo "   3) path to adiak cmake directory"
+  echo
+  echo "For example: "
+  echo "    toss4_gcc-mpi_caliper.sh 10.3.1 /usr/workspace/wsb/asde/caliper-quartz/share/cmake/caliper /usr/workspace/wsb/asde/caliper-quartz/lib/cmake/adiak"
+  exit
+fi
+
+COMP_VER=$1
+CALI_DIR=$2
+ADIAK_DIR=$3
+shift 3
+
+BUILD_SUFFIX=lc_toss4-gcc-mpi-${COMP_VER}
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/gcc_X.cmake
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.23.1
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
+  -C ${RAJA_HOSTCONFIG} \
+  -DENABLE_MPI=ON \
+  -DENABLE_OPENMP=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  -DRAJA_PERFSUITE_USE_CALIPER=ON \
+  -Dcaliper_DIR=${CALI_DIR} \
+  -Dadiak_DIR=${ADIAK_DIR} \
+  -DCMAKE_C_FLAGS="-g -O0" \
+  -DCMAKE_CXX_FLAGS="-g -O0" \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
+echo "***********************************************************************"

From d383c58450249c00cafcc5f3cc4aab09da6f8605 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey
Date: Thu, 27 Jun 2024 14:49:26 -0700
Subject: [PATCH 384/454] Update build script

---
 scripts/lc-builds/toss4_clang_caliper.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh
index 588405a03..89ece7b23 100755
--- a/scripts/lc-builds/toss4_clang_caliper.sh
+++ b/scripts/lc-builds/toss4_clang_caliper.sh
@@ -25,7 +25,7 @@ ADIAK_DIR=$3
 shift 3

 BUILD_SUFFIX=lc_toss4-clang-${COMP_VER}
-RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss4/clang_X.cmake

 echo
 echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"

From 2c0be2a5a480690db45b383c5de3d3eeec4ea0a8 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 28 Jun 2024 11:54:25 -0700
Subject: [PATCH 385/454] Get multi reduce seq and omp kernels working

---
 src/algorithm/HISTOGRAM-Cuda.cpp | 18 +++++-----
 src/algorithm/HISTOGRAM-Hip.cpp  | 18 +++++-----
 src/algorithm/HISTOGRAM-OMP.cpp  | 22 ++++++++----
 src/algorithm/HISTOGRAM-Seq.cpp  | 22 ++++++++----
 src/algorithm/HISTOGRAM.hpp      | 58 +++++++++++++++++++-------------
 src/basic/MULTI_REDUCE-Cuda.cpp  |  2 +-
 src/basic/MULTI_REDUCE-Hip.cpp   |  2 +-
 src/basic/MULTI_REDUCE-OMP.cpp   | 14 ++++++--
 src/basic/MULTI_REDUCE-Seq.cpp   | 14 ++++++--
 src/basic/MULTI_REDUCE.hpp       | 43 ++++++++++++-----------
 10 files changed, 130 insertions(+), 83 deletions(-)

diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp
index 6e90caac7..161bee82b 100644
--- a/src/algorithm/HISTOGRAM-Cuda.cpp
+++ b/src/algorithm/HISTOGRAM-Cuda.cpp
@@ -154,7 +154,7 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid)

   auto res{getCudaResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1);

@@ -199,7 +199,7 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid)
                                 stream));

       RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, 1);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1);

     }
     stopTimer();
@@ -224,7 +224,7 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid)

   auto res{getCudaResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication);

@@ -246,7 +246,7 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid)
                          iend );

       RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -272,7 +272,7 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid)
           ibegin, iend, histogram_lambda );

       RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -290,7 +290,7 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid)
       });

       RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -311,7 +311,7 @@ void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid)

   auto res{getCudaResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication);

@@ -334,7 +334,7 @@ void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid)
                          iend );

       RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -356,7 +356,7 @@ void HISTOGRAM::runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid)

   auto res{getCudaResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   static constexpr size_t block_size = info.block_size;
   const size_t grid_size = info.grid_size;
diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp
index 32ee0b686..4b7d00a96 100644
--- a/src/algorithm/HISTOGRAM-Hip.cpp
+++ b/src/algorithm/HISTOGRAM-Hip.cpp
@@ -163,7 +163,7 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid)

   auto res{getHipResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 1);

@@ -232,7 +232,7 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid)
 #endif

       RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, 1);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, 1);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, 1);

     }
     stopTimer();
@@ -257,7 +257,7 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid)

   auto res{getHipResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication);

@@ -279,7 +279,7 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid)
                          iend );

       RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -305,7 +305,7 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid)
           ibegin, iend, histogram_lambda );

       RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -323,7 +323,7 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid)
       });

       RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -344,7 +344,7 @@ void HISTOGRAM::runHipVariantAtomicShared(VariantID vid)

   auto res{getHipResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication);

@@ -367,7 +367,7 @@ void HISTOGRAM::runHipVariantAtomicShared(VariantID vid)
                          iend );

       RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication);
-      HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, global_replication);
+      HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication);

     }
     stopTimer();
@@ -389,7 +389,7 @@ void HISTOGRAM::runHipVariantAtomicRuntime(MultiReduceInfo info, VariantID vid)

   auto res{getHipResource()};

-  HISTOGRAM_GPU_DATA_SETUP;
+  HISTOGRAM_DATA_SETUP;

   static constexpr size_t block_size = info.block_size;
   const size_t grid_size = info.grid_size;
diff --git a/src/algorithm/HISTOGRAM-OMP.cpp b/src/algorithm/HISTOGRAM-OMP.cpp
index 2ab07a367..87b554b47 100644
--- a/src/algorithm/HISTOGRAM-OMP.cpp
+++ b/src/algorithm/HISTOGRAM-OMP.cpp
@@ -32,10 +32,12 @@ void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_

     case Base_OpenMP : {

+      HISTOGRAM_SETUP_COUNTS;
+
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS;

         #pragma omp parallel for
         for (Index_type i = ibegin; i < iend; ++i ) {
@@ -43,16 +45,20 @@ void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
           HISTOGRAM_BODY;
         }

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS;

       }
       stopTimer();

+      HISTOGRAM_TEARDOWN_COUNTS;
+
       break;
     }

     case Lambda_OpenMP : {

+      HISTOGRAM_SETUP_COUNTS;
+
       auto histogram_base_lam = [=](Index_type i) {
                                   #pragma omp atomic
                                   HISTOGRAM_BODY;
@@ -61,18 +67,20 @@ void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS;

         #pragma omp parallel for
         for (Index_type i = ibegin; i < iend; ++i ) {
           histogram_base_lam(i);
         }

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS;

       }
       stopTimer();

+      HISTOGRAM_TEARDOWN_COUNTS;
+
       break;
     }

@@ -81,14 +89,14 @@ void HISTOGRAM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS_RAJA(RAJA::omp_multi_reduce);

         RAJA::forall<RAJA::omp_parallel_for_exec>(
           RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          HISTOGRAM_RAJA_BODY(RAJA::omp_atomic);
+          HISTOGRAM_BODY;
         });

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::omp_multi_reduce);

       }
       stopTimer();
diff --git a/src/algorithm/HISTOGRAM-Seq.cpp b/src/algorithm/HISTOGRAM-Seq.cpp
index c75463ed4..e41ab171e 100644
--- a/src/algorithm/HISTOGRAM-Seq.cpp
+++ b/src/algorithm/HISTOGRAM-Seq.cpp
@@ -30,26 +30,32 @@ void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx

     case Base_Seq : {

+      HISTOGRAM_SETUP_COUNTS;
+
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS;

         for (Index_type i = ibegin; i < iend; ++i ) {
           HISTOGRAM_BODY;
         }

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS;

       }
       stopTimer();

+      HISTOGRAM_TEARDOWN_COUNTS;
+
       break;
     }

 #if defined(RUN_RAJA_SEQ)
     case Lambda_Seq : {

+      HISTOGRAM_SETUP_COUNTS;
+
       auto histogram_base_lam = [=](Index_type i) {
                                   HISTOGRAM_BODY;
                                 };
@@ -57,17 +63,19 @@ void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS;

         for (Index_type i = ibegin; i < iend; ++i ) {
           histogram_base_lam(i);
         }

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS;

       }
       stopTimer();

+      HISTOGRAM_TEARDOWN_COUNTS;
+
       break;
     }

@@ -76,14 +84,14 @@ void HISTOGRAM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        HISTOGRAM_INIT_VALUES;
+        HISTOGRAM_INIT_COUNTS_RAJA(RAJA::seq_multi_reduce);

         RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
           [=](Index_type i) {
-          HISTOGRAM_RAJA_BODY(RAJA::seq_atomic);
+          HISTOGRAM_BODY;
         });

-        HISTOGRAM_FINALIZE_VALUES;
+        HISTOGRAM_FINALIZE_COUNTS_RAJA(RAJA::seq_multi_reduce);

       }
       stopTimer();
diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp
index 357346fd9..812f40e53 100644
--- a/src/algorithm/HISTOGRAM.hpp
+++ b/src/algorithm/HISTOGRAM.hpp
@@ -21,43 +21,36 @@
 #define HISTOGRAM_DATA_SETUP \
   Index_type num_bins = m_num_bins; \
   Index_ptr bins = m_bins; \
-  Data_ptr counts_init = m_counts_init.data(); \
-  Data_ptr counts_final = m_counts_final.data(); \
-  Data_ptr counts; \
-  allocData(getReductionDataSpace(vid), counts, num_bins);
+  std::vector<Data_type>& counts_init = m_counts_init; \
+  std::vector<Data_type>& counts_final = m_counts_final;

-#define HISTOGRAM_DATA_TEARDOWN \
-  deallocData(counts, vid);

-#define HISTOGRAM_GPU_DATA_SETUP \
-  Index_type num_bins = m_num_bins; \
-  Index_ptr bins = m_bins; \
-  Data_ptr counts_init = m_counts_init.data(); \
-  Data_ptr counts_final = m_counts_final.data();
-
-#define HISTOGRAM_BODY \
-  counts[bins[i]] += static_cast<Data_type>(1);
+#define HISTOGRAM_DATA_TEARDOWN

-#define HISTOGRAM_RAJA_BODY(policy) \
-  RAJA::atomicAdd<policy>(&counts[bins[i]], static_cast<Data_type>(1));

-#define HISTOGRAM_GPU_BIN_INDEX(bin, offset, replication) \
-  ((bin)*(replication) + ((offset)%(replication)))
+#define HISTOGRAM_SETUP_COUNTS \
+  Data_ptr counts; \
+  allocData(getReductionDataSpace(vid), counts, num_bins);

-#define HISTOGRAM_GPU_RAJA_BODY(policy, counts, index, value) \
-  RAJA::atomicAdd<policy>(&(counts)[(index)], (value));
+#define HISTOGRAM_TEARDOWN_COUNTS \
+  deallocData(counts, vid);

-#define HISTOGRAM_INIT_VALUES \
+#define HISTOGRAM_INIT_COUNTS \
   for (Index_type b = 0; b < num_bins; ++b ) { \
     counts[b] = counts_init[b]; \
   }

-#define HISTOGRAAM_FINALIZE_VALUES \
+#define HISTOGRAM_FINALIZE_COUNTS \
   for (Index_type b = 0; b < num_bins; ++b ) { \
     counts_final[b] = counts[b]; \
   }

-#define HISTOGRAM_GPU_FINALIZE_VALUES(hcounts, num_bins, replication) \
+#define HISTOGRAM_INIT_COUNTS_RAJA(policy) \
+  RAJA::MultiReduceSum<policy, Data_type> counts(counts_init);
+
+#define HISTOGRAM_FINALIZE_COUNTS_RAJA(policy) \
+  counts.get_all(counts_final);
+
+#define HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, replication) \
   for (Index_type b = 0; b < (num_bins); ++b) { \
     Data_type count_final = 0; \
     for (size_t r = 0; r < (replication); ++r) { \
@@ -67,6 +60,19 @@
   }


+#define HISTOGRAM_BODY \
+  counts[bins[i]] += static_cast<Data_type>(1);
+
+#define HISTOGRAM_RAJA_BODY(policy) \
+  RAJA::atomicAdd<policy>(&counts[bins[i]], static_cast<Data_type>(1));
+
+#define HISTOGRAM_GPU_BIN_INDEX(bin, offset, replication) \
+  ((bin)*(replication) + ((offset)%(replication)))
+
+#define HISTOGRAM_GPU_RAJA_BODY(policy, counts, index, value) \
+  RAJA::atomicAdd<policy>(&(counts)[(index)], (value));
+
+
 #include "common/KernelBase.hpp"

 namespace rajaperf
@@ -129,6 +135,8 @@ class HISTOGRAM : public KernelBase
 };


+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HP)
+
 // Compute lhs % rhs between non-negative lhs and positive power of 2 rhs
 template < typename L, typename R >
 constexpr auto power_of_2_mod(L lhs, R rhs) noexcept
@@ -257,6 +265,8 @@ struct MultiReduceAtomicCalculator
   IndexType m_global_replication;
 };

+#endif
+
 } // end namespace algorithm
 } // end namespace rajaperf

diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp
index 435cd783e..705a43cd9 100644
--- a/src/basic/MULTI_REDUCE-Cuda.cpp
+++ b/src/basic/MULTI_REDUCE-Cuda.cpp
@@ -45,7 +45,7 @@ void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid)

   auto res{getCudaResource()};

-  MULTI_REDUCE_GPU_DATA_SETUP;
+  MULTI_REDUCE_DATA_SETUP;

   RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication);

diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp
index b64c6fc72..2f5897a95 100644
--- a/src/basic/MULTI_REDUCE-Hip.cpp
+++ b/src/basic/MULTI_REDUCE-Hip.cpp
@@ -45,7 +45,7 @@ void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid)

   auto res{getHipResource()};

-  MULTI_REDUCE_GPU_DATA_SETUP;
+  MULTI_REDUCE_DATA_SETUP;

   RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication);

diff --git a/src/basic/MULTI_REDUCE-OMP.cpp b/src/basic/MULTI_REDUCE-OMP.cpp
index 18777940e..2e2ebf5d4 100644
--- a/src/basic/MULTI_REDUCE-OMP.cpp
+++ b/src/basic/MULTI_REDUCE-OMP.cpp
@@ -32,6 +32,8 @@ void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu

     case Base_OpenMP : {

+      MULTI_REDUCE_SETUP_VALUES;
+
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

@@ -48,11 +50,15 @@ void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       }
       stopTimer();

+      MULTI_REDUCE_TEARDOWN_VALUES;
+
       break;
     }

     case Lambda_OpenMP : {

+      MULTI_REDUCE_SETUP_VALUES;
+
       auto multi_reduce_base_lam = [=](Index_type i) {
                                      #pragma omp atomic
                                      MULTI_REDUCE_BODY;
@@ -73,6 +79,8 @@ void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       }
       stopTimer();

+      MULTI_REDUCE_TEARDOWN_VALUES;
+
       break;
     }

@@ -81,14 +89,14 @@ void MULTI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        MULTI_REDUCE_INIT_VALUES;
+        MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::omp_multi_reduce);

         RAJA::forall<RAJA::omp_parallel_for_exec>(
           RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-          MULTI_REDUCE_RAJA_BODY(RAJA::omp_atomic);
+          MULTI_REDUCE_BODY;
         });

-        MULTI_REDUCE_FINALIZE_VALUES;
+        MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::omp_multi_reduce);

       }
       stopTimer();
diff --git a/src/basic/MULTI_REDUCE-Seq.cpp b/src/basic/MULTI_REDUCE-Seq.cpp
index ac3bd41bb..a771953aa 100644
--- a/src/basic/MULTI_REDUCE-Seq.cpp
+++ b/src/basic/MULTI_REDUCE-Seq.cpp
@@ -30,6 +30,8 @@ void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_

     case Base_Seq : {

+      MULTI_REDUCE_SETUP_VALUES;
+
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

@@ -44,12 +46,16 @@ void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       }
       stopTimer();

+      MULTI_REDUCE_TEARDOWN_VALUES;
+
       break;
     }

 #if defined(RUN_RAJA_SEQ)
     case Lambda_Seq : {

+      MULTI_REDUCE_SETUP_VALUES;
+
       auto multi_reduce_base_lam = [=](Index_type i) {
                                      MULTI_REDUCE_BODY;
                                    };
@@ -68,6 +74,8 @@ void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       }
       stopTimer();

+      MULTI_REDUCE_TEARDOWN_VALUES;
+
       break;
     }

@@ -76,14 +84,14 @@ void MULTI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        MULTI_REDUCE_INIT_VALUES;
+        MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::seq_multi_reduce);

         RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
           [=](Index_type i) {
-          MULTI_REDUCE_RAJA_BODY(RAJA::seq_atomic);
+          MULTI_REDUCE_BODY;
         });

-        MULTI_REDUCE_FINALIZE_VALUES;
+        MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::seq_multi_reduce);

       }
       stopTimer();
diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp
index 4123f483b..a85a5473a 100644
--- a/src/basic/MULTI_REDUCE.hpp
+++ b/src/basic/MULTI_REDUCE.hpp
@@ -22,29 +22,18 @@
   Index_type num_bins = m_num_bins; \
   Index_ptr bins = m_bins; \
   Data_ptr data = m_data; \
-  Data_ptr values_init = m_values_init.data(); \
-  Data_ptr values_final = m_values_final.data(); \
-  Data_ptr values; \
-  allocData(getReductionDataSpace(vid), values, num_bins);
-
-#define MULTI_REDUCE_DATA_TEARDOWN \
-  deallocData(values, vid);
+  std::vector<Data_type>& values_init = m_values_init; \
+  std::vector<Data_type>& values_final = m_values_final;

-#define MULTI_REDUCE_GPU_DATA_SETUP \
-  Index_type num_bins = m_num_bins; \
-  Index_ptr bins = m_bins; \
-  Data_ptr data = m_data; \
-  Data_ptr values_init = m_values_init.data(); \
-  Data_ptr values_final = m_values_final.data();
+#define MULTI_REDUCE_DATA_TEARDOWN

-#define MULTI_REDUCE_BODY \
-  values[bins[i]] += data[i];

-#define MULTI_REDUCE_RAJA_BODY(policy) \
-  RAJA::atomicAdd<policy>(&values[bins[i]], data[i]);
+#define MULTI_REDUCE_SETUP_VALUES \
+  Data_ptr values; \
+  allocData(getReductionDataSpace(vid), values, num_bins);

-#define MULTI_REDUCE_GPU_RAJA_BODY(policy) \
-  RAJA::atomicAdd<policy>(&values[bins[i]*replication + (i%replication)], data[i]);
+#define MULTI_REDUCE_TEARDOWN_VALUES \
+  deallocData(values, vid);

 #define MULTI_REDUCE_INIT_VALUES \
   for (Index_type b = 0; b < num_bins; ++b ) { \
@@ -56,6 +45,12 @@
     values_final[b] = values[b]; \
   }

+#define MULTI_REDUCE_INIT_VALUES_RAJA(policy) \
+  RAJA::MultiReduceSum<policy, Data_type> values(values_init);
+
+#define MULTI_REDUCE_FINALIZE_VALUES_RAJA(policy) \
+  values.get_all(values_final);
+
 #define MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication) \
   for (Index_type b = 0; b < (num_bins); ++b) { \
     Data_type val_final = 0; \
@@ -66,6 +61,16 @@
   }


+#define MULTI_REDUCE_BODY \
+  values[bins[i]] += data[i];
+
+#define MULTI_REDUCE_RAJA_BODY(policy) \
+  RAJA::atomicAdd<policy>(&values[bins[i]], data[i]);
+
+#define MULTI_REDUCE_GPU_RAJA_BODY(policy) \
+  RAJA::atomicAdd<policy>(&values[bins[i]*replication + (i%replication)], data[i]);
+
+
 #include "common/KernelBase.hpp"

 namespace rajaperf

From fbc57f54b13e33d33b840f87ea44a063a5806b77 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 28 Jun 2024 11:54:37 -0700
Subject: [PATCH 386/454] Add missing header

---
 src/common/GPUUtils.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index 1ef5d249e..0061928f5 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -16,6 +16,8 @@

 #include "rajaperf_config.hpp"

+#include
+
 namespace rajaperf
 {

From e2cae94992a8d0187f9130e9364210186de79d79 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 28 Jun 2024 11:55:11 -0700
Subject: [PATCH 387/454] remove RAJA omp target variants for now

---
 src/algorithm/HISTOGRAM-OMPTarget.cpp | 17 -----------------
 src/algorithm/HISTOGRAM.cpp           |  1 -
 src/basic/MULTI_REDUCE-OMPTarget.cpp  | 17 -----------------
 src/basic/MULTI_REDUCE.cpp            |  1 -
 4 files changed, 36 deletions(-)

diff --git a/src/algorithm/HISTOGRAM-OMPTarget.cpp b/src/algorithm/HISTOGRAM-OMPTarget.cpp
index 93217c194..033f309c9 100644
--- a/src/algorithm/HISTOGRAM-OMPTarget.cpp
+++ b/src/algorithm/HISTOGRAM-OMPTarget.cpp
@@ -54,23 +54,6 @@ void HISTOGRAM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG
     }
     stopTimer();

-  } else if ( vid == RAJA_OpenMPTarget ) {
-
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-      initOpenMPDeviceData(counts, counts_init, num_bins);
-
-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-        HISTOGRAM_RAJA_BODY(RAJA::omp_atomic);
-      });
-
-      getOpenMPDeviceData(counts_final, counts, num_bins);
-
-    }
-    stopTimer();
-
   } else {
     getCout() << "\n  HISTOGRAM : Unknown OMP Target variant id = " << vid << std::endl;
   }
diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp
index 116892195..01932f43e 100644
--- a/src/algorithm/HISTOGRAM.cpp
+++ b/src/algorithm/HISTOGRAM.cpp
@@ -48,7 +48,6 @@ HISTOGRAM::HISTOGRAM(const RunParams& params)
   setVariantDefined( RAJA_OpenMP );

   setVariantDefined( Base_OpenMPTarget );
-  setVariantDefined( RAJA_OpenMPTarget );

   setVariantDefined( Base_CUDA );
   setVariantDefined( Lambda_CUDA );
diff --git a/src/basic/MULTI_REDUCE-OMPTarget.cpp b/src/basic/MULTI_REDUCE-OMPTarget.cpp
index 30e2a5c2d..8c2e18060 100644
--- a/src/basic/MULTI_REDUCE-OMPTarget.cpp
+++ b/src/basic/MULTI_REDUCE-OMPTarget.cpp
@@ -54,23 +54,6 @@ void MULTI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_
     }
     stopTimer();

-  } else if ( vid == RAJA_OpenMPTarget ) {
-
-    startTimer();
-    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-      initOpenMPDeviceData(values, values_init, num_bins);
-
-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
-        MULTI_REDUCE_RAJA_BODY(RAJA::omp_atomic);
-      });
-
-      getOpenMPDeviceData(values_final, values, num_bins);
-
-    }
-    stopTimer();
-
   } else {
     getCout() << "\n  MULTI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl;
   }
diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp
index a41b20850..4d111444c 100644
--- a/src/basic/MULTI_REDUCE.cpp
+++ b/src/basic/MULTI_REDUCE.cpp
@@ -47,7 +47,6 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params)
   setVariantDefined( RAJA_OpenMP );

   setVariantDefined( Base_OpenMPTarget );
-  setVariantDefined( RAJA_OpenMPTarget );

   setVariantDefined( Base_CUDA );
   setVariantDefined( Lambda_CUDA );

From e34c3c4a955ee90d7b5e93b2e9c31528326fa058 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 28 Jun 2024 12:47:24 -0700
Subject: [PATCH 388/454] Proposed pattern for new reduction variant

---
 src/basic/PI_REDUCE-Cuda.cpp | 29 +++++++++++++++++++++++++----
 src/basic/PI_REDUCE.cpp      |  1 +
 src/common/KernelBase.cpp    |  5 +++++
 src/common/KernelBase.hpp    |  3 ++-
 src/common/RAJAPerfSuite.cpp |  7 +++++--
 src/common/RAJAPerfSuite.hpp |  1 +
 6 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index c9c165321..30207c632 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -138,6 +138,27 @@ void PI_REDUCE::runCudaVariantRAJA(VariantID vid)
     }
     stopTimer();

+  } else if ( vid == RAJA_CUDA_NewReduce ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tpi = m_pi_init;
+
+      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >(
+        res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+        [=] __device__ (Index_type i, Real_type& pi) {
+          PI_REDUCE_BODY;
+        }
+      );
+
+      m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+    }
+    stopTimer();
+
   } else {
      getCout() << "\n  PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl;
   }
@@ -147,7 +168,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)
 {
   size_t t = 0;

-  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+  if ( vid == Base_CUDA || vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {

     seq_for(gpu_block_sizes_type{}, [&](auto block_size) {

@@ -168,7 +189,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)

           t += 1;

-        } else if ( vid == RAJA_CUDA ) {
+        } else if ( vid == RAJA_CUDA || RAJA_CUDA_NewReduce) {

           seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {

@@ -203,7 +224,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
 {
-  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {
+  if ( vid == Base_CUDA || vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {

     seq_for(gpu_block_sizes_type{}, [&](auto block_size) {

@@ -220,7 +241,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
                                        decltype(mapping_helper)::get_name()+"_"+
                                        std::to_string(block_size));

-        } else if ( vid == RAJA_CUDA ) {
+        } else if ( vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {

           seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {

diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index 2ae27e762..d0f4ebe9d 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -48,6 +48,7 @@ PI_REDUCE::PI_REDUCE(const RunParams& params)

   setVariantDefined( Base_CUDA );
   setVariantDefined( RAJA_CUDA );
+  setVariantDefined( RAJA_CUDA_NewReduce );

   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 3d0e55302..586868e90 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -160,6 +160,7 @@ void KernelBase::setVariantDefined(VariantID vid)
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
+    case RAJA_CUDA_NewReduce :
     {
 #if defined(RAJA_ENABLE_CUDA)
       setCudaTuningDefinitions(vid);
@@ -239,6 +240,7 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
    case RAJA_CUDA :
+    case RAJA_CUDA_NewReduce :
       return run_params.getCudaDataSpace();

     case Base_HIP :
@@ -279,6 +281,7 @@ DataSpace KernelBase::getMPIDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
+    case RAJA_CUDA_NewReduce :
       return run_params.getCudaMPIDataSpace();

     case Base_HIP :
@@ -319,6 +322,7 @@ DataSpace KernelBase::getReductionDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
+    case RAJA_CUDA_NewReduce :
       return run_params.getCudaReductionDataSpace();

     case Base_HIP :
@@ -421,6 +425,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
+    case RAJA_CUDA_NewReduce :
     {
 #if defined(RAJA_ENABLE_CUDA)
       runCudaVariant(vid, tune_idx);
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 4feda83d9..1aaa94c25 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -266,7 +266,8 @@ class KernelBase
 #if defined(RAJA_ENABLE_CUDA)
     if ( running_variant == Base_CUDA ||
          running_variant == Lambda_CUDA ||
-         running_variant == RAJA_CUDA ) {
+         running_variant == RAJA_CUDA ||
+         running_variant == RAJA_CUDA_NewReduce ) {
       cudaErrchk( cudaDeviceSynchronize() );
     }
 #endif
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index 3a79da87f..fd2a060f4 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -304,6 +304,7 @@ static const std::string VariantNames [] =
   std::string("Base_CUDA"),
   std::string("Lambda_CUDA"),
   std::string("RAJA_CUDA"),
+  std::string("RAJA_CUDA_NewReduce"),

   std::string("Base_HIP"),
   std::string("Lambda_HIP"),
@@ -501,7 +502,8 @@ bool isVariantAvailable(VariantID vid)
 #if defined(RAJA_ENABLE_CUDA)
   if ( vid == Base_CUDA ||
        vid == Lambda_CUDA ||
-       vid == RAJA_CUDA ) {
+       vid == RAJA_CUDA ||
+       vid == RAJA_CUDA_NewReduce ) {
     ret_val = true;
   }
 #endif
@@ -569,7 +571,8 @@ bool isVariantGPU(VariantID vid)
 #if defined(RAJA_ENABLE_CUDA)
   if ( vid == Base_CUDA ||
        vid == Lambda_CUDA ||
-       vid == RAJA_CUDA ) {
+       vid == RAJA_CUDA ||
+       vid == RAJA_CUDA_NewReduce ) {
     ret_val = true;
   }
 #endif
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index 20380d08e..930238af4 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -211,6 +211,7 @@ enum VariantID {
   Base_CUDA,
   Lambda_CUDA,
   RAJA_CUDA,
+  RAJA_CUDA_NewReduce,

   Base_HIP,
   Lambda_HIP,

From b39985f30a30c86cbb7d6f3f1b0bd7436ace17e4 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 28 Jun 2024 13:19:47 -0700
Subject: [PATCH 389/454] Update PI_REDUCE-Cuda.cpp

Here's an example of how to add the new reducers as a tuning

---
 src/basic/PI_REDUCE-Cuda.cpp | 50 +++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index 30207c632..4abe7a4ad 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -138,8 +138,29 @@ void PI_REDUCE::runCudaVariantRAJA(VariantID vid)
     }
     stopTimer();

-  } else if ( vid == RAJA_CUDA_NewReduce ) {
-
+  } else {
+     getCout() << "\n  PI_REDUCE : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+
+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void PI_REDUCE::runCudaVariantNewRAJA(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct,
+      RAJA::cuda_exec<block_size, true /*async*/>,
+      RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  PI_REDUCE_DATA_SETUP;
+
+  if ( vid == RAJA_CUDA ) {
+
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

@@ -168,7 +189,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)
 {
   size_t t = 0;

-  if ( vid == Base_CUDA || vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {

     seq_for(gpu_block_sizes_type{}, [&](auto block_size) {

@@ -189,7 +210,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)

           t += 1;

-        } else if ( vid == RAJA_CUDA || RAJA_CUDA_NewReduce) {
+        } else if ( vid == RAJA_CUDA ) {

           seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {

@@ -206,6 +227,18 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)

           });

+          if (tune_idx == t) {
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+            setBlockSize(block_size);
+            runCudaVariantNewRAJA(vid);
+
+          }
+
+          t += 1;
+
         }

       });
@@ -224,7 +257,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
 {
-  if ( vid == Base_CUDA || vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {
+  if ( vid == Base_CUDA || vid == RAJA_CUDA ) {

     seq_for(gpu_block_sizes_type{}, [&](auto block_size) {

@@ -241,7 +274,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
                                        decltype(mapping_helper)::get_name()+"_"+
                                        std::to_string(block_size));

-        } else if ( vid == RAJA_CUDA || RAJA_CUDA_NewReduce ) {
+        } else if ( vid == RAJA_CUDA ) {

           seq_for(gpu_algorithm::reducer_helpers{}, [&](auto algorithm_helper) {

@@ -250,6 +283,11 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
                                          std::to_string(block_size));

           });
+
+          auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+          addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                    "new_"+std::to_string(block_size));

         }

From 064f8e79fdfa2bd639e7162d515dfb68c924d300 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 28 Jun 2024 13:57:49 -0700
Subject: [PATCH 390/454] Remove earlier "variant" attempt and fix compilation.
---
 src/basic/PI_REDUCE-Cuda.cpp | 8 ++++----
 src/basic/PI_REDUCE.cpp      | 1 -
 src/basic/PI_REDUCE.hpp      | 3 +++
 src/common/KernelBase.cpp    | 5 -----
 src/common/KernelBase.hpp    | 3 +--
 src/common/RAJAPerfSuite.cpp | 7 ++-----
 src/common/RAJAPerfSuite.hpp | 1 -
 7 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index 4abe7a4ad..2abb07555 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -145,7 +145,7 @@ void PI_REDUCE::runCudaVariantRAJA(VariantID vid)

 template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
-void PI_REDUCE::runCudaVariantNewRAJA(VariantID vid)
+void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid)
 {
   using exec_policy = std::conditional_t<MappingHelper::direct,
@@ -231,9 +231,9 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)

             auto algorithm_helper = gpu_algorithm::block_device_helper{};
             setBlockSize(block_size);
-            runCudaVariantNewRAJA(vid);
+            runCudaVariantRAJANewReduce(vid);

           }

diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp
index d0f4ebe9d..2ae27e762 100644
--- a/src/basic/PI_REDUCE.cpp
+++ b/src/basic/PI_REDUCE.cpp
@@ -48,7 +48,6 @@ PI_REDUCE::PI_REDUCE(const RunParams& params)

   setVariantDefined( Base_CUDA );
   setVariantDefined( RAJA_CUDA );
-  setVariantDefined( RAJA_CUDA_NewReduce );

   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp
index 4dc3104d3..98c64495d 100644
--- a/src/basic/PI_REDUCE.hpp
+++ b/src/basic/PI_REDUCE.hpp
@@ -69,6 +69,9 @@ class PI_REDUCE : public KernelBase
   template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
   void runCudaVariantRAJA(VariantID vid);

+  template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+  void runCudaVariantRAJANewReduce(VariantID vid);
+
   template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
   void runHipVariantRAJA(VariantID vid);

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 586868e90..3d0e55302 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -160,7 +160,6 @@ void KernelBase::setVariantDefined(VariantID vid)
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
-    case RAJA_CUDA_NewReduce :
     {
 #if defined(RAJA_ENABLE_CUDA)
       setCudaTuningDefinitions(vid);
@@ -240,7 +239,6 @@ DataSpace KernelBase::getDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
-    case RAJA_CUDA_NewReduce :
       return run_params.getCudaDataSpace();

     case Base_HIP :
@@ -281,7 +279,6 @@ DataSpace KernelBase::getMPIDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
-    case RAJA_CUDA_NewReduce :
       return run_params.getCudaMPIDataSpace();

     case Base_HIP :
@@ -322,7 +319,6 @@ DataSpace KernelBase::getReductionDataSpace(VariantID vid) const
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
-    case RAJA_CUDA_NewReduce :
       return run_params.getCudaReductionDataSpace();

     case Base_HIP :
@@ -425,7 +421,6 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
     case Base_CUDA :
     case Lambda_CUDA :
     case RAJA_CUDA :
-    case RAJA_CUDA_NewReduce :
     {
 #if defined(RAJA_ENABLE_CUDA)
       runCudaVariant(vid, tune_idx);
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 1aaa94c25..4feda83d9 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -266,8 +266,7 @@ class KernelBase
 #if defined(RAJA_ENABLE_CUDA)
     if ( running_variant == Base_CUDA ||
         running_variant == Lambda_CUDA ||
-         running_variant == RAJA_CUDA ||
-         running_variant == RAJA_CUDA_NewReduce ) {
+         running_variant == RAJA_CUDA ) {
       cudaErrchk( cudaDeviceSynchronize() );
     }
 #endif
diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp
index fd2a060f4..3a79da87f 100644
--- a/src/common/RAJAPerfSuite.cpp
+++ b/src/common/RAJAPerfSuite.cpp
@@ -304,7 +304,6 @@ static const std::string VariantNames [] =
   std::string("Base_CUDA"),
   std::string("Lambda_CUDA"),
   std::string("RAJA_CUDA"),
-  std::string("RAJA_CUDA_NewReduce"),

   std::string("Base_HIP"),
   std::string("Lambda_HIP"),
@@ -502,8 +501,7 @@ bool isVariantAvailable(VariantID vid)
 #if defined(RAJA_ENABLE_CUDA)
   if ( vid == Base_CUDA ||
        vid == Lambda_CUDA ||
-       vid == RAJA_CUDA ||
-       vid == RAJA_CUDA_NewReduce ) {
+       vid == RAJA_CUDA ) {
     ret_val = true;
   }
 #endif
@@ -571,8 +569,7 @@ bool isVariantGPU(VariantID vid)
 #if defined(RAJA_ENABLE_CUDA)
   if ( vid == Base_CUDA ||
        vid == Lambda_CUDA ||
-       vid == RAJA_CUDA ||
-       vid == RAJA_CUDA_NewReduce ) {
+       vid == RAJA_CUDA ) {
     ret_val = true;
   }
 #endif
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index 930238af4..20380d08e 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -211,7 +211,6 @@ enum VariantID {
   Base_CUDA,
   Lambda_CUDA,
   RAJA_CUDA,
-  RAJA_CUDA_NewReduce,

   Base_HIP,
   Lambda_HIP,

From bdc82900b4afe5791307a0a16285cb999136ae50 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 28 Jun 2024 14:38:04 -0700
Subject: [PATCH 391/454] Add new reduce tuning to Seq variant

---
 src/basic/PI_REDUCE-Cuda.cpp |  3 ++
 src/basic/PI_REDUCE-Seq.cpp  | 53 +++++++++++++++++++++++++++++-------
 src/basic/PI_REDUCE.hpp      |  1 +
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index 2abb07555..f343de523 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -234,6 +234,7 @@ void PI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx)
             runCudaVariantRAJANewReduce(vid);

+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
           }

@@ -273,6 +274,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
           addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
                                     decltype(mapping_helper)::get_name()+"_"+
                                     std::to_string(block_size));
+          RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning

         } else if ( vid == RAJA_CUDA ) {

@@ -288,6 +290,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)

           addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
                                     "new_"+std::to_string(block_size));
+          RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning

         }

diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp
index 9fc2ed0b5..13cd17582 100644
--- a/src/basic/PI_REDUCE-Seq.cpp
+++ b/src/basic/PI_REDUCE-Seq.cpp
@@ -18,7 +18,7 @@ namespace basic
 {


-void PI_REDUCE::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
@@ -74,20 +74,45 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx)

     case RAJA_Seq : {

-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {

-        RAJA::ReduceSum<RAJA::seq_reduce, Real_type> pi(m_pi_init);
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
-          [=](Index_type i) {
-            PI_REDUCE_BODY;
-        });
+          RAJA::ReduceSum<RAJA::seq_reduce, Real_type> pi(m_pi_init);
+
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              PI_REDUCE_BODY;
+          });

-        m_pi = 4.0 * pi.get();
+          m_pi = 4.0 * pi.get();
+
+        }
+        stopTimer();

       }
-      stopTimer();
+
+      if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tpi = m_pi_init;
+
+          RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+            [=] __device__ (Index_type i, Real_type& pi) {
+              PI_REDUCE_BODY;
+            }
+          );
+
+          m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+        }
+        stopTimer();
+
+      }

       break;
     }
@@ -101,5 +126,13 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx)

 }

+void PI_REDUCE::setSeqTuningDefinitions(VariantID vid)
+{
+  if (vid == RAJA_Seq) {
+    addVariantTuningName(vid, "default");
+    addVariantTuningName(vid, "new");
+  }
+}
+
 } // end namespace basic
 } // end namespace rajaperf
diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp
index 98c64495d..bedb972a5 100644
--- a/src/basic/PI_REDUCE.hpp
+++ b/src/basic/PI_REDUCE.hpp
@@ -58,6 +58,7 @@ class PI_REDUCE : public KernelBase
   void runOpenMPTargetVariant(VariantID vid, size_t tune_idx);
   void runSyclVariant(VariantID vid, size_t tune_idx);

+  void setSeqTuningDefinitions(VariantID vid);
   void setCudaTuningDefinitions(VariantID vid);
   void setHipTuningDefinitions(VariantID vid);
   void setSyclTuningDefinitions(VariantID vid);

From bad957c82b55ccf07cca4ba419d64b969cc35e87 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 28 Jun 2024 14:49:42 -0700
Subject: [PATCH 392/454] Remove device lambda decoration... D'oh!

---
 src/basic/PI_REDUCE-Seq.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp
index 13cd17582..34ecc261f 100644
--- a/src/basic/PI_REDUCE-Seq.cpp
+++ b/src/basic/PI_REDUCE-Seq.cpp
@@ -102,7 +102,7 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx)

           RAJA::forall<RAJA::seq_exec>( RAJA::RangeSegment(ibegin, iend),
             RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
-            [=] __device__ (Index_type i, Real_type& pi) {
+            [=] (Index_type i, Real_type& pi) {
               PI_REDUCE_BODY;
             }
           );

From 62a2df58266668c743780dc4c27d3cdaf4d68b7e Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 1 Jul 2024 15:35:29 -0700
Subject: [PATCH 393/454] Add new reduction "tuning" for OpenMP and HIP, and
 make code consistent

---
 src/basic/PI_REDUCE-Cuda.cpp |  3 +-
 src/basic/PI_REDUCE-Hip.cpp  | 61 ++++++++++++++++++++++++++++++++++++
 src/basic/PI_REDUCE-OMP.cpp  | 58 +++++++++++++++++++++++++++-------
 src/basic/PI_REDUCE-Seq.cpp  |  4 ++-
 src/basic/PI_REDUCE.hpp      |  8 +++--
 5 files changed, 118 insertions(+), 16 deletions(-)

diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp
index f343de523..8c2ebdb8e 100644
--- a/src/basic/PI_REDUCE-Cuda.cpp
+++ b/src/basic/PI_REDUCE-Cuda.cpp
@@ -166,7 +166,7 @@ void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid)

       Real_type tpi = m_pi_init;

-      RAJA::forall< RAJA::cuda_exec<block_size, true /*async*/> >(
+      RAJA::forall< exec_policy >(
        res,
        RAJA::RangeSegment(ibegin, iend),
        RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
@@ -289,6 +289,7 @@ void PI_REDUCE::setCudaTuningDefinitions(VariantID vid)
           auto algorithm_helper = gpu_algorithm::block_device_helper{};

           addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                    decltype(mapping_helper)::get_name()+"_"+
                                     "new_"+std::to_string(block_size));
           RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning

diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp
index 7df62b8c8..0050f7548 100644
--- a/src/basic/PI_REDUCE-Hip.cpp
+++ b/src/basic/PI_REDUCE-Hip.cpp
@@ -143,6 +143,47 @@ void PI_REDUCE::runHipVariantRAJA(VariantID vid)
   }
 }

+template < size_t block_size, typename AlgorithmHelper, typename MappingHelper >
+void PI_REDUCE::runHipVariantRAJANewReduce(VariantID vid)
+{
+  using exec_policy = std::conditional_t<MappingHelper::direct,
+      RAJA::hip_exec<block_size, true /*async*/>,
+      RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  PI_REDUCE_DATA_SETUP;
+
+  if ( vid == RAJA_HIP ) {
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      Real_type tpi = m_pi_init;
+
+      RAJA::forall< exec_policy >(
+        res,
+        RAJA::RangeSegment(ibegin, iend),
+        RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+        [=] __device__ (Index_type i, Real_type& pi) {
+          PI_REDUCE_BODY;
+        }
+      );
+
+      m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+    }
+    stopTimer();
+
+  } else {
+     getCout() << "\n  PI_REDUCE : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
 void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx)
 {
   size_t t = 0;
@@ -185,6 +226,19 @@ void PI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx)

           });

+          if (tune_idx == t) {
+
+            auto algorithm_helper = gpu_algorithm::block_device_helper{};
+            setBlockSize(block_size);
+            runHipVariantRAJANewReduce(vid);
+            RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
+          }
+
+          t += 1;
+
         }

       });
@@ -230,6 +284,13 @@ void PI_REDUCE::setHipTuningDefinitions(VariantID vid)

           });

+          auto algorithm_helper = gpu_algorithm::block_device_helper{};
+
+          addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+
+                                    decltype(mapping_helper)::get_name()+"_"+
+                                    "new_"+std::to_string(block_size));
+          RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning
+
         }

       });
diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp
index cbf32359e..e95aa68fc 100644
--- a/src/basic/PI_REDUCE-OMP.cpp
+++ b/src/basic/PI_REDUCE-OMP.cpp
@@ -18,7 +18,7 @@ namespace basic
 {


-void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
+void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx)
 {
 #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)

@@ -77,21 +77,47 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx)

     case RAJA_OpenMP : {

-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+      if (tune_idx == 0) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

-        RAJA::ReduceSum<RAJA::omp_reduce, Real_type> pi(m_pi_init);
+          RAJA::ReduceSum<RAJA::omp_reduce, Real_type> pi(m_pi_init);

-        RAJA::forall<RAJA::omp_parallel_for_exec>(
-          RAJA::RangeSegment(ibegin, iend),
-          [=](Index_type i) {
-            PI_REDUCE_BODY;
-        });
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            [=](Index_type i) {
+              PI_REDUCE_BODY;
+          });

-        m_pi = 4.0 * pi.get();
+          m_pi = 4.0 * pi.get();
+
+        }
+        stopTimer();
+
+      }
+
+      if (tune_idx == 1) {
+
+        startTimer();
+        for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+          Real_type tpi = m_pi_init;
+
+          RAJA::forall<RAJA::omp_parallel_for_exec>(
+            RAJA::RangeSegment(ibegin, iend),
+            RAJA::expt::Reduce<RAJA::operators::plus>(&tpi),
+            [=] (Index_type i, Real_type& pi) {
+              PI_REDUCE_BODY;
+            }
+          );
+
+          m_pi = static_cast<Real_type>(tpi) * 4.0;
+
+        }
+        stopTimer();

       }
-      stopTimer();

       break;
     }
@@ -107,5 +133,15 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx)
 #endif
 }

+void PI_REDUCE::setOpenMPTuningDefinitions(VariantID vid)
+{
+  if (vid == Base_OpenMP || vid == Lambda_OpenMP) {
+    addVariantTuningName(vid, "default");
+  } else if (vid == RAJA_OpenMP) {
+    addVariantTuningName(vid, "default");
+    addVariantTuningName(vid, "new");
+  }
+} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 34ecc261f..2e9ad6432 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -128,7 +128,9 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) void PI_REDUCE::setSeqTuningDefinitions(VariantID vid) { - if (vid == RAJA_Seq) { + if (vid == Base_Seq || vid == Lambda_Seq) { + addVariantTuningName(vid, "default"); + } else if (vid == RAJA_Seq) { addVariantTuningName(vid, "default"); addVariantTuningName(vid, "new"); } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index bedb972a5..34b263f54 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -59,22 +59,24 @@ class PI_REDUCE : public KernelBase void runSyclVariant(VariantID vid, size_t tune_idx); void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); From bd05958aa9c98e8259699f4b1fd1518bceafc2b8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 2 Jul 2024 15:51:54 -0700 Subject: [PATCH 394/454] Fix compiler errors and warnings for OpenMP target build --- src/algorithm/ATOMIC-OMPTarget.cpp | 2 +- src/apps/EDGE3D-OMPTarget.cpp | 5 ----- src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp | 10 ++++++++-- src/basic/ARRAY_OF_PTRS-Seq.cpp | 2 ++ src/comm/HALO_PACKING_FUSED-Seq.cpp | 12 ++++++------ src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp | 2 ++ 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/algorithm/ATOMIC-OMPTarget.cpp b/src/algorithm/ATOMIC-OMPTarget.cpp index bbd3d6b67..2c7bb7203 100644 --- a/src/algorithm/ATOMIC-OMPTarget.cpp +++ b/src/algorithm/ATOMIC-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace algorithm const size_t threads_per_team = 256; template < size_t replication > -void ATOMIC::runOpenMPTargetReplicate(VariantID vid) +void ATOMIC::runOpenMPTargetVariantReplicate(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; diff --git a/src/apps/EDGE3D-OMPTarget.cpp b/src/apps/EDGE3D-OMPTarget.cpp index 57b2bc738..2a348ec28 100644 --- a/src/apps/EDGE3D-OMPTarget.cpp +++ b/src/apps/EDGE3D-OMPTarget.cpp @@ -37,11 +37,6 @@ void EDGE3D::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tu EDGE3D_DATA_SETUP; - auto edge3d_lam = - [=](Index_type i) { - EDGE3D_BODY; - }; - if ( vid == Base_OpenMPTarget ) { startTimer(); diff --git a/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp index 1c10d27f0..09a3093d4 100644 --- 
a/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp +++ b/src/apps/MATVEC_3D_STENCIL-OMPTarget.cpp @@ -42,8 +42,14 @@ void MATVEC_3D_STENCIL::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(x0,x1,x2,x3,x4,x5,x6,x7, \ - vol, real_zones) device( did ) + #pragma omp target is_device_ptr(b, \ + dbl, dbc, dbr, dcl, dcc, dcr, dfl, dfc, dfr, \ + xdbl, xdbc, xdbr, xdcl, xdcc, xdcr, xdfl, xdfc, xdfr, \ + cbl, cbc, cbr, ccl, ccc, ccr, cfl, cfc, cfr, \ + xcbl, xcbc, xcbr, xccl, xccc, xccr, xcfl, xcfc, xcfr, \ + ubl, ubc, ubr, ucl, ucc, ucr, ufl, ufc, ufr, \ + xubl, xubc, xubr, xucl, xucc, xucr, xufl, xufc, xufr, \ + real_zones) device( did ) #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) for (Index_type ii = ibegin ; ii < iend ; ++ii ) { MATVEC_3D_STENCIL_BODY_INDEX; diff --git a/src/basic/ARRAY_OF_PTRS-Seq.cpp b/src/basic/ARRAY_OF_PTRS-Seq.cpp index d03fb7ac4..dd22e010d 100644 --- a/src/basic/ARRAY_OF_PTRS-Seq.cpp +++ b/src/basic/ARRAY_OF_PTRS-Seq.cpp @@ -26,9 +26,11 @@ void ARRAY_OF_PTRS::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune ARRAY_OF_PTRS_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto array_of_ptrs_lam = [=](Index_type i) { ARRAY_OF_PTRS_BODY(x); }; +#endif switch ( vid ) { diff --git a/src/comm/HALO_PACKING_FUSED-Seq.cpp b/src/comm/HALO_PACKING_FUSED-Seq.cpp index 2b25adcd0..f7c16e253 100644 --- a/src/comm/HALO_PACKING_FUSED-Seq.cpp +++ b/src/comm/HALO_PACKING_FUSED-Seq.cpp @@ -189,15 +189,15 @@ void HALO_PACKING_FUSED::runSeqVariantDirect(VariantID vid) template < typename dispatch_helper > void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) { - const Index_type run_reps = getRunReps(); - - HALO_PACKING_FUSED_DATA_SETUP; - switch ( vid ) { -#if defined(RUN_RAJA_SEQ) case RAJA_Seq : { +#if defined(RUN_RAJA_SEQ) + const Index_type run_reps = getRunReps(); + + HALO_PACKING_FUSED_DATA_SETUP; + using AllocatorHolder = RAJAPoolAllocatorHolder< RAJA::basic_mempool::MemPool>; using Allocator = AllocatorHolder::Allocator; @@ -281,10 +281,10 @@ void HALO_PACKING_FUSED::runSeqVariantWorkGroup(VariantID vid) } stopTimer(); +#endif // RUN_RAJA_SEQ break; } -#endif // RUN_RAJA_SEQ default : { getCout() << "\n HALO_PACKING_FUSED : Unknown variant id = " << vid << std::endl; diff --git a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp index 383822a87..5f3549b41 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D-Seq.cpp @@ -25,12 +25,14 @@ void POLYBENCH_JACOBI_1D::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR POLYBENCH_JACOBI_1D_DATA_SETUP; +#if defined(RUN_RAJA_SEQ) auto poly_jacobi1d_lam1 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY1; }; auto poly_jacobi1d_lam2 = [=] (Index_type i) { POLYBENCH_JACOBI_1D_BODY2; }; +#endif switch ( vid ) { From 17d4e9170b4a8fae2b3e9d6a625ed29008f79993 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 2 Jul 2024 15:52:44 -0700 Subject: [PATCH 395/454] Clean up reduction tuning variants for PI_REDUCE. Note: new reduction interface tuning is if-def'd out since it generates an internal compiler error for some compilers and when it does build generates an incorrect checksum. More explorations to do. 
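For reference, a self-contained sketch of the two reduction interfaces these tunings toggle between. It uses sequential policies for brevity, and the function names are illustrative, not taken from the suite:

    #include "RAJA/RAJA.hpp"

    // "default" tuning style: a ReduceSum object is captured by the
    // lambda and the combined result is read back through get().
    double sum_reduce_object(const double* a, int n)
    {
      RAJA::ReduceSum<RAJA::seq_reduce, double> sum(0.0);
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
        [=](RAJA::Index_type i) {
          sum += a[i];
        });
      return sum.get();
    }

    // "new" tuning style: a plain local variable is registered with
    // RAJA::expt::Reduce and the lambda takes an extra reference
    // parameter; forall folds the partial results back into tsum,
    // combining them with its initial value.
    double sum_reduce_param(const double* a, int n)
    {
      double tsum = 0.0;
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
        RAJA::expt::Reduce<RAJA::operators::plus>(&tsum),
        [=](RAJA::Index_type i, double& sum) {
          sum += a[i];
        });
      return tsum;
    }

The second form is what the RAJA_* tune_idx == 1 ("new") paths in this series use, with device lambdas and GPU execution policies substituted on the CUDA and HIP backends.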
--- src/basic/PI_REDUCE-OMP.cpp | 6 ++-- src/basic/PI_REDUCE-OMPTarget.cpp | 59 ++++++++++++++++++++++++------- src/basic/PI_REDUCE-Seq.cpp | 6 ++-- src/basic/PI_REDUCE.hpp | 1 + 4 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index e95aa68fc..acc823d16 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -135,10 +135,8 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) void PI_REDUCE::setOpenMPTuningDefinitions(VariantID vid) { - if (vid == Base_OpenMP || vid == Lambda_OpenMP) { - addVariantTuningName(vid, "default"); - } else if (vid == RAJA_OpenMP) { - addVariantTuningName(vid, "default"); + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { addVariantTuningName(vid, "new"); } } diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 47b64fec6..538ae3b46 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,48 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum pi(m_pi_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - PI_REDUCE_BODY; - }); + RAJA::ReduceSum pi(m_pi_init); - m_pi = 4.0 * pi.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + PI_REDUCE_BODY; + }); - } - stopTimer(); + m_pi = 4.0 * pi.get(); + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tpi = m_pi_init; +#if 0 + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); +#endif + + m_pi = static_cast(tpi) * 4.0; + + } + stopTimer(); + + } } else { getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +105,14 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +void PI_REDUCE::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 2e9ad6432..8c6f580b2 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -128,10 +128,8 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) void PI_REDUCE::setSeqTuningDefinitions(VariantID vid) { - if (vid == Base_Seq || vid == Lambda_Seq) { - addVariantTuningName(vid, "default"); - } else if (vid == RAJA_Seq) { - addVariantTuningName(vid, "default"); + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { addVariantTuningName(vid, "new"); } } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 34b263f54..6d2d60428 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -62,6 +62,7 @@ class PI_REDUCE : 
public KernelBase void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > From 5cc4444e7a9592dd20fdfe96fb928b45fbe79bd3 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 10:25:24 -0700 Subject: [PATCH 396/454] Re-adding new reduction version. Correct results with two blueos compilers. --- src/basic/PI_REDUCE-OMPTarget.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index 538ae3b46..bc9c00a9a 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -82,7 +82,6 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Real_type tpi = m_pi_init; -#if 0 RAJA::forall>( RAJA::RangeSegment(ibegin, iend), RAJA::expt::Reduce(&tpi), @@ -90,7 +89,6 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) PI_REDUCE_BODY; } ); -#endif m_pi = static_cast(tpi) * 4.0; @@ -108,7 +106,7 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) void PI_REDUCE::setOpenMPTargetTuningDefinitions(VariantID vid) { addVariantTuningName(vid, "default"); - if (vid == RAJA_OpenMP) { + if (vid == RAJA_OpenMPTarget) { addVariantTuningName(vid, "new"); } } From 8c2636d01b97319ae86e51eec3324fe15e3169c9 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 12:37:05 -0700 Subject: [PATCH 397/454] Add new reducer tunings. --- src/basic/PI_REDUCE-OMPTarget.cpp | 31 ++++++++-------- src/basic/TRAP_INT-Cuda.cpp | 62 +++++++++++++++++++++++++++++++ src/basic/TRAP_INT-Hip.cpp | 60 ++++++++++++++++++++++++++++++ src/basic/TRAP_INT-OMP.cpp | 54 ++++++++++++++++++++++----- src/basic/TRAP_INT-OMPTarget.cpp | 57 ++++++++++++++++++++++------ src/basic/TRAP_INT-Seq.cpp | 54 ++++++++++++++++++++++----- src/basic/TRAP_INT.hpp | 13 +++++-- 7 files changed, 282 insertions(+), 49 deletions(-) diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index bc9c00a9a..cf9d2ce85 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -74,28 +74,29 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } + } - if (tune_idx == 1) { + if (tune_idx == 1) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type tpi = m_pi_init; - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - RAJA::expt::Reduce(&tpi), - [=] (Index_type i, Real_type& pi) { - PI_REDUCE_BODY; - } - ); + Real_type tpi = m_pi_init; - m_pi = static_cast(tpi) * 4.0; + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tpi), + [=] (Index_type i, Real_type& pi) { + PI_REDUCE_BODY; + } + ); - } - stopTimer(); + m_pi = static_cast(tpi) * 4.0; } + stopTimer(); + + } } else { getCout() << "\n PI_REDUCE : Unknown OMP Target variant id = " << vid << std::endl; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index fea2de43e..373364420 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -151,6 +151,47 @@ void TRAP_INT::runCudaVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper 
> +void TRAP_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -193,6 +234,19 @@ void TRAP_INT::runCudaVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + t += 1; + } }); @@ -227,6 +281,7 @@ void TRAP_INT::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -238,6 +293,13 @@ void TRAP_INT::setCudaTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index 62cd3a2d0..c671f4673 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -151,6 +151,47 @@ void TRAP_INT::runHipVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void TRAP_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + TRAP_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] __device__ (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); + + } else { + getCout() << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -193,6 +234,19 @@ void TRAP_INT::runHipVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + t += 1; + } }); @@ -238,6 +292,12 @@ void TRAP_INT::setHipTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + 
addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + } }); diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index d21542999..7c3ef3437 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -20,7 +20,7 @@ namespace basic { -void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -79,20 +79,46 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); - m_sumx += static_cast(sumx.get()) * h; + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); } - stopTimer(); break; } @@ -108,5 +134,13 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i #endif } +void TRAP_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index a1a88e30a..0bd9155b2 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -29,7 +29,7 @@ namespace basic const size_t threads_per_team = 256; -void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -46,7 +46,8 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( Real_type sumx = m_sumx_init; - #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ + #pragma omp target teams distribute parallel for \ + map(tofrom: sumx) reduction(+:sumx) \ thread_limit(threads_per_team) schedule(static, 1) for (Index_type i = ibegin; i < iend; ++i ) { @@ -62,26 +63,60 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + + } + + if 
(tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); } - stopTimer(); } else { getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; } } +void TRAP_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 5aa253dec..3d81b2b56 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -20,7 +20,7 @@ namespace basic { -void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -76,20 +76,46 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sumx(m_sumx_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; - m_sumx += static_cast(sumx.get()) * h; + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsumx = m_sumx_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsumx), + [=] (Index_type i, Real_type& sumx) { + TRAP_INT_BODY; + } + ); + + m_sumx += static_cast(tsumx) * h; + + } + stopTimer(); } - stopTimer(); break; } @@ -103,5 +129,13 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) } +void TRAP_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 881e9ed66..4a40ca84a 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -71,19 +71,26 @@ class TRAP_INT : public KernelBase void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void 
runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); From 0956828bc73412903cd3c558a9654ae17f779e80 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 13:35:02 -0700 Subject: [PATCH 398/454] Add new reduction tunings for REDUCE3_INT kernel. Quiet compiler warnings. --- src/basic/PI_REDUCE-Seq.cpp | 3 ++ src/basic/REDUCE3_INT-Cuda.cpp | 68 +++++++++++++++++++++++++++ src/basic/REDUCE3_INT-Hip.cpp | 66 +++++++++++++++++++++++++- src/basic/REDUCE3_INT-OMP.cpp | 65 ++++++++++++++++++++----- src/basic/REDUCE3_INT-OMPTarget.cpp | 73 +++++++++++++++++++++++------ src/basic/REDUCE3_INT-Seq.cpp | 71 ++++++++++++++++++++++------ src/basic/REDUCE3_INT.hpp | 13 +++-- src/basic/TRAP_INT-Seq.cpp | 3 ++ 8 files changed, 316 insertions(+), 46 deletions(-) diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index 8c6f580b2..ee3dbae5e 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -20,6 +20,9 @@ namespace basic void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index 015693eee..a66e45e28 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -165,6 +165,53 @@ void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -207,6 +254,19 @@ void REDUCE3_INT::runCudaVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + t += 1; + } }); @@ -241,6 +301,7 @@ void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ 
std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -252,6 +313,13 @@ void REDUCE3_INT::setCudaTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 460b68a2d..1984aa499 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -165,6 +165,53 @@ void REDUCE3_INT::runHipVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE3_INT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE3_INT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=] __device__ (Index_type i, + Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -207,6 +254,18 @@ void REDUCE3_INT::runHipVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -252,6 +311,12 @@ void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + } }); @@ -261,7 +326,6 @@ void REDUCE3_INT::setHipTuningDefinitions(VariantID vid) }); } - } } // end namespace basic diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 32658fcf1..89a6eeea8 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -91,22 +91,53 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), 
[=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + if (tune_idx == 1) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } } stopTimer(); @@ -124,5 +155,13 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun #endif } +void REDUCE3_INT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index a625ef842..5b4a01060 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -62,31 +62,74 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); } - stopTimer(); + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + 
RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + } + stopTimer(); } else { getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE3_INT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp index a5f5965a2..9ea03f0d9 100644 --- a/src/basic/REDUCE3_INT-Seq.cpp +++ b/src/basic/REDUCE3_INT-Seq.cpp @@ -19,8 +19,11 @@ namespace basic { -void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE3_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -84,24 +87,56 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + } - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); + if (tune_idx == 1) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + Int_type tvsum = m_vsum_init; + Int_type tvmin = m_vmin_init; + Int_type tvmax = m_vmax_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tvsum), + RAJA::expt::Reduce(&tvmin), + RAJA::expt::Reduce(&tvmax), + [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) { + REDUCE3_INT_BODY; + } + ); + + m_vsum += static_cast(tvsum); + m_vmin = RAJA_MIN(m_vmin, static_cast(tvmin)); + m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); + + } + stopTimer(); } - stopTimer(); break; } @@ -115,5 +150,13 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i } +void REDUCE3_INT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 22bb62617..c869b766d 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -74,19 +74,26 @@ class REDUCE3_INT : public KernelBase void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void 
setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 3d81b2b56..6c75ffb04 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -22,6 +22,9 @@ namespace basic void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); From 8b6ff5e59c889d9729eef4bf99af426166aab4d8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 14:50:12 -0700 Subject: [PATCH 399/454] Code formatting for consistency. --- src/basic/PI_REDUCE-Cuda.cpp | 3 +-- src/basic/TRAP_INT-Cuda.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 8c2ebdb8e..2d8555e46 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -166,8 +166,7 @@ void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid) Real_type tpi = m_pi_init; - RAJA::forall< exec_policy >( - res, + RAJA::forall< exec_policy >( res, RAJA::RangeSegment(ibegin, iend), RAJA::expt::Reduce(&tpi), [=] __device__ (Index_type i, Real_type& pi) { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index 373364420..f771b6bae 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -173,8 +173,7 @@ void TRAP_INT::runCudaVariantRAJANewReduce(VariantID vid) Real_type tsumx = m_sumx_init; - RAJA::forall( - res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), RAJA::expt::Reduce(&tsumx), [=] __device__ (Index_type i, Real_type& sumx) { From 957f19893c38307838ee1fd30915ff591b422b44 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 14:50:34 -0700 Subject: [PATCH 400/454] Add new reduction tunings of REDUCE_STRUCT kernel --- src/basic/REDUCE_STRUCT-Cuda.cpp | 80 ++++++++++++ src/basic/REDUCE_STRUCT-Hip.cpp | 81 +++++++++++- src/basic/REDUCE_STRUCT-OMP.cpp | 102 +++++++++++---- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 171 +++++++++++++++++--------- src/basic/REDUCE_STRUCT-Seq.cpp | 104 ++++++++++++---- src/basic/REDUCE_STRUCT.hpp | 13 +- 6 files changed, 441 insertions(+), 110 deletions(-) diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index 2c20b2488..ff6498c1c 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -207,6 +207,65 @@ void 
REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown CUDA variant id = " << vid << std::endl; + } + +} + void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -249,6 +308,19 @@ void REDUCE_STRUCT::runCudaVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + t += 1; + } }); @@ -283,6 +355,7 @@ void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -294,6 +367,13 @@ void REDUCE_STRUCT::setCudaTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index db31819f5..cb0f7ad32 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -141,7 +141,7 @@ void REDUCE_STRUCT::runHipVariantBase(VariantID vid) points.SetXMax(rmem[2]); points.SetYMin(rmem[4]); points.SetYMax(rmem[5]); - m_points=points; + m_points = points; } stopTimer(); @@ -196,7 +196,7 @@ void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) points.SetXMax((xmax.get())); points.SetYMin((ymin.get())); points.SetYMax((ymax.get())); - m_points=points; + m_points = points; } stopTimer(); @@ -207,6 +207,65 @@ void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void REDUCE_STRUCT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + 
RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_STRUCT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=] __device__ (Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } else { + getCout() << "\n REDUCE_STRUCT : Unknown HIP variant id = " << vid << std::endl; + } + +} + void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -249,6 +308,18 @@ void REDUCE_STRUCT::runHipVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -294,6 +365,12 @@ void REDUCE_STRUCT::setHipTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + } }); diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 730134b95..24ee2ea30 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -55,7 +55,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -100,7 +100,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -110,31 +110,75 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter((xsum.get()/(points.N)), - (ysum.get()/(points.N))); - 
points.SetXMin((xmin.get())); - points.SetXMax((xmax.get())); - points.SetYMin((ymin.get())); - points.SetYMax((ymax.get())); - m_points=points; + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter((xsum.get()/(points.N)), + (ysum.get()/(points.N))); + points.SetXMin((xmin.get())); + points.SetXMax((xmax.get())); + points.SetYMin((ymin.get())); + points.SetYMax((ymax.get())); + m_points = points; + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); } - stopTimer(); break; } @@ -150,5 +194,13 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t #endif } +void REDUCE_STRUCT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp index 0617ccaca..d6ad3fc78 100644 --- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp +++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp @@ -27,8 +27,7 @@ namespace basic const size_t threads_per_team = 256; -void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, - size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -36,80 +35,142 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, REDUCE_STRUCT_DATA_SETUP; - if ( vid == Base_OpenMPTarget ) { + switch ( vid ) { - Real_ptr xa = points.x; - Real_ptr ya = points.y; + case Base_OpenMPTarget : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Real_ptr xa = points.x; + Real_ptr ya = points.y; - Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; - Real_type xmin = m_init_min; Real_type ymin = m_init_min; - Real_type xmax = m_init_max; Real_type ymax = m_init_max; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(xa, ya) device( did ) map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = 
m_init_max; Real_type ymax = m_init_max; + + #pragma omp target is_device_ptr(xa, ya) device( did ) \ + map(tofrom:xsum, xmin, xmax, ysum, ymin, ymax) + #pragma omp teams distribute parallel for \ + thread_limit(threads_per_team) schedule(static,1) \ reduction(+:xsum) \ reduction(min:xmin) \ reduction(max:xmax), \ reduction(+:ysum), \ reduction(min:ymin), \ reduction(max:ymax) - for (Index_type i = ibegin; i < iend; ++i ) { - xsum += xa[i] ; - xmin = RAJA_MIN(xmin, xa[i]) ; - xmax = RAJA_MAX(xmax, xa[i]) ; - ysum += ya[i] ; - ymin = RAJA_MIN(ymin, ya[i]) ; - ymax = RAJA_MAX(ymax, ya[i]) ; - } + for (Index_type i = ibegin; i < iend; ++i ) { + xsum += xa[i] ; + xmin = RAJA_MIN(xmin, xa[i]) ; + xmax = RAJA_MAX(xmax, xa[i]) ; + ysum += ya[i] ; + ymin = RAJA_MIN(ymin, ya[i]) ; + ymax = RAJA_MAX(ymax, ya[i]) ; + } + + points.SetCenter(xsum/points.N, ysum/points.N); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points = points; - points.SetCenter(xsum/points.N, ysum/points.N); - points.SetXMin(xmin); - points.SetXMax(xmax); - points.SetYMin(ymin); - points.SetYMax(ymax); - m_points=points; + } + stopTimer(); + break; } - stopTimer(); - - } else if ( vid == RAJA_OpenMPTarget ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + case RAJA_OpenMPTarget : { + + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); + + } + + break; } - 
stopTimer(); - } else { + default: getCout() << "\n REDUCE_STRUCT : Unknown OMP Target variant id = " << vid << std::endl; } } +void REDUCE_STRUCT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index b83722a3e..7edf3ab48 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -19,8 +19,11 @@ namespace basic { -void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -47,7 +50,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -87,7 +90,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune points.SetXMax(xmax); points.SetYMin(ymin); points.SetYMax(ymax); - m_points=points; + m_points = points; } stopTimer(); @@ -97,31 +100,75 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum xsum(m_init_sum); + RAJA::ReduceSum ysum(m_init_sum); + RAJA::ReduceMin xmin(m_init_min); + RAJA::ReduceMin ymin(m_init_min); + RAJA::ReduceMax xmax(m_init_max); + RAJA::ReduceMax ymax(m_init_max); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE_STRUCT_BODY_RAJA; + }); + + points.SetCenter(xsum.get()/(points.N), + ysum.get()/(points.N)); + points.SetXMin(xmin.get()); + points.SetXMax(xmax.get()); + points.SetYMin(ymin.get()); + points.SetYMax(ymax.get()); + m_points = points; - RAJA::ReduceSum xsum(m_init_sum); - RAJA::ReduceSum ysum(m_init_sum); - RAJA::ReduceMin xmin(m_init_min); - RAJA::ReduceMin ymin(m_init_min); - RAJA::ReduceMax xmax(m_init_max); - RAJA::ReduceMax ymax(m_init_max); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - REDUCE_STRUCT_BODY_RAJA; - }); - - points.SetCenter(xsum.get()/(points.N), - ysum.get()/(points.N)); - points.SetXMin(xmin.get()); - points.SetXMax(xmax.get()); - points.SetYMin(ymin.get()); - points.SetYMax(ymax.get()); - m_points=points; + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type txsum = m_init_sum; + Real_type tysum = m_init_sum; + Real_type txmin = m_init_min; + Real_type tymin = m_init_min; + Real_type txmax = m_init_max; + Real_type tymax = m_init_max; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&txsum), + RAJA::expt::Reduce(&tysum), + RAJA::expt::Reduce(&txmin), + RAJA::expt::Reduce(&tymin), + RAJA::expt::Reduce(&txmax), + RAJA::expt::Reduce(&tymax), + [=](Index_type i, Real_type& xsum, Real_type& ysum, + Real_type& xmin, Real_type& ymin, + Real_type& xmax, Real_type& ymax) { + REDUCE_STRUCT_BODY; + } + ); + + points.SetCenter(static_cast(txsum)/(points.N), + static_cast(tysum)/(points.N)); + 
points.SetXMin(static_cast(txmin)); + points.SetXMax(static_cast(txmax)); + points.SetYMin(static_cast(tymin)); + points.SetYMax(static_cast(tymax)); + m_points = points; + + } + stopTimer(); } - stopTimer(); break; } @@ -132,7 +179,14 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune } } +} +void REDUCE_STRUCT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } } } // end namespace basic diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index f3bdd8a16..e6f6e56c9 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -87,18 +87,25 @@ class REDUCE_STRUCT : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); struct PointsType { Index_type N; From b0580510e297bb5f7396ced7a243bc0b6bd45db4 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 14:57:31 -0700 Subject: [PATCH 401/454] Quiet compiler warnings for corner case. 
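The corner case: when RAJA_ENABLE_OPENMP (or RUN_OPENMP) is not defined, the preprocessor strips the variant bodies, so the tune_idx parameter added earlier in this series is never read and trips unused-parameter warnings in -Werror builds. A minimal sketch of the guard pattern these diffs apply; the free function and int parameter are simplified stand-ins for the kernel methods:

    #include "RAJA/RAJA.hpp"  // provides the RAJA_UNUSED_VAR macro
    #include <cstdio>
    #include <cstddef>

    void runOpenMPVariantSketch(int vid, std::size_t tune_idx)
    {
    #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)
      // The real methods dispatch on vid and tune_idx here.
      std::printf("variant %d, tuning %zu\n", vid, tune_idx);
    #else
      // Disabled build: the parameters are unreferenced, so mark them
      // explicitly to keep the compiler quiet.
      RAJA_UNUSED_VAR(vid);
      RAJA_UNUSED_VAR(tune_idx);
    #endif
    }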
--- src/basic/PI_REDUCE-OMP.cpp | 1 + src/basic/REDUCE3_INT-OMP.cpp | 1 + src/basic/REDUCE_STRUCT-OMP.cpp | 1 + src/basic/TRAP_INT-OMP.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index acc823d16..f4b73db50 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -130,6 +130,7 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 89a6eeea8..a948a242f 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -152,6 +152,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp index 24ee2ea30..7e593db72 100644 --- a/src/basic/REDUCE_STRUCT-OMP.cpp +++ b/src/basic/REDUCE_STRUCT-OMP.cpp @@ -191,6 +191,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t tune_idx) #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 7c3ef3437..1d867335f 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -131,6 +131,7 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } From f5d2e3b5325c15111010b7b88e4cdfc1baa2de8a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 3 Jul 2024 15:25:17 -0700 Subject: [PATCH 402/454] Add new reduce tuning for DOT kernel --- src/stream/DOT-Cuda.cpp | 61 +++++++++++++++++++++++ src/stream/DOT-Hip.cpp | 58 ++++++++++++++++++++++ src/stream/DOT-OMP.cpp | 55 +++++++++++++++++---- src/stream/DOT-OMPTarget.cpp | 95 ++++++++++++++++++++++++++---------- src/stream/DOT-Seq.cpp | 56 +++++++++++++++++---- src/stream/DOT.hpp | 13 +++-- 6 files changed, 290 insertions(+), 48 deletions(-) diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 45a2a5a8d..042c949b5 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -140,6 +140,46 @@ void DOT::runCudaVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void DOT::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tdot = m_dot_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tdot), + [=] __device__ (Index_type i, Real_type& dot) { + DOT_BODY; + } + ); + + m_dot += static_cast(tdot); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + } +} + void DOT::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -182,6 +222,19 @@ void DOT::runCudaVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + + } + + t += 1; + } }); @@ -216,6 +269,7 @@ void 
DOT::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -227,6 +281,13 @@ void DOT::setCudaTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 0badd32fb..5497d0480 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -140,6 +140,46 @@ void DOT::runHipVariantRAJA(VariantID vid) } } +template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +void DOT::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + DOT_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tdot = m_dot_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tdot), + [=] __device__ (Index_type i, Real_type& dot) { + DOT_BODY; + } + ); + + m_dot += static_cast(tdot); + + } + stopTimer(); + + } else { + getCout() << "\n DOT : Unknown HIP variant id = " << vid << std::endl; + } +} + void DOT::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -182,6 +222,18 @@ void DOT::runHipVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -226,6 +278,12 @@ void DOT::setHipTuningDefinitions(VariantID vid) std::to_string(block_size)); }); + + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); } diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 59e4cdf22..295437c1f 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -18,7 +18,7 @@ namespace stream { -void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -76,20 +76,46 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum dot(m_dot_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DOT_BODY; - }); + RAJA::ReduceSum dot(m_dot_init); - m_dot += dot; + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DOT_BODY; + }); + + m_dot += dot; + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tdot = m_dot_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + 
RAJA::expt::Reduce(&tdot), + [=] (Index_type i, Real_type& dot) { + DOT_BODY; + } + ); + + m_dot += static_cast(tdot); + + } + stopTimer(); } - stopTimer(); break; } @@ -102,8 +128,17 @@ void DOT::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void DOT::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index f9a049770..123442c4f 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -26,7 +26,7 @@ namespace stream // const size_t threads_per_team = 256; -void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void DOT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -34,44 +34,89 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ DOT_DATA_SETUP; - if ( vid == Base_OpenMPTarget ) { + switch ( vid ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + case Base_OpenMPTarget : { - Real_type dot = m_dot_init; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot) - #pragma omp teams distribute parallel for reduction(+:dot) \ - thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - DOT_BODY; - } + Real_type dot = m_dot_init; + + #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot) + #pragma omp teams distribute parallel for reduction(+:dot) \ + thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + DOT_BODY; + } + + m_dot += dot; - m_dot += dot; + } + stopTimer(); + break; } - stopTimer(); - } else if ( vid == RAJA_OpenMPTarget ) { + case RAJA_OpenMPTarget : { + + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 1) { - RAJA::ReduceSum dot(m_dot_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DOT_BODY; - }); + Real_type tdot = m_dot_init; - m_dot += static_cast(dot.get()); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tdot), + [=] (Index_type i, Real_type& dot) { + DOT_BODY; + } + ); + m_dot += static_cast(tdot); + + } + stopTimer(); + + } + + break; + } + + default : { + getCout() << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; } - stopTimer(); - } else { - getCout() << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; + } + +} + +void DOT::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); } } diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index 715d99cce..c639f848e 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -18,8 +18,11 @@ namespace stream { 
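// Sketch: as in the OMP and OMPTarget files above, the RAJA_Seq path in
// this file gains a second tuning. Tuning 0 keeps the classic
// reducer-object interface; tuning 1 switches to the experimental
// RAJA::expt::Reduce parameter-pack interface, which accumulates into a
// plain stack scalar. Core pattern of tuning 1, assuming the usual
// sequential policy and plus operator (the template arguments shown are a
// reconstruction, not verbatim from this patch):
//
//   Real_type tdot = m_dot_init;
//   RAJA::forall<RAJA::seq_exec>(
//       RAJA::RangeSegment(ibegin, iend),
//       RAJA::expt::Reduce<RAJA::operators::plus>(&tdot),
//       [=](Index_type i, Real_type& dot) { DOT_BODY; });  // dot += a[i]*b[i]
//   m_dot += static_cast<Real_type>(tdot);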
-void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void DOT::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -73,20 +76,45 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum dot(m_dot_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DOT_BODY; - }); + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DOT_BODY; + }); - m_dot += static_cast(dot.get()); + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tdot = m_dot_init; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tdot), + [=] (Index_type i, Real_type& dot) { + DOT_BODY; + } + ); + + m_dot += static_cast(tdot); + + } + stopTimer(); } - stopTimer(); break; } @@ -100,5 +128,13 @@ void DOT::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) } +void DOT::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index f2cd455be..ea7b20d5c 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -55,19 +55,26 @@ class DOT : public KernelBase void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); From f84fb5b292d255e7d7028ca38c5007c03602e551 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 8 Jul 2024 14:55:59 -0700 Subject: [PATCH 403/454] Add new reduction "tunings" for FIRST_MIN kernel --- src/lcals/FIRST_MIN-Cuda.cpp | 58 +++++++++++++++++++++++++++++ src/lcals/FIRST_MIN-Hip.cpp | 56 ++++++++++++++++++++++++++++ src/lcals/FIRST_MIN-OMP.cpp | 59 ++++++++++++++++++++++++------ src/lcals/FIRST_MIN-OMPTarget.cpp | 58 +++++++++++++++++++++++------ src/lcals/FIRST_MIN-Seq.cpp | 61 
+++++++++++++++++++++++++------ src/lcals/FIRST_MIN.hpp | 11 +++++- 6 files changed, 268 insertions(+), 35 deletions(-) diff --git a/src/lcals/FIRST_MIN-Cuda.cpp b/src/lcals/FIRST_MIN-Cuda.cpp index f2433183d..0d590d89d 100644 --- a/src/lcals/FIRST_MIN-Cuda.cpp +++ b/src/lcals/FIRST_MIN-Cuda.cpp @@ -151,6 +151,48 @@ void FIRST_MIN::runCudaVariantRAJA(VariantID vid) } } +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Cuda variant id = " << vid << std::endl; + } +} + void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -188,6 +230,16 @@ void FIRST_MIN::runCudaVariant(VariantID vid, size_t tune_idx) t += 1; + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -222,6 +274,7 @@ void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -231,6 +284,11 @@ void FIRST_MIN::setCudaTuningDefinitions(VariantID vid) decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/lcals/FIRST_MIN-Hip.cpp b/src/lcals/FIRST_MIN-Hip.cpp index fde3100e1..d9cd7d81c 100644 --- a/src/lcals/FIRST_MIN-Hip.cpp +++ b/src/lcals/FIRST_MIN-Hip.cpp @@ -151,6 +151,48 @@ void FIRST_MIN::runHipVariantRAJA(VariantID vid) } } +template < size_t block_size, typename MappingHelper > +void FIRST_MIN::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + FIRST_MIN_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=] __device__ (Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); + + } else { + getCout() << "\n FIRST_MIN : Unknown Hip variant id = " << vid << std::endl; + } +} + void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -188,6 +230,16 @@ void FIRST_MIN::runHipVariant(VariantID vid, size_t tune_idx) t += 1; + 
if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -231,6 +283,10 @@ void FIRST_MIN::setHipTuningDefinitions(VariantID vid) decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + } }); diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 7176bcacf..95affda9a 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -18,7 +18,7 @@ namespace lcals { -void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -87,21 +87,49 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); - m_minloc = loc.getLoc(); + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=](Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); } - stopTimer(); break; } @@ -114,8 +142,17 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void FIRST_MIN::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index dd39908d8..eb8d6035b 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace lcals const size_t threads_per_team = 256; -void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -60,21 +60,49 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); - m_minloc = loc.getLoc(); + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } + 
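// Sketch of the tuning added just below (tune_idx == 1): RAJA::expt::ValLoc
// packs the running minimum together with its index, so one reduction
// carries both the value and its location. Assuming ValLoc's usual pair of
// type parameters (reconstructed here, not verbatim from this patch):
//
//   using VL_TYPE = RAJA::expt::ValLoc<Real_type, Index_type>;
//   VL_TYPE tloc(m_xmin_init, m_initloc);
//   RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
//       RAJA::RangeSegment(ibegin, iend),
//       RAJA::expt::Reduce<RAJA::operators::minimum>(&tloc),
//       [=](Index_type i, VL_TYPE& loc) { loc.min(x[i], i); });
//   m_minloc = static_cast<Index_type>(tloc.getLoc());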
+ if (tune_idx == 1) { + + using VL_TYPE = RAJA::expt::ValLoc; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + VL_TYPE tloc(m_xmin_init, m_initloc); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=](Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); } - stopTimer(); } else { getCout() << "\n FIRST_MIN : Unknown OMP Target variant id = " << vid << std::endl; @@ -82,6 +110,14 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG } +void FIRST_MIN::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index 3ac609753..aa0c0452a 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -18,8 +18,11 @@ namespace lcals { -void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void FIRST_MIN::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -76,21 +79,49 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = loc.getLoc(); + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + using VL_TYPE = RAJA::expt::ValLoc; - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); + VL_TYPE tloc(m_xmin_init, m_initloc); - m_minloc = loc.getLoc(); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tloc), + [=](Index_type i, VL_TYPE& loc) { + loc.min(x[i], i); + } + ); + + m_minloc = static_cast(tloc.getLoc()); + + } + stopTimer(); } - stopTimer(); break; } @@ -104,5 +135,13 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx } +void FIRST_MIN::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index c6161447f..1660739fb 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -83,19 +83,26 @@ class FIRST_MIN : public KernelBase void runKokkosVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); template < size_t block_size, typename MappingHelper > - void 
runHipVariantBase(VariantID vid); + void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > - void runCudaVariantRAJA(VariantID vid); + void runHipVariantBase(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > void runSyclVariantImpl(VariantID vid); From 2b404ab6c65914855b381cc3278e37d553b2d95a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 8 Jul 2024 14:56:36 -0700 Subject: [PATCH 404/454] Add new reduction variants of REDUCE_SUM kernel --- src/algorithm/REDUCE_SUM-Cuda.cpp | 61 ++++++++++++++++++++++++++ src/algorithm/REDUCE_SUM-Hip.cpp | 60 +++++++++++++++++++++++++ src/algorithm/REDUCE_SUM-OMP.cpp | 58 +++++++++++++++++++----- src/algorithm/REDUCE_SUM-OMPTarget.cpp | 58 +++++++++++++++++++----- src/algorithm/REDUCE_SUM-Seq.cpp | 56 ++++++++++++++++++----- src/algorithm/REDUCE_SUM.hpp | 15 +++++-- 6 files changed, 272 insertions(+), 36 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp index 6f79928e2..9d9a00f77 100644 --- a/src/algorithm/REDUCE_SUM-Cuda.cpp +++ b/src/algorithm/REDUCE_SUM-Cuda.cpp @@ -218,6 +218,49 @@ void REDUCE_SUM::runCudaVariantRAJA(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runCudaVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getCudaResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_CUDA ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Cuda variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -272,6 +315,16 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -312,6 +365,7 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ decltype(mapping_helper)::get_name()+"_"+ std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } else if ( vid == RAJA_CUDA ) { @@ -323,6 +377,13 @@ void REDUCE_SUM::setCudaTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp index 9999ea674..e433fa263 100644 --- a/src/algorithm/REDUCE_SUM-Hip.cpp +++ b/src/algorithm/REDUCE_SUM-Hip.cpp @@ -245,6 +245,49 @@ void 
REDUCE_SUM::runHipVariantRAJA(VariantID vid) } +template < size_t block_size, typename MappingHelper > +void REDUCE_SUM::runHipVariantRAJANewReduce(VariantID vid) +{ + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getHipResource()}; + + REDUCE_SUM_DATA_SETUP; + + if ( vid == RAJA_HIP ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] __device__ (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + + getCout() << "\n REDUCE_SUM : Unknown Hip variant id = " << vid << std::endl; + + } + +} + void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; @@ -299,6 +342,16 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx) }); + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantRAJANewReduce(vid); + + } + + t += 1; + } }); @@ -354,6 +407,13 @@ void REDUCE_SUM::setHipTuningDefinitions(VariantID vid) }); + auto algorithm_helper = gpu_algorithm::block_device_helper{}; + + addVariantTuningName(vid, decltype(algorithm_helper)::get_name()+"_"+ + decltype(mapping_helper)::get_name()+"_"+ + "new_"+std::to_string(block_size)); + RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning + } }); diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp index ae5cc130c..ef352453b 100644 --- a/src/algorithm/REDUCE_SUM-OMP.cpp +++ b/src/algorithm/REDUCE_SUM-OMP.cpp @@ -18,7 +18,7 @@ namespace algorithm { -void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t tune_idx) { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -76,21 +76,48 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune case RAJA_OpenMP : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sum(m_sum_init); + RAJA::ReduceSum sum(m_sum_init); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_SUM_BODY; - }); + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); - m_sum = sum.get(); + m_sum = sum.get(); + + } + stopTimer(); } - stopTimer(); + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } + break; } @@ -103,8 +130,17 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune #else RAJA_UNUSED_VAR(vid); + RAJA_UNUSED_VAR(tune_idx); #endif } +void REDUCE_SUM::setOpenMPTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMP) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp index 394f71b07..d01a2976e 100644 --- 
a/src/algorithm/REDUCE_SUM-OMPTarget.cpp +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -27,7 +27,7 @@ namespace algorithm const size_t threads_per_team = 256; -void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -56,21 +56,47 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } else if ( vid == RAJA_OpenMPTarget ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { - RAJA::ReduceSum sum(m_sum_init); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_SUM_BODY; - }); + RAJA::ReduceSum sum(m_sum_init); - m_sum = sum.get(); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); - } - stopTimer(); + m_sum = sum.get(); + + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } } else { getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; @@ -78,6 +104,14 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_AR } +void REDUCE_SUM::setOpenMPTargetTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_OpenMPTarget) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp index 9223c3ac5..27c2d7f78 100644 --- a/src/algorithm/REDUCE_SUM-Seq.cpp +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -18,8 +18,11 @@ namespace algorithm { -void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +void REDUCE_SUM::runSeqVariant(VariantID vid, size_t tune_idx) { +#if !defined(RUN_RAJA_SEQ) + RAJA_UNUSED_VAR(tune_idx); +#endif const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); @@ -73,20 +76,45 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id case RAJA_Seq : { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if (tune_idx == 0) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sum(m_sum_init); - RAJA::ReduceSum sum(m_sum_init); + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE_SUM_BODY; + }); - RAJA::forall( RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE_SUM_BODY; - }); + m_sum = sum.get(); - m_sum = sum.get(); + } + stopTimer(); + + } + + if (tune_idx == 1) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); } - stopTimer(); break; } @@ -100,5 +128,13 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id } +void 
REDUCE_SUM::setSeqTuningDefinitions(VariantID vid) +{ + addVariantTuningName(vid, "default"); + if (vid == RAJA_Seq) { + addVariantTuningName(vid, "new"); + } +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 716794930..3bbc0e2aa 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -59,18 +59,27 @@ class REDUCE_SUM : public KernelBase void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void setSeqTuningDefinitions(VariantID vid); + void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void runCudaVariantCub(VariantID vid); - void runHipVariantRocprim(VariantID vid); template < size_t block_size, typename MappingHelper > void runCudaVariantBase(VariantID vid); - template < size_t block_size, typename MappingHelper > - void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runCudaVariantRAJANewReduce(VariantID vid); + + void runHipVariantRocprim(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); + template < size_t block_size, typename MappingHelper > + void runHipVariantRAJANewReduce(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 61f240e12c6e9029055e72a63c4ec306708093d8 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 8 Jul 2024 15:27:57 -0700 Subject: [PATCH 405/454] Remove unnecessary template parameters --- src/basic/PI_REDUCE-Cuda.cpp | 5 +---- src/basic/PI_REDUCE-Hip.cpp | 5 +---- src/basic/PI_REDUCE.hpp | 4 ++-- src/basic/REDUCE3_INT-Cuda.cpp | 5 +---- src/basic/REDUCE3_INT-Hip.cpp | 4 +--- src/basic/REDUCE3_INT.hpp | 4 ++-- src/basic/REDUCE_STRUCT-Cuda.cpp | 5 +---- src/basic/REDUCE_STRUCT-Hip.cpp | 4 +--- src/basic/REDUCE_STRUCT.hpp | 4 ++-- src/basic/TRAP_INT-Cuda.cpp | 5 +---- src/basic/TRAP_INT-Hip.cpp | 5 +---- src/basic/TRAP_INT.hpp | 4 ++-- src/stream/DOT-Cuda.cpp | 5 +---- src/stream/DOT-Hip.cpp | 4 +--- src/stream/DOT.hpp | 4 ++-- 15 files changed, 20 insertions(+), 47 deletions(-) diff --git a/src/basic/PI_REDUCE-Cuda.cpp b/src/basic/PI_REDUCE-Cuda.cpp index 2d8555e46..19f338175 100644 --- a/src/basic/PI_REDUCE-Cuda.cpp +++ b/src/basic/PI_REDUCE-Cuda.cpp @@ -144,7 +144,7 @@ void PI_REDUCE::runCudaVariantRAJA(VariantID vid) } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void PI_REDUCE::runCudaVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/PI_REDUCE-Hip.cpp b/src/basic/PI_REDUCE-Hip.cpp index 0050f7548..50f1a632d 100644 --- a/src/basic/PI_REDUCE-Hip.cpp +++ b/src/basic/PI_REDUCE-Hip.cpp @@ -143,7 +143,7 @@ void PI_REDUCE::runHipVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void PI_REDUCE::runHipVariantRAJANewReduce(VariantID vid) { using 
exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index 6d2d60428..ca6860350 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -69,14 +69,14 @@ class PI_REDUCE : public KernelBase void runCudaVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp index a66e45e28..faf2d52a4 100644 --- a/src/basic/REDUCE3_INT-Cuda.cpp +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -165,7 +165,7 @@ void REDUCE3_INT::runCudaVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void REDUCE3_INT::runCudaVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/REDUCE3_INT-Hip.cpp b/src/basic/REDUCE3_INT-Hip.cpp index 1984aa499..85140f46f 100644 --- a/src/basic/REDUCE3_INT-Hip.cpp +++ b/src/basic/REDUCE3_INT-Hip.cpp @@ -165,7 +165,7 @@ void REDUCE3_INT::runHipVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void REDUCE3_INT::runHipVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index c869b766d..a3719a845 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -85,14 +85,14 @@ class REDUCE3_INT : public KernelBase void runCudaVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > diff --git a/src/basic/REDUCE_STRUCT-Cuda.cpp b/src/basic/REDUCE_STRUCT-Cuda.cpp index ff6498c1c..06134c2ce 100644 --- a/src/basic/REDUCE_STRUCT-Cuda.cpp +++ b/src/basic/REDUCE_STRUCT-Cuda.cpp @@ -207,7 +207,7 @@ void REDUCE_STRUCT::runCudaVariantRAJA(VariantID vid) } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > 
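// The same one-line change repeats across these files: the *NewReduce
// variants never read AlgorithmHelper, since the experimental reduce
// interface selects its own combine strategy, so the parameter only
// multiplied template instantiations. Illustrative sketch with
// hypothetical names (not RAJAPerf code):
//
//   template <size_t BS, typename Algorithm, typename Mapping>  // before:
//   void run_new_reduce(int vid);  // one copy per (BS, Algorithm, Mapping)
//
//   template <size_t BS, typename Mapping>                      // after:
//   void run_new_reduce(int vid);  // Algorithm was unused; fewer copies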
+template < size_t block_size, typename MappingHelper > void REDUCE_STRUCT::runCudaVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/REDUCE_STRUCT-Hip.cpp b/src/basic/REDUCE_STRUCT-Hip.cpp index cb0f7ad32..295101182 100644 --- a/src/basic/REDUCE_STRUCT-Hip.cpp +++ b/src/basic/REDUCE_STRUCT-Hip.cpp @@ -207,7 +207,7 @@ void REDUCE_STRUCT::runHipVariantRAJA(VariantID vid) } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void REDUCE_STRUCT::runHipVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); } diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index e6f6e56c9..658d9eae4 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -97,14 +97,14 @@ class REDUCE_STRUCT : public KernelBase void runCudaVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); struct PointsType { diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp index f771b6bae..65463d3ce 100644 --- a/src/basic/TRAP_INT-Cuda.cpp +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -151,7 +151,7 @@ void TRAP_INT::runCudaVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void TRAP_INT::runCudaVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/TRAP_INT-Hip.cpp b/src/basic/TRAP_INT-Hip.cpp index c671f4673..a7aceb01d 100644 --- a/src/basic/TRAP_INT-Hip.cpp +++ b/src/basic/TRAP_INT-Hip.cpp @@ -151,7 +151,7 @@ void TRAP_INT::runHipVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void TRAP_INT::runHipVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 4a40ca84a..8f8ca9337 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -82,14 +82,14 @@ class TRAP_INT : public KernelBase void runCudaVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); template < size_t block_size, typename 
AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp index 042c949b5..dbf34a8f9 100644 --- a/src/stream/DOT-Cuda.cpp +++ b/src/stream/DOT-Cuda.cpp @@ -140,7 +140,7 @@ void DOT::runCudaVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void DOT::runCudaVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); - RAJA_UNUSED_VAR(algorithm_helper); // to quiet compiler warning } diff --git a/src/stream/DOT-Hip.cpp b/src/stream/DOT-Hip.cpp index 5497d0480..c713f50d9 100644 --- a/src/stream/DOT-Hip.cpp +++ b/src/stream/DOT-Hip.cpp @@ -140,7 +140,7 @@ void DOT::runHipVariantRAJA(VariantID vid) } } -template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > +template < size_t block_size, typename MappingHelper > void DOT::runHipVariantRAJANewReduce(VariantID vid) { using exec_policy = std::conditional_t(vid); } diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index ea7b20d5c..2626dbc5e 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -66,14 +66,14 @@ class DOT : public KernelBase void runCudaVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runCudaVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runCudaVariantRAJANewReduce(VariantID vid); template < size_t block_size, typename MappingHelper > void runHipVariantBase(VariantID vid); template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > void runHipVariantRAJA(VariantID vid); - template < size_t block_size, typename AlgorithmHelper, typename MappingHelper > + template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); template < size_t work_group_size > From 4319bf275a7d37bbf3b4c3d20c5a33047f3ac823 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 9 Jul 2024 14:23:04 -0700 Subject: [PATCH 406/454] Added SYCL variant of REDUCE_SUM kernel --- src/algorithm/CMakeLists.txt | 1 + src/algorithm/REDUCE_SUM-Sycl.cpp | 103 ++++++++++++++++++++++++++++++ src/algorithm/REDUCE_SUM.hpp | 7 +- 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 src/algorithm/REDUCE_SUM-Sycl.cpp diff --git a/src/algorithm/CMakeLists.txt b/src/algorithm/CMakeLists.txt index 43e3279e0..cabd5a0df 100644 --- a/src/algorithm/CMakeLists.txt +++ b/src/algorithm/CMakeLists.txt @@ -30,6 +30,7 @@ blt_add_library( REDUCE_SUM-Cuda.cpp REDUCE_SUM-OMP.cpp REDUCE_SUM-OMPTarget.cpp + REDUCE_SUM-Sycl.cpp MEMSET.cpp MEMSET-Seq.cpp MEMSET-Hip.cpp diff --git a/src/algorithm/REDUCE_SUM-Sycl.cpp b/src/algorithm/REDUCE_SUM-Sycl.cpp new file mode 100644 index 000000000..ae4c198ac --- /dev/null +++ b/src/algorithm/REDUCE_SUM-Sycl.cpp @@ -0,0 +1,103 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DOT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + + +namespace rajaperf +{ +namespace algorithm +{ + +template +void REDUCE_SUM::runSyclVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + REDUCE_SUM_DATA_SETUP; + + if ( vid == Base_SYCL ) { + + Real_ptr sum; + allocAndInitSyclDeviceData(sum, &m_sum_init, 1, qu); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t global_size = work_group_size * RAJA_DIVIDE_CEILING_INT(iend, work_group_size); + + initSyclDeviceData(sum, &m_sum_init, 1, qu); + + qu->submit([&] (sycl::handler& h) { + + auto sumReduction = sycl::reduction(sum, sycl::plus()); + + h.parallel_for(sycl::nd_range<1>(global_size, work_group_size), + sumReduction, + [=] (sycl::nd_item<1> item, auto& sum) { + + Index_type i = item.get_global_id(0); + if (i < iend) { + REDUCE_SUM_BODY; + } + + }); + }); + + Real_type lsum; + Real_ptr plsum = &lsum; + getSyclDeviceData(plsum, sum, 1, qu); + m_sum = lsum; + + } + stopTimer(); + + } else if ( vid == RAJA_SYCL ) { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type tsum = m_sum_init; + RAJA::forall< RAJA::sycl_exec >( + res, + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); + + m_sum = static_cast(tsum); + + } + stopTimer(); + + } else { + std::cout << "\n REDUCE_SUM : Unknown Sycl variant id = " << vid << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(REDUCE_SUM, Sycl) + +} // end namespace algorithm +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index 3bbc0e2aa..c9f1a3c74 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -58,12 +58,14 @@ class REDUCE_SUM : public KernelBase void runCudaVariant(VariantID vid, size_t tune_idx); void runHipVariant(VariantID vid, size_t tune_idx); void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + void runSyclVariant(VariantID vid, size_t tune_idx); void setSeqTuningDefinitions(VariantID vid); void setOpenMPTuningDefinitions(VariantID vid); void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - void setOpenMPTargetTuningDefinitions(VariantID vid); + void setOpenMPTargetTuningDefinitions(VariantID vid); + void setSyclTuningDefinitions(VariantID vid); void runCudaVariantCub(VariantID vid); template < size_t block_size, typename MappingHelper > @@ -81,6 +83,9 @@ class REDUCE_SUM : public KernelBase template < size_t block_size, typename MappingHelper > void runHipVariantRAJANewReduce(VariantID vid); + template < size_t work_group_size > + void runSyclVariantImpl(VariantID vid); + private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; From 681b9a6c71620715e74611c7ee52d36ee41147bc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 9 Jul 2024 14:26:15 -0700 Subject: [PATCH 407/454] Update to add more tuning options for HISTOGRAM --- src/algorithm/HISTOGRAM-Cuda.cpp | 227 +++++++++++++++++++++++++---- 
src/algorithm/HISTOGRAM-Hip.cpp | 241 +++++++++++++++++++++++++++---- src/algorithm/HISTOGRAM.cpp | 6 +- src/algorithm/HISTOGRAM.hpp | 16 +- 4 files changed, 424 insertions(+), 66 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 161bee82b..49a44179f 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -24,11 +24,26 @@ namespace rajaperf namespace algorithm { -// for these models the input is block_size and the output is cache lines -using histogram_global_atomic_model = CutoffModel<512, 2, 1>; // v100 - // for these models the input is block_size and the output is values -using histogram_shared_atomic_model = ConstantModel<4>; // v100 +using histogram_shared_atomic_model = ConstantModel<16>; + +// for these models the input is block_size and the output is cache lines +using histogram_global_atomic_model = CutoffModel<512, 2, 1>; + +// v100 +// 10 bins - 1 bin per iterate - single bin +// shared ConstantModel<16> global ConstantModel<1> +// 10 bins - 1 bin per iterate - random sized runs +// shared ConstantModel<16> global CutoffModel<512, 2, 1> +// 10 bins - 1 bin per iterate - random bin +// shared ConstantModel<8> global ConstantModel<1> +// +// 100 bins - 1 bin per iterate - single bin +// shared ConstantModel<> global ConstantModel<> +// 100 bins - 1 bin per iterate - random sized runs +// shared ConstantModel<> global ConstantModel<> +// 100 bins - 1 bin per iterate - random bin +// shared ConstantModel<> global ConstantModel<> template < size_t t_block_size, typename T, typename FunctionSignature > @@ -48,15 +63,17 @@ struct histogram_info return func_attr.maxDynamicSharedSizeBytes; } - FunctionSignature const& const func; + FunctionSignature const& func; const size_t grid_size; - const MultiReduceAtomicCalculator atomic_calc; + const MultiReduceAtomicCalculator atomic_calc; - histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins) + template < typename GlobalModel, typename SharedModel > + histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins, + GlobalModel const& global_atomic_model, SharedModel const& shared_atomic_model) : func(a_func) , grid_size(get_grid_size(problem_size)) , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), - histogram_global_atomic_model{}, histogram_shared_atomic_model{}) + global_atomic_model, shared_atomic_model) { } std::string get_name() const @@ -116,7 +133,7 @@ __launch_bounds__(block_size) __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, Index_ptr bins, Index_type iend, - MultiReduceAtomicCalculator atomic_calc) + MultiReduceAtomicCalculator atomic_calc) { extern __shared__ HISTOGRAM::Data_type shared_counts[]; for (Index_type i = threadIdx.x; @@ -215,7 +232,8 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid) } -template < size_t block_size, size_t global_replication > +template < size_t block_size, size_t global_replication, + bool warp_atomics, bool bunched_atomics > void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -279,18 +297,28 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) } else if ( vid == RAJA_CUDA ) { + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_global_atomic, + void, + RAJA::cuda::AtomicReplicationTuning< + 
RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + std::conditional_t, RAJA::cuda::block_xyz<>>, + std::conditional_t, RAJA::GetOffsetLeft>>>>; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; }); - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); } stopTimer(); @@ -303,10 +331,12 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) } -template < size_t block_size, size_t shared_replication, size_t global_replication > +template < size_t block_size, size_t shared_replication, size_t global_replication, + bool warp_atomics, bool bunched_atomics > void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) { const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; @@ -339,6 +369,38 @@ void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) } stopTimer(); + } else if ( vid == RAJA_CUDA ) { + + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + std::conditional_t, RAJA::cuda::block_xyz<>>, + std::conditional_t, RAJA::GetOffsetLeft>>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + } else { getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; } @@ -433,22 +495,106 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); + runCudaVariantAtomicGlobal(vid); } t += 1; + if ( vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicGlobal(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicGlobal(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicGlobal(vid); + + } + + t += 1; + + } + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicShared(vid); + + } + + t += 1; + + } + + if ( 
vid == RAJA_CUDA ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicShared(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicShared(vid); + + } + + t += 1; if (tune_idx == t) { setBlockSize(block_size); runCudaVariantAtomicShared(vid); + decltype(global_replication)::value, true, true>(vid); + + } + + t += 1; + + } + + if ( vid == Base_CUDA ) { + + if (tune_idx == t) { + + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + ConstantModel{}, ConstantModel{}); + setBlockSize(block_size); + runCudaVariantAtomicRuntime(info, vid); } @@ -467,7 +613,8 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + histogram_global_atomic_model{}, histogram_shared_atomic_model{}); setBlockSize(block_size); runCudaVariantAtomicRuntime(info, vid); @@ -510,14 +657,43 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) run_params.validAtomicReplication(global_replication)) { addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">_"+std::to_string(block_size)); + ">block_unbunched_"+std::to_string(block_size)); + + if ( vid == RAJA_CUDA ) { + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">warp_unbunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">block_bunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">warp_bunched_"+std::to_string(block_size)); + } seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">block_unbunched_"+std::to_string(block_size)); + } + + if ( vid == RAJA_CUDA ) { + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">warp_unbunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">block_bunched_"+std::to_string(block_size)); addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ ">_global<"+std::to_string(global_replication)+ - ">_"+std::to_string(block_size)); + ">warp_bunched_"+std::to_string(block_size)); + } + + if ( vid == Base_CUDA ) { + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + ConstantModel{}, ConstantModel{}); + auto name = info.get_name(); + addVariantTuningName(vid, name.c_str()); } }); @@ -528,7 +704,8 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) if ( vid == Base_CUDA ) { histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + histogram_global_atomic_model{}, histogram_shared_atomic_model{}); auto name = info.get_name(); addVariantTuningName(vid, name.c_str()); } diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 4b7d00a96..1a7d71947 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -29,13 +29,30 @@ 
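// An illustration (not part of the patch) of how the replication models in
// the CUDA hunk above and the HIP hunk below can be read. The real
// ConstantModel/CutoffModel definitions live elsewhere in RAJAPerf and may
// differ; this is a minimal sketch assuming the shape that their names and
// the surrounding comments ("input is block_size, output is values/cache
// lines") suggest:
//
//   template < size_t replication >
//   struct ConstantModelSketch
//   {
//     // the same preferred replication for every block size
//     constexpr size_t operator()(size_t /*block_size*/) const
//     { return replication; }
//   };
//
//   template < size_t cutoff, size_t below, size_t above >
//   struct CutoffModelSketch
//   {
//     // e.g. <512, 2, 1>: prefer replication 2 for block sizes up to 512,
//     // and 1 for larger block sizes
//     constexpr size_t operator()(size_t block_size) const
//     { return (block_size <= cutoff) ? below : above; }
//   };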
namespace rajaperf namespace algorithm { -// for these models the input is block_size and the output is cache lines -using histogram_global_atomic_model = CutoffModel<512, 32, 16>; // gfx942 -// using histogram_global_atomic_model = ConstantModel<1>; // gfx90a - // for these models the input is block_size and the output is values -using histogram_shared_atomic_model = ConstantModel<4>; // gfx942 -// using histogram_shared_atomic_model = ConstantModel<4>; // gfx90a +using histogram_shared_atomic_model = ConstantModel<4>; + +// for these models the input is block_size and the output is cache lines +using histogram_global_atomic_model = CutoffModel<512, 32, 16>; + +// gfx90a +// 10 bins - 1 bin per iterate - random sized runs +// shared ConstantModel<4> global ConstantModel<1> + +// gfx942 +// 10 bins - 1 bin per iterate - single bin +// shared ConstantModel<4> global CutoffModel<512, 32, 16> +// 10 bins - 1 bin per iterate - random sized runs +// shared ConstantModel<4> global CutoffModel<512, 32, 16> +// 10 bins - 1 bin per iterate - random bin +// shared ConstantModel<1> global ConstantModel<32> +// +// 100 bins - 1 bin per iterate - single bin +// shared ConstantModel<2> global ConstantModel<32> +// 100 bins - 1 bin per iterate - random sized runs +// shared ConstantModel<> global ConstantModel<> +// 100 bins - 1 bin per iterate - random bin +// shared ConstantModel<> global ConstantModel<> template < size_t t_block_size, typename T, typename FunctionSignature > @@ -57,20 +74,22 @@ struct histogram_info FunctionSignature const& func; const size_t grid_size; - const MultiReduceAtomicCalculator atomic_calc; + const MultiReduceAtomicCalculator atomic_calc; - histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins) + template < typename GlobalModel, typename SharedModel > + histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins, + GlobalModel const& global_atomic_model, SharedModel const& shared_atomic_model) : func(a_func) , grid_size(get_grid_size(problem_size)) , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), - histogram_global_atomic_model{}, histogram_shared_atomic_model{}) + global_atomic_model, shared_atomic_model) { } std::string get_name() const { return "atomic_shared("+std::to_string(atomic_calc.shared_replication())+ ")_global("+std::to_string(atomic_calc.global_replication())+ - ")_"+std::to_string(block_size); + ")block_"+std::to_string(block_size); } }; @@ -125,7 +144,7 @@ __launch_bounds__(block_size) __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, Index_ptr bins, Index_type iend, - MultiReduceAtomicCalculator atomic_calc) + MultiReduceAtomicCalculator atomic_calc) { extern __shared__ HISTOGRAM::Data_type shared_counts[]; for (Index_type i = threadIdx.x; @@ -248,7 +267,8 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid) } -template < size_t block_size, size_t global_replication > +template < size_t block_size, size_t global_replication, + bool warp_atomics, bool bunched_atomics > void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -312,18 +332,28 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) } else if ( vid == RAJA_HIP ) { + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_global_atomic, + void, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + 
RAJA::hip::ConstantPreferredReplicationConcretizer>, + std::conditional_t, RAJA::hip::block_xyz<>>, + std::conditional_t, RAJA::GetOffsetLeft>>>>; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; }); - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); } stopTimer(); @@ -336,20 +366,23 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) } -template < size_t block_size, size_t shared_replication, size_t global_replication > +template < size_t block_size, size_t shared_replication, size_t global_replication, + bool warp_atomics, bool bunched_atomics > void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) { const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; HISTOGRAM_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); if ( vid == Base_HIP ) { + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -372,11 +405,44 @@ void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) } stopTimer(); + RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); + + } else if ( vid == RAJA_HIP ) { + + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + std::conditional_t, RAJA::hip::block_xyz<>>, + std::conditional_t, RAJA::GetOffsetLeft>>>>; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); + + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + HISTOGRAM_BODY; + }); + + HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); + + } + stopTimer(); + } else { getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; } - RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); } @@ -466,22 +532,106 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); + runHipVariantAtomicGlobal(vid); } t += 1; + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicGlobal(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicGlobal(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + 
runHipVariantAtomicGlobal(vid); + + } + + t += 1; + + } + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { if (tune_idx == t) { setBlockSize(block_size); runHipVariantAtomicShared(vid); + decltype(global_replication)::value, false, false>(vid); + + } + + t += 1; + + } + + if ( vid == RAJA_HIP ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicShared(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicShared(vid); + + } + + t += 1; + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicShared(vid); + + } + + t += 1; + + } + + if ( vid == Base_HIP ) { + + if (tune_idx == t) { + + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + ConstantModel{}, ConstantModel{}); + setBlockSize(block_size); + runHipVariantAtomicRuntime(info, vid); } @@ -500,7 +650,8 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) if (tune_idx == t) { histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins); + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + histogram_global_atomic_model{}, histogram_shared_atomic_model{}); setBlockSize(block_size); runHipVariantAtomicRuntime(info, vid); @@ -542,15 +693,44 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, "replicate_"+std::to_string(global_replication)+ - "_global_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">block_unbunched_"+std::to_string(block_size)); + + if ( vid == RAJA_HIP ) { + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">warp_unbunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">block_bunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ + ">warp_bunched_"+std::to_string(block_size)); + } seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ ">_global<"+std::to_string(global_replication)+ - ">_"+std::to_string(block_size)); + ">block_unbunched_"+std::to_string(block_size)); + } + + if ( vid == RAJA_HIP ) { + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">warp_unbunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">block_bunched_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ + ">_global<"+std::to_string(global_replication)+ + ">warp_bunched_"+std::to_string(block_size)); + } + + if ( vid == Base_HIP ) { + histogram_info)> info( + histogram_atomic_runtime, getActualProblemSize(), m_num_bins, + ConstantModel{}, ConstantModel{}); + auto name = info.get_name(); + addVariantTuningName(vid, name.c_str()); } }); @@ -561,7 +741,8 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if ( vid == Base_HIP ) { histogram_info)> info( - 
histogram_atomic_runtime, getActualProblemSize(), m_num_bins);
+      histogram_atomic_runtime, getActualProblemSize(), m_num_bins,
+      histogram_global_atomic_model{}, histogram_shared_atomic_model{});
     auto name = info.get_name();
     addVariantTuningName(vid, name.c_str());
   }
diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp
index 01932f43e..492a0b27c 100644
--- a/src/algorithm/HISTOGRAM.cpp
+++ b/src/algorithm/HISTOGRAM.cpp
@@ -28,7 +28,7 @@ HISTOGRAM::HISTOGRAM(const RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  m_num_bins = 10;
+  m_num_bins = 100;
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
@@ -71,8 +71,8 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
   auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid);
 
   bool init_even_sizes = false;
-  bool init_random_sizes = true;
-  bool init_all_one = false;
+  bool init_random_sizes = false;
+  bool init_all_one = true;
   bool init_random_per_iterate = false;
   if (init_even_sizes || init_random_sizes || init_all_one) {
     Real_ptr data = nullptr;
diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp
index 812f40e53..c09b966a6 100644
--- a/src/algorithm/HISTOGRAM.hpp
+++ b/src/algorithm/HISTOGRAM.hpp
@@ -107,13 +107,13 @@ class HISTOGRAM : public KernelBase
   void setHipTuningDefinitions(VariantID vid);
   void runCudaVariantLibrary(VariantID vid);
   void runHipVariantLibrary(VariantID vid);
-  template < size_t block_size, size_t global_replication >
+  template < size_t block_size, size_t global_replication, bool warp_atomics, bool bunched_atomics >
   void runCudaVariantAtomicGlobal(VariantID vid);
-  template < size_t block_size, size_t global_replication >
+  template < size_t block_size, size_t global_replication, bool warp_atomics, bool bunched_atomics >
   void runHipVariantAtomicGlobal(VariantID vid);
-  template < size_t block_size, size_t shared_replication, size_t global_replication >
+  template < size_t block_size, size_t shared_replication, size_t global_replication, bool warp_atomics, bool bunched_atomics >
   void runCudaVariantAtomicShared(VariantID vid);
-  template < size_t block_size, size_t shared_replication, size_t global_replication >
+  template < size_t block_size, size_t shared_replication, size_t global_replication, bool warp_atomics, bool bunched_atomics >
   void runHipVariantAtomicShared(VariantID vid);
   template < typename MultiReduceInfo >
   void runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid);
@@ -125,8 +125,8 @@ class HISTOGRAM : public KernelBase
   using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size>;
   static const size_t default_gpu_atomic_global_replication = 2048; // 512, 512
   // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type<default_gpu_atomic_global_replication>;
-  using gpu_atomic_global_replications_type = integer::list_type<32, 64, 128, 256, 512, 1024, 2048, 4096>;
-  using gpu_atomic_shared_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64>;
+  using gpu_atomic_global_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2*1024, 4*1024, 8*1024, 16*1024>;
+  using gpu_atomic_shared_replications_type = integer::list_type<1, 2, 4, 8, 16, 32>;
 
   Index_type m_num_bins;
   Index_ptr m_bins;
@@ -135,7 +135,7 @@
 };
 
-#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HP)
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
 
 // Compute lhs % rhs between non-negative lhs and positive power of 2 rhs
 template < typename L, typename R >
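// An aside on the helper declared just above (its body is cut off by the
// hunk context): for non-negative lhs and a positive power-of-2 rhs,
// lhs % rhs == lhs & (rhs - 1), which trades the integer divide for a cheap
// bitwise AND on the GPU. A sketch of the usual implementation (illustrative;
// the actual RAJAPerf body may differ, and later commits in this series call
// a RAJA::power_of_2_mod with the same contract):
//
//   template < typename L, typename R >
//   constexpr L power_of_2_mod(L lhs, R rhs)
//   {
//     return lhs & static_cast<L>(rhs - 1);
//   }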
@@ -254,7 +254,7 @@ struct MultiReduceAtomicCalculator template < typename IterFinal, typename IterGlobal, typename Op > void combine_globals(IterFinal counts_final, IterGlobal counts_global, Op combiner) { - for (IndexType bin = 0; bin < num_bins; ++bin) { + for (IndexType bin = 0; bin < num_bins(); ++bin) { counts_final[bin] = combine_global(bin, counts_global, combiner); } } From fbe296dbab4f44e8c66185338ddeec76394558bb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 9 Jul 2024 14:26:40 -0700 Subject: [PATCH 408/454] Use RAJA MultiReduce in MULTI_REDUCE cuda/hip --- src/basic/MULTI_REDUCE-Cuda.cpp | 7 +++---- src/basic/MULTI_REDUCE-Hip.cpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 705a43cd9..d22cdc733 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -104,15 +104,14 @@ void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::cuda_multi_reduce_atomic); RAJA::forall< RAJA::cuda_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); + MULTI_REDUCE_BODY; }); - RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::cuda_multi_reduce_atomic); } stopTimer(); diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 2f5897a95..08eb8132d 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -104,15 +104,14 @@ void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::hip_multi_reduce_atomic); RAJA::forall< RAJA::hip_exec >( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); + MULTI_REDUCE_BODY; }); - RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::hip_multi_reduce_atomic); } stopTimer(); From 1dc771e19a554f90a8d7e35437f8232cf9fc4f9b Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 9 Jul 2024 14:27:41 -0700 Subject: [PATCH 409/454] Fix header file name --- src/algorithm/REDUCE_SUM-Sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithm/REDUCE_SUM-Sycl.cpp b/src/algorithm/REDUCE_SUM-Sycl.cpp index ae4c198ac..516048863 100644 --- a/src/algorithm/REDUCE_SUM-Sycl.cpp +++ b/src/algorithm/REDUCE_SUM-Sycl.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "DOT.hpp" +#include "REDUCE_SUM.hpp" #include "RAJA/RAJA.hpp" From 9bb40c061afc71149222a976551637aeffc31387 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Tue, 9 Jul 2024 14:35:35 -0700 Subject: [PATCH 410/454] Turn on SYCL variants in REDUCE_SUM kernel --- src/algorithm/REDUCE_SUM.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 233ef36d8..1fb1b78f7 
100644
--- a/src/algorithm/REDUCE_SUM.cpp
+++ b/src/algorithm/REDUCE_SUM.cpp
@@ -51,6 +51,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params)
 
   setVariantDefined( Base_HIP );
   setVariantDefined( RAJA_HIP );
+
+  setVariantDefined( Base_SYCL );
+  setVariantDefined( RAJA_SYCL );
 }
 
 REDUCE_SUM::~REDUCE_SUM()

From 3b119b9d0ba7e5a50ff2667e773c572f89bbe8b6 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 9 Jul 2024 15:58:07 -0700
Subject: [PATCH 411/454] Restore default reduction interface version that was accidentally removed

---
 src/basic/REDUCE3_INT-OMPTarget.cpp | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp
index 5b4a01060..50bd2b914 100644
--- a/src/basic/REDUCE3_INT-OMPTarget.cpp
+++ b/src/basic/REDUCE3_INT-OMPTarget.cpp
@@ -67,23 +67,18 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
     startTimer();
     for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      Int_type tvsum = m_vsum_init;
-      Int_type tvmin = m_vmin_init;
-      Int_type tvmax = m_vmax_init;
+      RAJA::ReduceSum<RAJA::omp_target_reduce, Int_type> vsum(m_vsum_init);
+      RAJA::ReduceMin<RAJA::omp_target_reduce, Int_type> vmin(m_vmin_init);
+      RAJA::ReduceMax<RAJA::omp_target_reduce, Int_type> vmax(m_vmax_init);
 
       RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend),
-        RAJA::expt::Reduce<RAJA::operators::plus>(&tvsum),
-        RAJA::expt::Reduce<RAJA::operators::minimum>(&tvmin),
-        RAJA::expt::Reduce<RAJA::operators::maximum>(&tvmax),
-        [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) {
-          REDUCE3_INT_BODY;
-        }
-      );
-
-      m_vsum += static_cast<Int_type>(tvsum);
-      m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(tvmin));
-      m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(tvmax));
+        RAJA::RangeSegment(ibegin, iend), [=](Index_type i) {
+        REDUCE3_INT_BODY_RAJA;
+      });
+
+      m_vsum += static_cast<Int_type>(vsum.get());
+      m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(vmin.get()));
+      m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(vmax.get()));
 
     }
     stopTimer();

From 1a0a6fe6105ec7089e8936ee2950a890d1d49cc8 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Wed, 10 Jul 2024 09:58:46 -0700
Subject: [PATCH 412/454] Change tuning variant control logic and add error condition

---
 src/algorithm/REDUCE_SUM-OMP.cpp       | 6 +++---
 src/algorithm/REDUCE_SUM-OMPTarget.cpp | 6 +++---
 src/algorithm/REDUCE_SUM-Seq.cpp       | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/algorithm/REDUCE_SUM-OMP.cpp b/src/algorithm/REDUCE_SUM-OMP.cpp
index ef352453b..1295887f5 100644
--- a/src/algorithm/REDUCE_SUM-OMP.cpp
+++ b/src/algorithm/REDUCE_SUM-OMP.cpp
@@ -94,9 +94,7 @@ void REDUCE_SUM::runOpenMPVariant(VariantID vid, size_t tune_idx)
       }
       stopTimer();
 
-    }
-
-    if (tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -116,6 +114,8 @@
       }
       stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE_SUM : Unknown OpenMP tuning index = " << tune_idx << std::endl;
     }
 
diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
index d01a2976e..1bab3b14a 100644
--- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp
+++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp
@@ -74,9 +74,7 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
       }
       stopTimer();
 
-    }
-
-    if (tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -96,6 +94,8 @@
      }
      stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE_SUM : Unknown OpenMP Target tuning index = " << tune_idx << std::endl;
     }
 
   } else {
 
diff
--git a/src/algorithm/REDUCE_SUM-Seq.cpp b/src/algorithm/REDUCE_SUM-Seq.cpp index 27c2d7f78..8d4fdacb2 100644 --- a/src/algorithm/REDUCE_SUM-Seq.cpp +++ b/src/algorithm/REDUCE_SUM-Seq.cpp @@ -93,9 +93,7 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,10 +112,12 @@ void REDUCE_SUM::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n REDUCE_SUM : Unknown Seq tuning index = " << tune_idx << std::endl; } break; - } + } #endif default : { From 60fb45fe4e6971d35d602837d738942d068ee715 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 10 Jul 2024 10:52:04 -0700 Subject: [PATCH 413/454] Change tuning index control logic --- src/basic/PI_REDUCE-OMP.cpp | 6 ++-- src/basic/PI_REDUCE-OMPTarget.cpp | 6 ++-- src/basic/PI_REDUCE-Seq.cpp | 6 ++-- src/basic/REDUCE3_INT-OMP.cpp | 8 +++-- src/basic/REDUCE3_INT-OMPTarget.cpp | 45 ++++++++++++++------------- src/basic/REDUCE3_INT-Seq.cpp | 6 ++-- src/basic/REDUCE_STRUCT-OMP.cpp | 6 ++-- src/basic/REDUCE_STRUCT-OMPTarget.cpp | 6 ++-- src/basic/REDUCE_STRUCT-Seq.cpp | 6 ++-- src/basic/TRAP_INT-OMP.cpp | 6 ++-- src/basic/TRAP_INT-OMPTarget.cpp | 8 ++--- src/basic/TRAP_INT-Seq.cpp | 6 ++-- 12 files changed, 59 insertions(+), 56 deletions(-) diff --git a/src/basic/PI_REDUCE-OMP.cpp b/src/basic/PI_REDUCE-OMP.cpp index f4b73db50..5c83aba6f 100644 --- a/src/basic/PI_REDUCE-OMP.cpp +++ b/src/basic/PI_REDUCE-OMP.cpp @@ -95,9 +95,7 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -117,6 +115,8 @@ void PI_REDUCE::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n PI_REDUCE : Unknown OpenMP tuning index = " << tune_idx << std::endl; } break; diff --git a/src/basic/PI_REDUCE-OMPTarget.cpp b/src/basic/PI_REDUCE-OMPTarget.cpp index cf9d2ce85..351580471 100644 --- a/src/basic/PI_REDUCE-OMPTarget.cpp +++ b/src/basic/PI_REDUCE-OMPTarget.cpp @@ -74,9 +74,7 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -96,6 +94,8 @@ void PI_REDUCE::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n PI_REDUCE : Unknown OMP Target tuning index = " << tune_idx << std::endl; } } else { diff --git a/src/basic/PI_REDUCE-Seq.cpp b/src/basic/PI_REDUCE-Seq.cpp index ee3dbae5e..4a5b28815 100644 --- a/src/basic/PI_REDUCE-Seq.cpp +++ b/src/basic/PI_REDUCE-Seq.cpp @@ -94,9 +94,7 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -115,6 +113,8 @@ void PI_REDUCE::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n PI_REDUCE : Unknown Seq tuning index = " << tune_idx << std::endl; } break; diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index a948a242f..8fe6e52e4 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -112,9 +112,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if 
(tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -141,6 +139,10 @@
       }
       stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE3_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl;
+    }
+
     break;
   }
 
diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp
index 50bd2b914..d92d37667 100644
--- a/src/basic/REDUCE3_INT-OMPTarget.cpp
+++ b/src/basic/REDUCE3_INT-OMPTarget.cpp
@@ -83,35 +83,36 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx)
       }
       stopTimer();
 
-    }
-
-    if (tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
-      startTimer();
-      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
-
-        Int_type tvsum = m_vsum_init;
-        Int_type tvmin = m_vmin_init;
-        Int_type tvmax = m_vmax_init;
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
 
-      RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
-        RAJA::RangeSegment(ibegin, iend),
-        RAJA::expt::Reduce<RAJA::operators::plus>(&tvsum),
-        RAJA::expt::Reduce<RAJA::operators::minimum>(&tvmin),
-        RAJA::expt::Reduce<RAJA::operators::maximum>(&tvmax),
-        [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) {
-          REDUCE3_INT_BODY;
-        }
-      );
+        Int_type tvsum = m_vsum_init;
+        Int_type tvmin = m_vmin_init;
+        Int_type tvmax = m_vmax_init;
 
-      m_vsum += static_cast<Int_type>(tvsum);
-      m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(tvmin));
-      m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(tvmax));
+        RAJA::forall<RAJA::omp_target_parallel_for_exec<threads_per_team>>(
+          RAJA::RangeSegment(ibegin, iend),
+          RAJA::expt::Reduce<RAJA::operators::plus>(&tvsum),
+          RAJA::expt::Reduce<RAJA::operators::minimum>(&tvmin),
+          RAJA::expt::Reduce<RAJA::operators::maximum>(&tvmax),
+          [=](Index_type i, Int_type& vsum, Int_type& vmin, Int_type& vmax) {
+            REDUCE3_INT_BODY;
+          }
+        );
+
+        m_vsum += static_cast<Int_type>(tvsum);
+        m_vmin = RAJA_MIN(m_vmin, static_cast<Int_type>(tvmin));
+        m_vmax = RAJA_MAX(m_vmax, static_cast<Int_type>(tvmax));
 
-      }
     }
     stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE3_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl;
+    }
+
   } else {
     getCout() << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl;
   }
 
diff --git a/src/basic/REDUCE3_INT-Seq.cpp b/src/basic/REDUCE3_INT-Seq.cpp
index 9ea03f0d9..32bcfbef6 100644
--- a/src/basic/REDUCE3_INT-Seq.cpp
+++ b/src/basic/REDUCE3_INT-Seq.cpp
@@ -108,9 +108,7 @@ void REDUCE3_INT::runSeqVariant(VariantID vid, size_t tune_idx)
       }
       stopTimer();
 
-    }
-
-    if (tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -136,6 +134,8 @@
       }
       stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE3_INT : Unknown Seq tuning index = " << tune_idx << std::endl;
     }
 
     break;
 
diff --git a/src/basic/REDUCE_STRUCT-OMP.cpp b/src/basic/REDUCE_STRUCT-OMP.cpp
index 7e593db72..8c44d02c0 100644
--- a/src/basic/REDUCE_STRUCT-OMP.cpp
+++ b/src/basic/REDUCE_STRUCT-OMP.cpp
@@ -138,9 +138,7 @@ void REDUCE_STRUCT::runOpenMPVariant(VariantID vid, size_t tune_idx)
       }
       stopTimer();
 
-    }
-
-    if (tune_idx == 1) {
+    } else if (tune_idx == 1) {
 
       startTimer();
       for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
@@ -178,6 +176,8 @@
       }
       stopTimer();
 
+    } else {
+      getCout() << "\n REDUCE_STRUCT : Unknown OpenMP tuning index = " << tune_idx << std::endl;
     }
 
     break;
 
diff --git a/src/basic/REDUCE_STRUCT-OMPTarget.cpp b/src/basic/REDUCE_STRUCT-OMPTarget.cpp
index d6ad3fc78..594c62ccb 100644
--- a/src/basic/REDUCE_STRUCT-OMPTarget.cpp
+++ b/src/basic/REDUCE_STRUCT-OMPTarget.cpp
@@ -112,9 +112,7 @@ void
REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -152,6 +150,8 @@ void REDUCE_STRUCT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } break; diff --git a/src/basic/REDUCE_STRUCT-Seq.cpp b/src/basic/REDUCE_STRUCT-Seq.cpp index 7edf3ab48..1e2a68d43 100644 --- a/src/basic/REDUCE_STRUCT-Seq.cpp +++ b/src/basic/REDUCE_STRUCT-Seq.cpp @@ -128,9 +128,7 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -168,6 +166,8 @@ void REDUCE_STRUCT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n REDUCE_STRUCT : Unknown Seq tuning index = " << tune_idx << std::endl; } break; diff --git a/src/basic/TRAP_INT-OMP.cpp b/src/basic/TRAP_INT-OMP.cpp index 1d867335f..f1961483a 100644 --- a/src/basic/TRAP_INT-OMP.cpp +++ b/src/basic/TRAP_INT-OMP.cpp @@ -96,9 +96,7 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,6 +116,8 @@ void TRAP_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } break; diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 0bd9155b2..2c5e4cf56 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -80,9 +80,7 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -102,10 +100,12 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } } else { - getCout() << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + getCout() << "\n TRAP_INT : Unknown OMP Target variant id = " << vid << std::endl; } } diff --git a/src/basic/TRAP_INT-Seq.cpp b/src/basic/TRAP_INT-Seq.cpp index 6c75ffb04..fa74efdcf 100644 --- a/src/basic/TRAP_INT-Seq.cpp +++ b/src/basic/TRAP_INT-Seq.cpp @@ -96,9 +96,7 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,6 +116,8 @@ void TRAP_INT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n TRAP_INT : Unknown Seq tuning index = " << tune_idx << std::endl; } break; From c4377b9e36c410d828de87e08ce9fbc006004e47 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 10 Jul 2024 10:56:01 -0700 Subject: [PATCH 414/454] Update to a one atomic runtime multi-reduce implementation --- src/algorithm/HISTOGRAM-Cuda.cpp | 528 +++++------------------------- src/algorithm/HISTOGRAM-Hip.cpp | 534 +++++-------------------------- src/algorithm/HISTOGRAM.cpp | 2 - src/algorithm/HISTOGRAM.hpp | 156 +-------- src/basic/MULTI_REDUCE-Cuda.cpp | 
211 +++++++----- src/basic/MULTI_REDUCE-Hip.cpp | 211 +++++++----- src/basic/MULTI_REDUCE.cpp | 2 - src/basic/MULTI_REDUCE.hpp | 19 +- src/common/GPUUtils.hpp | 66 ---- 9 files changed, 452 insertions(+), 1277 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 49a44179f..eb656f693 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -24,140 +24,58 @@ namespace rajaperf namespace algorithm { -// for these models the input is block_size and the output is values -using histogram_shared_atomic_model = ConstantModel<16>; - -// for these models the input is block_size and the output is cache lines -using histogram_global_atomic_model = CutoffModel<512, 2, 1>; - -// v100 -// 10 bins - 1 bin per iterate - single bin -// shared ConstantModel<16> global ConstantModel<1> -// 10 bins - 1 bin per iterate - random sized runs -// shared ConstantModel<16> global CutoffModel<512, 2, 1> -// 10 bins - 1 bin per iterate - random bin -// shared ConstantModel<8> global ConstantModel<1> -// -// 100 bins - 1 bin per iterate - single bin -// shared ConstantModel<> global ConstantModel<> -// 100 bins - 1 bin per iterate - random sized runs -// shared ConstantModel<> global ConstantModel<> -// 100 bins - 1 bin per iterate - random bin -// shared ConstantModel<> global ConstantModel<> - - -template < size_t t_block_size, typename T, typename FunctionSignature > -struct histogram_info -{ - static constexpr size_t block_size = t_block_size; +constexpr Index_type warp_size = 32; - static size_t get_grid_size(size_t problem_size) - { - return RAJA_DIVIDE_CEILING_INT(problem_size, block_size); - } +constexpr Index_type default_shared_replication = 16; +constexpr Index_type default_global_replication = 2; - static size_t get_max_shmem(FunctionSignature const& func) - { - cudaFuncAttributes func_attr; - cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); - return func_attr.maxDynamicSharedSizeBytes; - } - FunctionSignature const& func; - const size_t grid_size; - const MultiReduceAtomicCalculator atomic_calc; - - template < typename GlobalModel, typename SharedModel > - histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins, - GlobalModel const& global_atomic_model, SharedModel const& shared_atomic_model) - : func(a_func) - , grid_size(get_grid_size(problem_size)) - , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), - global_atomic_model, shared_atomic_model) - { } - - std::string get_name() const - { - return "atomic_shared("+std::to_string(atomic_calc.shared_replication())+ - ")_global("+std::to_string(atomic_calc.global_replication())+ - ")_"+std::to_string(block_size); - } -}; - - -template < Index_type block_size, Index_type global_replication > +template < Index_type block_size > __launch_bounds__(block_size) -__global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, - Index_ptr bins, - Index_type iend) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); - } -} + if (shared_replication > 0) { -template < Index_type block_size, Index_type shared_replication, Index_type global_replication > 
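// A worked example (not part of the patch) of the offset arithmetic used by
// the new histogram_atomic_runtime kernel above, assuming power-of-2
// replication counts as the RAJA::power_of_2_mod calls require:
//
//   shared slots are laid out bin-major:  bin * shared_replication + lane
//   global slots are laid out bin-minor:  bin + replica * num_bins
//
//   e.g. num_bins = 10, shared_replication = 4, threadIdx.x = 7, bin = 2:
//     power_of_2_mod(7, 4) == (7 & 3) == 3, so shared slot == 2 * 4 + 3 == 11
//   and with global_replication = 2, blockIdx.x = 5:
//     power_of_2_mod(5, 2) == (5 & 1) == 1, so global slot == 2 + 1 * 10 == 12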
-__launch_bounds__(block_size) -__global__ void histogram_atomic_shared_global(HISTOGRAM::Data_ptr global_counts, - Index_ptr bins, - Index_type num_bins, - Index_type iend) -{ - extern __shared__ HISTOGRAM::Data_type shared_counts[]; - for (Index_type i = threadIdx.x; i < num_bins * shared_replication; i += block_size) { - shared_counts[i] = HISTOGRAM::Data_type(0); - } - __syncthreads(); + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); + } + __syncthreads(); - { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, shared_counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], threadIdx.x, shared_replication), HISTOGRAM::Data_type(1)); + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } } - } - __syncthreads(); - for (Index_type b = threadIdx.x; b < num_bins; b += block_size) { - Index_type i = blockIdx.x * num_bins + b; - auto block_sum = HISTOGRAM::Data_type(0); - for (Index_type s = 0; s < shared_replication; ++s) { - block_sum += shared_counts[HISTOGRAM_GPU_BIN_INDEX(b, s, shared_replication)]; + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } } - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, global_counts, HISTOGRAM_GPU_BIN_INDEX(b, i, global_replication), block_sum); - } -} -template < Index_type block_size > -__launch_bounds__(block_size) -__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, - Index_ptr bins, - Index_type iend, - MultiReduceAtomicCalculator atomic_calc) -{ - extern __shared__ HISTOGRAM::Data_type shared_counts[]; - for (Index_type i = threadIdx.x; - i < Index_type(atomic_calc.num_bins() * atomic_calc.shared_replication()); - i += block_size) { - shared_counts[i] = HISTOGRAM::Data_type(0); - } - __syncthreads(); + } else { - { Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, shared_counts, atomic_calc.get_shared_offset(bins[i], threadIdx.x), HISTOGRAM::Data_type(1)); - } - } - __syncthreads(); - - for (Index_type b = threadIdx.x; b < atomic_calc.num_bins(); b += block_size) { - auto block_sum = HISTOGRAM::Data_type(0); - for (Index_type s = 0; s < atomic_calc.shared_replication(); ++s) { - block_sum += shared_counts[atomic_calc.get_shared_offset(b, s)]; - } - if (block_sum != HISTOGRAM::Data_type(0)) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, global_counts, atomic_calc.get_global_offset(b, blockIdx.x), block_sum); + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); } } } @@ -232,9 +150,11 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid) } -template < size_t block_size, size_t 
global_replication, - bool warp_atomics, bool bunched_atomics > -void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > +void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -244,131 +164,54 @@ void HISTOGRAM::runCudaVariantAtomicGlobal(VariantID vid) HISTOGRAM_DATA_SETUP; - RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); - if ( vid == Base_CUDA ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - RPlaunchCudaKernel( (histogram_atomic_global), - grid_size, block_size, - shmem, res.get_stream(), - counts, - bins, - iend ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); - - } - stopTimer(); + auto* func = &histogram_atomic_runtime; - } else if ( vid == Lambda_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - - auto histogram_lambda = [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::cuda_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); - }; - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - RPlaunchCudaKernel( (lambda_cuda_forall), - grid_size, block_size, - shmem, res.get_stream(), - ibegin, iend, histogram_lambda ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); - - } - stopTimer(); - - } else if ( vid == RAJA_CUDA ) { - - using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< - RAJA::cuda::MultiReduceTuning< - RAJA::cuda::multi_reduce_algorithm::init_host_combine_global_atomic, - void, - RAJA::cuda::AtomicReplicationTuning< - RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< - RAJA::cuda::ConstantPreferredReplicationConcretizer>, - std::conditional_t, RAJA::cuda::block_xyz<>>, - std::conditional_t, RAJA::GetOffsetLeft>>>>; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - - RAJA::forall>( res, - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - HISTOGRAM_BODY; - }); - - HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); - - } - stopTimer(); - - } else { - getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; - } - - RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); - -} - -template < size_t block_size, size_t shared_replication, size_t global_replication, - bool warp_atomics, bool bunched_atomics > -void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = 
max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - auto res{getCudaResource()}; + const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - HISTOGRAM_DATA_SETUP; + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); - if ( vid == Base_CUDA ) { + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - const size_t shmem = num_bins*shared_replication*sizeof(Data_type); - - RPlaunchCudaKernel( (histogram_atomic_shared_global), + RPlaunchCudaKernel( func, grid_size, block_size, shmem, res.get_stream(), counts, bins, + iend, num_bins, - iend ); + shared_replication, + global_replication ); RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } } stopTimer(); + RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); + } else if ( vid == RAJA_CUDA ) { using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< @@ -376,14 +219,14 @@ void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, RAJA::cuda::AtomicReplicationTuning< RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< - RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::ConstantPreferredReplicationConcretizer>, RAJA::cuda::thread_xyz<>, RAJA::GetOffsetRight>, RAJA::cuda::AtomicReplicationTuning< RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< - RAJA::cuda::ConstantPreferredReplicationConcretizer>, - std::conditional_t, RAJA::cuda::block_xyz<>>, - std::conditional_t, RAJA::GetOffsetLeft>>>>; + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -405,62 +248,6 @@ void HISTOGRAM::runCudaVariantAtomicShared(VariantID vid) getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; } - RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); - -} - - -template < typename MultiReduceInfo > -void HISTOGRAM::runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type iend = getActualProblemSize(); - - auto res{getCudaResource()}; - - HISTOGRAM_DATA_SETUP; - - static constexpr size_t block_size = info.block_size; - const size_t grid_size = info.grid_size; - const auto atomic_calc = info.atomic_calc; - const size_t global_replication = atomic_calc.global_replication(); - const size_t shmem = atomic_calc.shared_memory_in_bytes(); - - RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, 
global_replication); - - if ( vid == Base_CUDA ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - - RPlaunchCudaKernel( info.func, - grid_size, block_size, - shmem, res.get_stream(), - counts, - bins, - iend, - atomic_calc ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - for (Index_type b = 0; b < num_bins; ++b) { - Data_type count_final = 0; - for (size_t r = 0; r < global_replication; ++r) { - count_final += hcounts[atomic_calc.get_global_offset(b, r)]; - } - counts_final[b] = count_final; - } - - } - stopTimer(); - - } else { - getCout() << "\n HISTOGRAM : Unknown Cuda variant id = " << vid << std::endl; - } - - RAJAPERF_CUDA_REDUCER_TEARDOWN(counts, hcounts); - } @@ -480,7 +267,7 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) } - if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -492,138 +279,25 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); - - } - - t += 1; - - if ( vid == RAJA_CUDA ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); - - } - - t += 1; + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { if (tune_idx == t) { setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); + runCudaVariantAtomicRuntime(vid); } t += 1; - } - - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - - if ( vid == Base_CUDA || vid == RAJA_CUDA ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicShared(vid); - - } - - t += 1; - - } - - if ( vid == RAJA_CUDA ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicShared(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicShared(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runCudaVariantAtomicShared(vid); - - } - - t += 1; - - } - - if ( vid == Base_CUDA ) { - - if (tune_idx == t) { - - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - ConstantModel{}, ConstantModel{}); - setBlockSize(block_size); - runCudaVariantAtomicRuntime(info, vid); - - } - - t += 1; - - } - }); } }); - if ( vid == Base_CUDA ) { - - if (tune_idx == t) { - - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - histogram_global_atomic_model{}, histogram_shared_atomic_model{}); - setBlockSize(block_size); - runCudaVariantAtomicRuntime(info, vid); - - } - - t += 1; - - } - } }); @@ -644,7 +318,7 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) } - if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { @@ -656,45 +330,11 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, 
"atomic_global<"+std::to_string(global_replication)+ - ">block_unbunched_"+std::to_string(block_size)); - - if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">warp_unbunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">block_bunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">warp_bunched_"+std::to_string(block_size)); - } - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_CUDA || vid == RAJA_CUDA ) { - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">block_unbunched_"+std::to_string(block_size)); - } - - if ( vid == RAJA_CUDA ) { - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">warp_unbunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">block_bunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">warp_bunched_"+std::to_string(block_size)); - } - - if ( vid == Base_CUDA ) { - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - ConstantModel{}, ConstantModel{}); - auto name = info.get_name(); - addVariantTuningName(vid, name.c_str()); - } + addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ + ")_global("+std::to_string(global_replication)+ + ")_"+std::to_string(block_size)); }); @@ -702,14 +342,6 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) }); - if ( vid == Base_CUDA ) { - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - histogram_global_atomic_model{}, histogram_shared_atomic_model{}); - auto name = info.get_name(); - addVariantTuningName(vid, name.c_str()); - } - } }); diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 1a7d71947..0f595164a 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -29,146 +29,58 @@ namespace rajaperf namespace algorithm { -// for these models the input is block_size and the output is values -using histogram_shared_atomic_model = ConstantModel<4>; - -// for these models the input is block_size and the output is cache lines -using histogram_global_atomic_model = CutoffModel<512, 32, 16>; - -// gfx90a -// 10 bins - 1 bin per iterate - random sized runs -// shared ConstantModel<4> global ConstantModel<1> - -// gfx942 -// 10 bins - 1 bin per iterate - single bin -// shared ConstantModel<4> global CutoffModel<512, 32, 16> -// 10 bins - 1 bin per iterate - random sized runs -// shared ConstantModel<4> global CutoffModel<512, 32, 16> -// 10 bins - 1 bin per iterate - random bin -// shared ConstantModel<1> global ConstantModel<32> -// -// 100 bins - 1 bin per iterate - single bin -// shared ConstantModel<2> global ConstantModel<32> -// 100 bins - 1 bin per iterate - random sized runs -// shared ConstantModel<> global ConstantModel<> -// 100 bins - 1 bin per iterate - random bin -// shared ConstantModel<> global ConstantModel<> - - -template < size_t t_block_size, typename T, typename FunctionSignature > -struct histogram_info -{ - static 
constexpr size_t block_size = t_block_size; - - static size_t get_grid_size(size_t problem_size) - { - return RAJA_DIVIDE_CEILING_INT(problem_size, block_size); - } - - static size_t get_max_shmem(FunctionSignature const& func) - { - hipFuncAttributes func_attr; - hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); - return func_attr.maxDynamicSharedSizeBytes; - } +constexpr Index_type warp_size = 64; - FunctionSignature const& func; - const size_t grid_size; - const MultiReduceAtomicCalculator atomic_calc; - - template < typename GlobalModel, typename SharedModel > - histogram_info(FunctionSignature const& a_func, size_t problem_size, size_t num_bins, - GlobalModel const& global_atomic_model, SharedModel const& shared_atomic_model) - : func(a_func) - , grid_size(get_grid_size(problem_size)) - , atomic_calc(num_bins, block_size, grid_size, get_max_shmem(a_func), - global_atomic_model, shared_atomic_model) - { } - - std::string get_name() const - { - return "atomic_shared("+std::to_string(atomic_calc.shared_replication())+ - ")_global("+std::to_string(atomic_calc.global_replication())+ - ")block_"+std::to_string(block_size); - } -}; +constexpr Index_type default_shared_replication = 4; +constexpr Index_type default_global_replication = 16; -template < Index_type block_size, Index_type global_replication > -__launch_bounds__(block_size) -__global__ void histogram_atomic_global(HISTOGRAM::Data_ptr counts, - Index_ptr bins, - Index_type iend) -{ - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); - } -} - -template < Index_type block_size, Index_type shared_replication, Index_type global_replication > +template < Index_type block_size > __launch_bounds__(block_size) -__global__ void histogram_atomic_shared_global(HISTOGRAM::Data_ptr global_counts, - Index_ptr bins, - Index_type num_bins, - Index_type iend) +__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) { - extern __shared__ HISTOGRAM::Data_type shared_counts[]; - for (Index_type sb = threadIdx.x; sb < num_bins * shared_replication; sb += block_size) { - shared_counts[sb] = HISTOGRAM::Data_type(0); - } - __syncthreads(); + if (shared_replication > 0) { - { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, shared_counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], threadIdx.x, shared_replication), HISTOGRAM::Data_type(1)); + extern __shared__ HISTOGRAM::Data_type shared_counts[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_counts[t] = HISTOGRAM::Data_type(0); } - } - __syncthreads(); + __syncthreads(); - for (Index_type b = threadIdx.x; b < num_bins; b += block_size) { - Index_type i = blockIdx.x * num_bins + b; - auto block_sum = HISTOGRAM::Data_type(0); - for (Index_type s = 0; s < shared_replication; ++s) { - block_sum += shared_counts[HISTOGRAM_GPU_BIN_INDEX(b, s, shared_replication)]; + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); + } } - if (block_sum != HISTOGRAM::Data_type(0)) { - 
HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, global_counts, HISTOGRAM_GPU_BIN_INDEX(b, i, global_replication), block_sum); + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = HISTOGRAM::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != HISTOGRAM::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], block_sum); + } } - } -} -template < Index_type block_size > -__launch_bounds__(block_size) -__global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, - Index_ptr bins, - Index_type iend, - MultiReduceAtomicCalculator atomic_calc) -{ - extern __shared__ HISTOGRAM::Data_type shared_counts[]; - for (Index_type i = threadIdx.x; - i < Index_type(atomic_calc.num_bins() * atomic_calc.shared_replication()); - i += block_size) { - shared_counts[i] = HISTOGRAM::Data_type(0); - } - __syncthreads(); + } else { - { Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; if (i < iend) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, shared_counts, atomic_calc.get_shared_offset(bins[i], threadIdx.x), HISTOGRAM::Data_type(1)); - } - } - __syncthreads(); - - for (Index_type b = threadIdx.x; b < atomic_calc.num_bins(); b += block_size) { - auto block_sum = HISTOGRAM::Data_type(0); - for (Index_type s = 0; s < atomic_calc.shared_replication(); ++s) { - block_sum += shared_counts[atomic_calc.get_shared_offset(b, s)]; - } - if (block_sum != HISTOGRAM::Data_type(0)) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, global_counts, atomic_calc.get_global_offset(b, blockIdx.x), block_sum); + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); } } } @@ -267,9 +179,11 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid) } -template < size_t block_size, size_t global_replication, - bool warp_atomics, bool bunched_atomics > -void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) + +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > +void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -279,128 +193,48 @@ void HISTOGRAM::runHipVariantAtomicGlobal(VariantID vid) HISTOGRAM_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); - if ( vid == Base_HIP ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); + auto* func = &histogram_atomic_runtime; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - RPlaunchHipKernel( (histogram_atomic_global), - grid_size, block_size, - shmem, res.get_stream(), - counts, - bins, - iend ); + const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, 
hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - } - stopTimer(); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); - } else if ( vid == Lambda_HIP ) { + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - auto histogram_lambda = [=] __device__ (Index_type i) { - HISTOGRAM_GPU_RAJA_BODY(RAJA::hip_atomic, counts, HISTOGRAM_GPU_BIN_INDEX(bins[i], i, global_replication), HISTOGRAM::Data_type(1)); - }; - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - RPlaunchHipKernel( (lambda_hip_forall), + RPlaunchHipKernel( func, grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, histogram_lambda ); - - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); - - } - stopTimer(); - - } else if ( vid == RAJA_HIP ) { - - using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< - RAJA::hip::MultiReduceTuning< - RAJA::hip::multi_reduce_algorithm::init_host_combine_global_atomic, - void, - RAJA::hip::AtomicReplicationTuning< - RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< - RAJA::hip::ConstantPreferredReplicationConcretizer>, - std::conditional_t, RAJA::hip::block_xyz<>>, - std::conditional_t, RAJA::GetOffsetLeft>>>>; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - - RAJA::forall>( res, - RAJA::RangeSegment(ibegin, iend), - [=] __device__ (Index_type i) { - HISTOGRAM_BODY; - }); - - HISTOGRAM_FINALIZE_COUNTS_RAJA(multi_reduce_policy); - - } - stopTimer(); - - } else { - getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; - } - - RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); - -} - -template < size_t block_size, size_t shared_replication, size_t global_replication, - bool warp_atomics, bool bunched_atomics > -void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - HISTOGRAM_DATA_SETUP; - - - if ( vid == Base_HIP ) { - - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - const size_t shmem = num_bins*shared_replication*sizeof(Data_type); - - RPlaunchHipKernel( (histogram_atomic_shared_global), - grid_size, block_size, - shmem, res.get_stream(), - counts, - bins, - num_bins, - iend ); + counts, + bins, + iend, + num_bins, + shared_replication, + global_replication ); RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - HISTOGRAM_GPU_FINALIZE_COUNTS(hcounts, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + 
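[Aside, illustrative and not part of the patch: a standalone walk-through of the launch-time sizing computed a few lines above. The local next_pow2/prev_pow2 below mirror the helpers this series later removes from GPUUtils.hpp; the RAJA::next_pow2/RAJA::prev_pow2 calls in the patch are assumed to behave the same way for positive inputs, and the shared-memory limit stands in for the value queried via hipFuncGetAttributes:]

    #include <algorithm>
    #include <cstdio>

    using Index_type = long;

    Index_type next_pow2(Index_type n) // round up to a power of 2 (n >= 1)
    {
      Index_type p = 1;
      while (p < n) { p *= 2; }
      return p;
    }

    Index_type prev_pow2(Index_type n) // round down to a power of 2 (n >= 1)
    {
      Index_type p = 1;
      while (p * 2 <= n) { p *= 2; }
      return p;
    }

    int main()
    {
      const Index_type iend = 1000000, block_size = 256, num_bins = 10;
      const Index_type preferred_global = 16, preferred_shared = 4;
      const Index_type max_shmem_bytes = 65536; // example device limit
      const Index_type data_size = 8;           // sizeof(Data_type), e.g. double

      const Index_type grid_size = (iend + block_size - 1) / block_size;      // 3907
      const Index_type max_shared = max_shmem_bytes / data_size / num_bins;   // 819

      // Clamp the preferred replications to what the launch allows,
      // then snap each to a power of 2.
      const Index_type global_rep = next_pow2(std::min(preferred_global, grid_size)); // 16
      const Index_type shared_rep = prev_pow2(std::min(preferred_shared, max_shared)); // 4
      const Index_type shmem = shared_rep * num_bins * data_size;             // 320 bytes

      std::printf("grid=%ld global_rep=%ld shared_rep=%ld shmem=%ldB\n",
                  grid_size, global_rep, shared_rep, shmem);
      return 0;
    }

[A preferred shared replication of 0, the new entry in gpu_atomic_shared_replications_type, yields shared_replication == 0 and so selects the global-only branch of the kernel.]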
Data_type count_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + count_final += hcounts[offset]; + } + counts_final[bin] = count_final; + } } stopTimer(); @@ -414,14 +248,14 @@ void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) RAJA::hip::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, RAJA::hip::AtomicReplicationTuning< RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< - RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::ConstantPreferredReplicationConcretizer>, RAJA::hip::thread_xyz<>, RAJA::GetOffsetRight>, RAJA::hip::AtomicReplicationTuning< RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< - RAJA::hip::ConstantPreferredReplicationConcretizer>, - std::conditional_t, RAJA::hip::block_xyz<>>, - std::conditional_t, RAJA::GetOffsetLeft>>>>; + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -443,61 +277,6 @@ void HISTOGRAM::runHipVariantAtomicShared(VariantID vid) getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; } - -} - - -template < typename MultiReduceInfo > -void HISTOGRAM::runHipVariantAtomicRuntime(MultiReduceInfo info, VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type iend = getActualProblemSize(); - - auto res{getHipResource()}; - - HISTOGRAM_DATA_SETUP; - - static constexpr size_t block_size = info.block_size; - const size_t grid_size = info.grid_size; - const auto atomic_calc = info.atomic_calc; - const size_t global_replication = atomic_calc.global_replication(); - const size_t shmem = atomic_calc.shared_memory_in_bytes(); - - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); - - if ( vid == Base_HIP ) { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(counts_init, counts, hcounts, num_bins, global_replication); - - RPlaunchHipKernel( info.func, - grid_size, block_size, - shmem, res.get_stream(), - counts, - bins, - iend, - atomic_calc ); - - RAJAPERF_HIP_REDUCER_COPY_BACK(counts, hcounts, num_bins, global_replication); - for (Index_type b = 0; b < num_bins; ++b) { - Data_type count_final = 0; - for (size_t r = 0; r < global_replication; ++r) { - count_final += hcounts[atomic_calc.get_global_offset(b, r)]; - } - counts_final[b] = count_final; - } - - } - stopTimer(); - - } else { - getCout() << "\n HISTOGRAM : Unknown Hip variant id = " << vid << std::endl; - } - - RAJAPERF_HIP_REDUCER_TEARDOWN(counts, hcounts); - } @@ -529,138 +308,25 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); - - } - - t += 1; - - if ( vid == RAJA_HIP ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); - - } - - t += 1; + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { if (tune_idx == t) { setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); + runHipVariantAtomicRuntime(vid); } t += 1; - } - - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { 
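[Aside, illustrative and not part of the patch: the nested seq_for loops in runHipVariant and setHipTuningDefinitions enumerate one flat tuning index per (block size, global replication, shared replication) combination, in that nesting order. A plain host-side sketch of the counting scheme; the run_params validity filters are omitted here and both replication lists are truncated:]

    #include <array>
    #include <cstddef>
    #include <tuple>
    #include <vector>

    std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> enumerate_tunings()
    {
      // Values follow the lists in HISTOGRAM.hpp and the
      // -DRAJA_PERFSUITE_GPU_BLOCKSIZES build option.
      const std::array<std::size_t, 4> block_sizes{128, 256, 512, 1024};
      const std::array<std::size_t, 6> global_reps{1, 2, 4, 8, 16, 32};
      const std::array<std::size_t, 7> shared_reps{0, 1, 2, 4, 8, 16, 32};

      std::vector<std::tuple<std::size_t, std::size_t, std::size_t>> tunings;
      for (std::size_t bs : block_sizes)       // outer seq_for over block sizes
        for (std::size_t gr : global_reps)     // gpu_atomic_global_replications_type
          for (std::size_t sr : shared_reps)   // gpu_atomic_shared_replications_type
            tunings.emplace_back(bs, gr, sr);  // flat index == tunings.size() - 1
      return tunings;
    }
    // runHipVariant(vid, t) executes exactly the combination at flat index t,
    // and setHipTuningDefinitions registers names in the same order, e.g.
    // "atomic_shared(4)_global(16)_256" for (256, 16, 4).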
- - if ( vid == Base_HIP || vid == RAJA_HIP ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicShared(vid); - - } - - t += 1; - - } - - if ( vid == RAJA_HIP ) { - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicShared(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicShared(vid); - - } - - t += 1; - - if (tune_idx == t) { - - setBlockSize(block_size); - runHipVariantAtomicShared(vid); - - } - - t += 1; - - } - - if ( vid == Base_HIP ) { - - if (tune_idx == t) { - - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - ConstantModel{}, ConstantModel{}); - setBlockSize(block_size); - runHipVariantAtomicRuntime(info, vid); - - } - - t += 1; - - } - }); } }); - if ( vid == Base_HIP ) { - - if (tune_idx == t) { - - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - histogram_global_atomic_model{}, histogram_shared_atomic_model{}); - setBlockSize(block_size); - runHipVariantAtomicRuntime(info, vid); - - } - - t += 1; - - } - } }); @@ -693,45 +359,11 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">block_unbunched_"+std::to_string(block_size)); - - if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">warp_unbunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">block_bunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_global<"+std::to_string(global_replication)+ - ">warp_bunched_"+std::to_string(block_size)); - } - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - if ( vid == Base_HIP || vid == RAJA_HIP ) { - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">block_unbunched_"+std::to_string(block_size)); - } - - if ( vid == RAJA_HIP ) { - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">warp_unbunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">block_bunched_"+std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared<"+std::to_string(shared_replication)+ - ">_global<"+std::to_string(global_replication)+ - ">warp_bunched_"+std::to_string(block_size)); - } - - if ( vid == Base_HIP ) { - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - ConstantModel{}, ConstantModel{}); - auto name = info.get_name(); - addVariantTuningName(vid, name.c_str()); - } + addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ + ")_global("+std::to_string(global_replication)+ + ")_"+std::to_string(block_size)); }); @@ -739,14 +371,6 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) }); - if ( vid == Base_HIP ) { - histogram_info)> info( - histogram_atomic_runtime, getActualProblemSize(), m_num_bins, - histogram_global_atomic_model{}, histogram_shared_atomic_model{}); - auto name = info.get_name(); - addVariantTuningName(vid, name.c_str()); - } - } }); diff --git a/src/algorithm/HISTOGRAM.cpp 
b/src/algorithm/HISTOGRAM.cpp index 492a0b27c..51125eaea 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -50,11 +50,9 @@ HISTOGRAM::HISTOGRAM(const RunParams& params) setVariantDefined( Base_OpenMPTarget ); setVariantDefined( Base_CUDA ); - setVariantDefined( Lambda_CUDA ); setVariantDefined( RAJA_CUDA ); setVariantDefined( Base_HIP ); - setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index c09b966a6..a060693d1 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -107,18 +107,15 @@ class HISTOGRAM : public KernelBase void setHipTuningDefinitions(VariantID vid); void runCudaVariantLibrary(VariantID vid); void runHipVariantLibrary(VariantID vid); - template < size_t block_size, size_t global_replication, bool warp_atomics, bool bunched_atomics > - void runCudaVariantAtomicGlobal(VariantID vid); - template < size_t block_size, size_t global_replication, bool warp_atomics, bool bunched_atomics > - void runHipVariantAtomicGlobal(VariantID vid); - template < size_t block_size, size_t shared_replication, size_t global_replication, bool warp_atomics, bool bunched_atomics > - void runCudaVariantAtomicShared(VariantID vid); - template < size_t block_size, size_t shared_replication, size_t global_replication, bool warp_atomics, bool bunched_atomics > - void runHipVariantAtomicShared(VariantID vid); - template < typename MultiReduceInfo > - void runCudaVariantAtomicRuntime(MultiReduceInfo info, VariantID vid); - template < typename MultiReduceInfo > - void runHipVariantAtomicRuntime(MultiReduceInfo info, VariantID vid); + + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > + void runHipVariantAtomicRuntime(VariantID vid); private: static const size_t default_gpu_block_size = 256; @@ -126,7 +123,7 @@ class HISTOGRAM : public KernelBase static const size_t default_gpu_atomic_global_replication = 2048; // 512, 512 // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type; using gpu_atomic_global_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2*1024, 4*1024, 8*1024, 16*1024>; - using gpu_atomic_shared_replications_type = integer::list_type<1, 2, 4, 8, 16, 32>; + using gpu_atomic_shared_replications_type = integer::list_type<0, 1, 2, 4, 8, 16, 32>; Index_type m_num_bins; Index_ptr m_bins; @@ -134,139 +131,6 @@ class HISTOGRAM : public KernelBase std::vector m_counts_final; }; - -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) - -// Compute lhs % rhs between non-negative lhs and positive power of 2 rhs -template < typename L, typename R > -constexpr auto power_of_2_mod(L lhs, R rhs) noexcept -{ - return lhs & (rhs-1); -} - -template < size_t value > -struct ConstantModel -{ - static constexpr size_t get_replication(size_t RAJAPERF_UNUSED_ARG(parallelism)) noexcept - { - return value; - } -}; - -template < size_t cutoff, size_t value_before_cutoff, size_t value_after_cutoff > -struct CutoffModel -{ - static constexpr size_t get_replication(size_t parallelism) noexcept - { - return parallelism <= cutoff ? 
value_before_cutoff : value_after_cutoff; - } -}; - -template < typename T, typename IndexType > -struct MultiReduceAtomicCalculator -{ - template < typename SharedAtomicModel > - static constexpr IndexType calculate_shared_replication(IndexType num_bins, - IndexType threads_per_block, - IndexType max_shmem_per_block_in_bytes, - SharedAtomicModel shared_atomic_model) - { - IndexType shared_replication = shared_atomic_model.get_replication(threads_per_block); - IndexType max_shared_replication = max_shmem_per_block_in_bytes / sizeof(T) / num_bins; - return prev_pow2(std::min(shared_replication, max_shared_replication)); - } - - template < typename GlobalAtomicModel > - static constexpr IndexType calculate_global_replication(IndexType threads_per_block, - IndexType blocks_per_kernel, - GlobalAtomicModel global_atomic_model) - { - IndexType global_replication = global_atomic_model.get_replication(threads_per_block); - return next_pow2(std::min(global_replication, blocks_per_kernel)); - } - - template < typename GlobalAtomicModel, typename SharedAtomicModel > - constexpr MultiReduceAtomicCalculator(IndexType num_bins, - IndexType threads_per_block, - IndexType blocks_per_kernel, - IndexType max_shmem_per_block_in_bytes, - GlobalAtomicModel global_atomic_model, - SharedAtomicModel shared_atomic_model) - : m_num_bins(num_bins) - , m_shared_replication(calculate_shared_replication(num_bins, threads_per_block, max_shmem_per_block_in_bytes, shared_atomic_model)) - , m_global_replication(calculate_global_replication(threads_per_block, blocks_per_kernel, global_atomic_model)) - { } - - // get the shared memory usage in bytes - __host__ __device__ - constexpr IndexType shared_memory_in_bytes() const - { - return m_shared_replication * m_num_bins * sizeof(T); - } - - // get the number of bins - __host__ __device__ - constexpr IndexType num_bins() const - { - return m_num_bins; - } - - // get the shared replication, always a power of 2 - __host__ __device__ - constexpr IndexType shared_replication() const - { - return m_shared_replication; - } - - // get the global replication, always a power of 2 - __host__ __device__ - constexpr IndexType global_replication() const - { - return m_global_replication; - } - - // get the offset into shared memory - __host__ __device__ - constexpr IndexType get_shared_offset(IndexType bin, IndexType rep) const - { - // make rep stride-1 to avoid bank conflicts - return bin * shared_replication() + power_of_2_mod(rep, shared_replication()); - } - - // get the offset into global memory - __host__ __device__ - constexpr IndexType get_global_offset(IndexType bin, IndexType rep) const - { - // make bin stride-1 so atomics from a single block can coalesce - return bin + power_of_2_mod(rep, global_replication()) * num_bins(); - } - - template < typename IterFinal, typename IterGlobal, typename Op > - T combine_global(IndexType bin, IterGlobal counts_global, Op combiner) - { - T count_final = combiner.identity(); - for (IndexType rep = 0; rep < global_replication(); ++rep) { - combiner(count_final, counts_global[get_global_offset(bin, rep)]); - } - return count_final; - } - - template < typename IterFinal, typename IterGlobal, typename Op > - void combine_globals(IterFinal counts_final, IterGlobal counts_global, Op combiner) - { - for (IndexType bin = 0; bin < num_bins(); ++bin) { - counts_final[bin] = combine_global(bin, counts_global, combiner); - } - } - -private: - IndexType m_num_bins; - IndexType m_shared_replication; - IndexType m_global_replication; -}; - -#endif - } // 
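[Aside, illustrative and not part of the patch: compile-time checks sketching what the removed model templates did, assuming the ConstantModel and CutoffModel definitions above are still in scope. The models map available parallelism, here threads per block, to a preferred atomic replication; this patch replaces them with fixed default_*_replication constants clamped at launch time:]

    static_assert(CutoffModel<512, 32, 16>::get_replication(256) == 32,
                  "at or below the cutoff, use the first value");
    static_assert(CutoffModel<512, 32, 16>::get_replication(1024) == 16,
                  "above the cutoff, use the second value");
    static_assert(ConstantModel<4>::get_replication(999) == 4,
                  "constant model ignores parallelism");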
end namespace algorithm } // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index d22cdc733..3448067bf 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -21,23 +21,67 @@ namespace rajaperf namespace basic { -template < size_t block_size, size_t replication > +constexpr Index_type warp_size = 32; + +constexpr Index_type default_shared_replication = 16; +constexpr Index_type default_global_replication = 2; + + +template < Index_type block_size > __launch_bounds__(block_size) -__global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, - Index_ptr bins, - MULTI_REDUCE::Data_ptr data, - Index_type iend) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); - } -} + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + } else { + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + if (i < iend) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} -template < size_t block_size, size_t replication > -void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid) +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > +void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -47,71 +91,83 @@ void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid) MULTI_REDUCE_DATA_SETUP; - RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication); - if ( vid == Base_CUDA ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + auto* func = &multi_reduce_atomic_runtime; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / 
num_bins; - RPlaunchCudaKernel( (multi_reduce), - grid_size, block_size, - shmem, res.get_stream(), - values, - bins, - data, - iend ); + const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - } - stopTimer(); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); - } else if ( vid == Lambda_CUDA ) { + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); - - auto multi_reduce_lambda = [=] __device__ (Index_type i) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::cuda_atomic); - }; + RAJAPERF_CUDA_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - RPlaunchCudaKernel( (lambda_cuda_forall), + RPlaunchCudaKernel( func, grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, multi_reduce_lambda ); - - RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_CUDA_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } } stopTimer(); + RAJAPERF_CUDA_REDUCER_TEARDOWN(values, hvalues); + } else if ( vid == RAJA_CUDA ) { + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< + RAJA::cuda::MultiReduceTuning< + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::cuda::AtomicReplicationTuning< + RAJA::cuda::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::cuda::ConstantPreferredReplicationConcretizer>, + RAJA::cuda::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::cuda_multi_reduce_atomic); + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); - RAJA::forall< RAJA::cuda_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULTI_REDUCE_BODY; + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; }); - MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::cuda_multi_reduce_atomic); + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); } stopTimer(); @@ -120,34 +176,38 @@ void MULTI_REDUCE::runCudaVariantAtomicGlobal(VariantID vid) getCout() << "\n MULTI_REDUCE : Unknown Cuda variant id = " << vid << std::endl; } - 
RAJAPERF_CUDA_REDUCER_TEARDOWN(values, hvalues); - } void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + if ( vid == Base_CUDA || vid == RAJA_CUDA ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - setBlockSize(block_size); - runCudaVariantAtomicGlobal(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); - t += 1; + } + + t += 1; + + }); } @@ -167,30 +227,31 @@ void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) { - if ( vid == Base_CUDA || vid == Lambda_CUDA || vid == RAJA_CUDA ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ - "_global_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ + ")_global("+std::to_string(global_replication)+ + ")_"+std::to_string(block_size)); - } + }); - }); + } - } + }); - }); + } - } + }); } diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 08eb8132d..b82949f5f 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -21,23 +21,67 @@ namespace rajaperf namespace basic { -template < size_t block_size, size_t replication > +constexpr Index_type warp_size = 64; + +constexpr Index_type default_shared_replication = 4; +constexpr Index_type default_global_replication = 32; + + +template < Index_type block_size > __launch_bounds__(block_size) -__global__ void multi_reduce(MULTI_REDUCE::Data_ptr values, - Index_ptr bins, - MULTI_REDUCE::Data_ptr data, - Index_type iend) +__global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, + MULTI_REDUCE::Data_ptr data, + Index_ptr bins, + Index_type iend, + Index_type num_bins, + Index_type shared_replication, + Index_type global_replication) { - Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); - } -} + if (shared_replication > 0) { + + extern __shared__ MULTI_REDUCE::Data_type shared_values[]; + for (Index_type t = threadIdx.x; + t < Index_type(num_bins * shared_replication); + t += block_size) { + shared_values[t] = 
MULTI_REDUCE::Data_type(0); + } + __syncthreads(); + + { + Index_type i = blockIdx.x * block_size + threadIdx.x; + if (i < iend) { + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + RAJA::atomicAdd(&shared_values[offset], data[i]); + } + } + + __syncthreads(); + for (Index_type bin = threadIdx.x; bin < num_bins; bin += block_size) { + auto block_sum = MULTI_REDUCE::Data_type(0); + for (Index_type s = 0; s < shared_replication; ++s) { + block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; + } + if (block_sum != MULTI_REDUCE::Data_type(0)) { + Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], block_sum); + } + } + } else { + Index_type i = blockIdx.x * block_size + threadIdx.x; + Index_type warp = i / warp_size; + if (i < iend) { + Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; + RAJA::atomicAdd(&global_values[offset], data[i]); + } + } +} -template < size_t block_size, size_t replication > -void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid) +template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > +void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -47,71 +91,83 @@ void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid) MULTI_REDUCE_DATA_SETUP; - RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, replication); - if ( vid == Base_HIP ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); + auto* func = &multi_reduce_atomic_runtime; - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, (const void*)func)); + const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; + const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - RPlaunchHipKernel( (multi_reduce), - grid_size, block_size, - shmem, res.get_stream(), - values, - bins, - data, - iend ); + const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - } - stopTimer(); + const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); - } else if ( vid == Lambda_HIP ) { + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, replication); - - auto multi_reduce_lambda = [=] __device__ (Index_type i) { - MULTI_REDUCE_GPU_RAJA_BODY(RAJA::hip_atomic); - }; + RAJAPERF_HIP_REDUCER_INITIALIZE(values_init, values, hvalues, num_bins, global_replication); - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - constexpr size_t shmem = 0; - - RPlaunchHipKernel( 
(lambda_hip_forall), + RPlaunchHipKernel( func, grid_size, block_size, shmem, res.get_stream(), - ibegin, iend, multi_reduce_lambda ); - - RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, replication); - MULTI_REDUCE_GPU_FINALIZE_VALUES(hvalues, num_bins, replication); + values, + data, + bins, + iend, + num_bins, + shared_replication, + global_replication ); + + RAJAPERF_HIP_REDUCER_COPY_BACK(values, hvalues, num_bins, global_replication); + for (Index_type bin = 0; bin < num_bins; ++bin) { + Data_type value_final = Data_type(0); + for (Index_type r = 0; r < global_replication; ++r) { + Index_type offset = bin + RAJA::power_of_2_mod(r, global_replication) * num_bins; + value_final += hvalues[offset]; + } + values_final[bin] = value_final; + } } stopTimer(); + RAJAPERF_HIP_REDUCER_TEARDOWN(values, hvalues); + } else if ( vid == RAJA_HIP ) { + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< + RAJA::hip::MultiReduceTuning< + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::thread_xyz<>, + RAJA::GetOffsetRight>, + RAJA::hip::AtomicReplicationTuning< + RAJA::hip::GlobalAtomicReplicationMinPow2Concretizer< + RAJA::hip::ConstantPreferredReplicationConcretizer>, + RAJA::hip::warp_global_xyz<>, + RAJA::GetOffsetLeft>>>; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - MULTI_REDUCE_INIT_VALUES_RAJA(RAJA::hip_multi_reduce_atomic); + MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); - RAJA::forall< RAJA::hip_exec >( res, - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULTI_REDUCE_BODY; + RAJA::forall>( res, + RAJA::RangeSegment(ibegin, iend), + [=] __device__ (Index_type i) { + MULTI_REDUCE_BODY; }); - MULTI_REDUCE_FINALIZE_VALUES_RAJA(RAJA::hip_multi_reduce_atomic); + MULTI_REDUCE_FINALIZE_VALUES_RAJA(multi_reduce_policy); } stopTimer(); @@ -120,34 +176,38 @@ void MULTI_REDUCE::runHipVariantAtomicGlobal(VariantID vid) getCout() << "\n MULTI_REDUCE : Unknown Hip variant id = " << vid << std::endl; } - RAJAPERF_HIP_REDUCER_TEARDOWN(values, hvalues); - } void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) { size_t t = 0; - if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - setBlockSize(block_size); - runHipVariantAtomicGlobal(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); - t += 1; + } + + t += 1; + + }); } @@ -167,30 +227,31 @@ void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) { - if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if 
(run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { - if (run_params.numValidGPUBlockSize() == 0u || - run_params.validGPUBlockSize(block_size)) { + seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(gpu_atomic_replications_type{}, [&](auto replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(replication)) { + seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { - addVariantTuningName(vid, "replicate_"+std::to_string(replication)+ - "_global_"+std::to_string(block_size)); + addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ + ")_global("+std::to_string(global_replication)+ + ")_"+std::to_string(block_size)); - } + }); - }); + } - } + }); - }); + } - } + }); } diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 4d111444c..435ed9ab4 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -49,11 +49,9 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setVariantDefined( Base_OpenMPTarget ); setVariantDefined( Base_CUDA ); - setVariantDefined( Lambda_CUDA ); setVariantDefined( RAJA_CUDA ); setVariantDefined( Base_HIP ); - setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); setVariantDefined( Kokkos_Lambda ); diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index a85a5473a..0b0b745de 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -67,9 +67,6 @@ #define MULTI_REDUCE_RAJA_BODY(policy) \ RAJA::atomicAdd(&values[bins[i]], data[i]); -#define MULTI_REDUCE_GPU_RAJA_BODY(policy) \ - RAJA::atomicAdd(&values[bins[i]*replication + (i%replication)], data[i]); - #include "common/KernelBase.hpp" @@ -103,16 +100,22 @@ class MULTI_REDUCE : public KernelBase void setCudaTuningDefinitions(VariantID vid); void setHipTuningDefinitions(VariantID vid); - template < size_t block_size, size_t replication > - void runCudaVariantAtomicGlobal(VariantID vid); - template < size_t block_size, size_t replication > - void runHipVariantAtomicGlobal(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > + void runCudaVariantAtomicRuntime(VariantID vid); + template < Index_type block_size, + Index_type preferred_global_replication, + Index_type preferred_shared_replication > + void runHipVariantAtomicRuntime(VariantID vid); private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; static const size_t default_gpu_atomic_replication = 2048; // 512, 512 - using gpu_atomic_replications_type = integer::make_atomic_replication_list_type; + // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using gpu_atomic_global_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2*1024, 4*1024, 8*1024, 16*1024>; + using gpu_atomic_shared_replications_type = integer::list_type<0, 1, 2, 4, 8, 16, 32>; Index_type m_num_bins; Index_ptr m_bins; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 0061928f5..06c34a219 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -226,72 +226,6 @@ using reducer_helpers = camp::list< } // closing brace for gpu_mapping namespace - - -/*! 
- \brief evaluate log base 2 of n - - For positive n calculate log base 2 of n, and round the result down to the - nearest integer. - For zero or negative n return 0 - -*/ -template < typename T, - std::enable_if_t::value>* = nullptr > -constexpr T log2(T n) noexcept -{ - T result = 0; - if (n > 0) { - while(n >>= 1) { - ++result; - } - } - return result; -} - -/*! - \brief "round up" to the next greatest power of 2 - - For a integer n, - if n is non-negative, - if n is a power of 2, return n - if n is not a power of 2, return the next greater power of 2 - if n is negative, return 0 -*/ -template < typename T, - std::enable_if_t::value>* = nullptr > -constexpr T next_pow2(T n) noexcept -{ - --n; - for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { - n |= n >> s; - } - ++n; - return n; -} - -/*! - \brief "round down" to the next smallest power of 2 - - For a integer n, - if n is non-negative, - if n is a power of 2, return n - if n is not a power of 2, return the next smaller power of 2 - if n is negative, return 0 -*/ -template < typename T, - std::enable_if_t::value>* = nullptr > -constexpr T prev_pow2(T n) noexcept -{ - if ( n < 0 ) return 0; - for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { - n |= n >> s; - } - return n - (n >> 1); -} - - - } // closing brace for rajaperf namespace // Get the max number of blocks to launch with the given MappingHelper From 0a01bb4717b2b3bba01b2e175931bef21f649e9b Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 10 Jul 2024 11:05:38 -0700 Subject: [PATCH 415/454] Change tuning logic --- src/lcals/FIRST_MIN-OMP.cpp | 6 +++--- src/lcals/FIRST_MIN-OMPTarget.cpp | 6 +++--- src/lcals/FIRST_MIN-Seq.cpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lcals/FIRST_MIN-OMP.cpp b/src/lcals/FIRST_MIN-OMP.cpp index 95affda9a..a9a7f1ba1 100644 --- a/src/lcals/FIRST_MIN-OMP.cpp +++ b/src/lcals/FIRST_MIN-OMP.cpp @@ -105,9 +105,7 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { using VL_TYPE = RAJA::expt::ValLoc; @@ -129,6 +127,8 @@ void FIRST_MIN::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n FIRST_MIN : Unknown OpenMP tuning index = " << tune_idx << std::endl; } break; diff --git a/src/lcals/FIRST_MIN-OMPTarget.cpp b/src/lcals/FIRST_MIN-OMPTarget.cpp index eb8d6035b..578bdfe63 100644 --- a/src/lcals/FIRST_MIN-OMPTarget.cpp +++ b/src/lcals/FIRST_MIN-OMPTarget.cpp @@ -78,9 +78,7 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { using VL_TYPE = RAJA::expt::ValLoc; @@ -102,6 +100,8 @@ void FIRST_MIN::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n FIRST_MIN : Unknown OMP Target tuning index = " << tune_idx << std::endl; } } else { diff --git a/src/lcals/FIRST_MIN-Seq.cpp b/src/lcals/FIRST_MIN-Seq.cpp index aa0c0452a..a32ed4962 100644 --- a/src/lcals/FIRST_MIN-Seq.cpp +++ b/src/lcals/FIRST_MIN-Seq.cpp @@ -97,9 +97,7 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { using VL_TYPE = RAJA::expt::ValLoc; @@ -121,6 +119,8 @@ void FIRST_MIN::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n FIRST_MIN : Unknown Seq tuning index = " << tune_idx << std::endl; } break; From 96fe94ab6e2e09fbad57093f4447834b20266365 Mon Sep 17 
00:00:00 2001 From: Rich Hornung Date: Wed, 10 Jul 2024 11:06:48 -0700 Subject: [PATCH 416/454] Change tuning control logic --- src/algorithm/REDUCE_SUM-OMPTarget.cpp | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/algorithm/REDUCE_SUM-OMPTarget.cpp b/src/algorithm/REDUCE_SUM-OMPTarget.cpp index 1bab3b14a..1c1be1ab7 100644 --- a/src/algorithm/REDUCE_SUM-OMPTarget.cpp +++ b/src/algorithm/REDUCE_SUM-OMPTarget.cpp @@ -74,29 +74,29 @@ void REDUCE_SUM::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } else if (tune_idx == 1) { + } else if (tune_idx == 1) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Real_type tsum = m_sum_init; + Real_type tsum = m_sum_init; - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - RAJA::expt::Reduce(&tsum), - [=] (Index_type i, Real_type& sum) { - REDUCE_SUM_BODY; - } - ); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + RAJA::expt::Reduce(&tsum), + [=] (Index_type i, Real_type& sum) { + REDUCE_SUM_BODY; + } + ); - m_sum = static_cast(tsum); + m_sum = static_cast(tsum); - } - stopTimer(); + } + stopTimer(); - } else { - getCout() << "\n REDUCE_SUM : Unknown OpenMP Target tuning index = " << tune_idx << std::endl; - } + } else { + getCout() << "\n REDUCE_SUM : Unknown OMP Target tuning index = " << tune_idx << std::endl; + } } else { getCout() << "\n REDUCE_SUM : Unknown OMP Target variant id = " << vid << std::endl; From 171330eb112af340a62a79cf053bb6d48a74472a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 10 Jul 2024 11:09:46 -0700 Subject: [PATCH 417/454] Change tuning control logic --- src/stream/DOT-OMP.cpp | 8 +++----- src/stream/DOT-OMPTarget.cpp | 6 +++--- src/stream/DOT-Seq.cpp | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index 295437c1f..e1012b0fe 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -64,8 +64,6 @@ void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx) #pragma omp parallel for reduction(+:dot) for (Index_type i = ibegin; i < iend; ++i ) { dot += dot_base_lam(i); - } - m_dot += dot; } @@ -93,9 +91,7 @@ void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -115,6 +111,8 @@ void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n DOT : Unknown OpenMP tuning index = " << tune_idx << std::endl; } break; diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp index 123442c4f..238f8fbae 100644 --- a/src/stream/DOT-OMPTarget.cpp +++ b/src/stream/DOT-OMPTarget.cpp @@ -77,9 +77,7 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -99,6 +97,8 @@ void DOT::runOpenMPTargetVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n DOT : Unknown OMP Target tuning index = " << tune_idx << std::endl; } break; diff --git a/src/stream/DOT-Seq.cpp b/src/stream/DOT-Seq.cpp index c639f848e..4d359775f 100644 --- a/src/stream/DOT-Seq.cpp +++ b/src/stream/DOT-Seq.cpp @@ -93,9 +93,7 @@ void DOT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); - } - - if (tune_idx == 
1) { + } else if (tune_idx == 1) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,6 +112,8 @@ void DOT::runSeqVariant(VariantID vid, size_t tune_idx) } stopTimer(); + } else { + getCout() << "\n DOT : Unknown Seq tuning index = " << tune_idx << std::endl; } break; From 7cb61f38f9d3a0afc55f0380e4417aae5ca10d37 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 10 Jul 2024 14:10:29 -0700 Subject: [PATCH 418/454] Update build scripts --- .../blueos_nvcc_clang-mpi_caliper.sh | 75 +++++++++++++++++++ .../lc-builds/blueos_nvcc_clang_caliper.sh | 2 +- .../lc-builds/blueos_nvcc_gcc-mpi_caliper.sh | 75 +++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100755 scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh create mode 100755 scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh diff --git a/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh new file mode 100755 index 000000000..14118494a --- /dev/null +++ b/scripts/lc-builds/blueos_nvcc_clang-mpi_caliper.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJAPerf/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +if [[ $# -lt 5 ]]; then + echo + echo "You must pass 5 arguments to the script (in this order): " + echo " 1) compiler version number for nvcc" + echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)" + echo " 3) compiler version number for clang. " + echo " 4) path to caliper cmake directory" + echo " 5) path to adiak cmake directory" + echo + echo "For example: " + echo " blueos_nvcc_clang-mpi_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak" + exit +fi + +COMP_NVCC_VER=$1 +COMP_ARCH=$2 +COMP_CLANG_VER=$3 +CALI_DIR=$4 +ADIAK_DIR=$5 +shift 5 + +BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-clang-mpi-${COMP_CLANG_VER}-caliper +RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_clang_X.cmake + +echo +echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it" +echo "Configuration extra arguments:" +echo " $@" +echo + +rm -rf build_${BUILD_SUFFIX} >/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +module load cmake/3.20.2 + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \ + -DBLT_CXX_STD=c++14 \ + -C ${RAJA_HOSTCONFIG} \ + -DENABLE_MPI=ON \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_SEPARABLE_COMPILATION=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ + -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ + -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ + -DRAJA_PERFSUITE_USE_CALIPER=ON \ + -Dcaliper_DIR=${CALI_DIR} \ + -Dadiak_DIR=${ADIAK_DIR} \ + -DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. 
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite"
+echo
+echo "  Please note that you have to disable CUDA GPU hooks when you run"
+echo "  the RAJA Perf Suite; for example,"
+echo
+echo "    lrun -1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe"
+echo
+echo "***********************************************************************"
diff --git a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh
index c13a40c25..238b9a30e 100755
--- a/scripts/lc-builds/blueos_nvcc_clang_caliper.sh
+++ b/scripts/lc-builds/blueos_nvcc_clang_caliper.sh
@@ -17,7 +17,7 @@ if [[ $# -lt 5 ]]; then
   echo "   5) path to adiak cmake directory"
   echo
   echo "For example: "
-  echo "    blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/adiak-lassen/lib/cmake/adiak"
+  echo "    blueos_nvcc_clang_caliper.sh 10.2.89 70 10.0.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak"
   exit
 fi
diff --git a/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh
new file mode 100755
index 000000000..9fdcdb3a7
--- /dev/null
+++ b/scripts/lc-builds/blueos_nvcc_gcc-mpi_caliper.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2017-24, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJAPerf/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 5 ]]; then
+  echo
+  echo "You must pass 5 arguments to the script (in this order): "
+  echo "   1) compiler version number for nvcc"
+  echo "   2) CUDA compute architecture (number only, not 'sm_70' for example)"
+  echo "   3) compiler version number for gcc"
+  echo "   4) path to caliper cmake directory"
+  echo "   5) path to adiak cmake directory"
+  echo
+  echo "For example: "
+  echo "    blueos_nvcc_gcc-mpi_caliper.sh 10.2.89 70 8.3.1 /usr/workspace/wsb/asde/caliper-lassen/share/cmake/caliper /usr/workspace/wsb/asde/caliper-lassen/lib/cmake/adiak"
+  exit
+fi
+
+COMP_NVCC_VER=$1
+COMP_ARCH=$2
+COMP_GCC_VER=$3
+CALI_DIR=$4
+ADIAK_DIR=$5
+shift 5
+
+BUILD_SUFFIX=lc_blueos-nvcc-${COMP_NVCC_VER}-${COMP_ARCH}-gcc-${COMP_GCC_VER}-mpi-caliper
+RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/nvcc_gcc_X.cmake
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} >/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.20.2
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
+  -C ${RAJA_HOSTCONFIG} \
+  -DENABLE_MPI=ON \
+  -DENABLE_OPENMP=On \
+  -DENABLE_CUDA=On \
+  -DCUDA_SEPARABLE_COMPILATION=On \
+  -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \
+  -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \
+  -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  -DRAJA_PERFSUITE_USE_CALIPER=ON \
+  -Dcaliper_DIR=${CALI_DIR} \
+  -Dadiak_DIR=${ADIAK_DIR} \
+
-DRAJA_PERFSUITE_GPU_BLOCKSIZES=128,256,512,1024 \ + "$@" \ + .. + +echo +echo "***********************************************************************" +echo +echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA Perf Suite" +echo +echo " Please note that you have to disable CUDA GPU hooks when you run" +echo " the RAJA Perf Suite; for example," +echo +echo " lrun -1 --smpiargs="-disable_gpu_hooks" ./bin/raja-perf.exe" +echo +echo "***********************************************************************" From 74515fca58453e22786ee2e936bed9501359a4fc Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 10 Jul 2024 14:20:33 -0700 Subject: [PATCH 419/454] Fix compilation errors --- src/basic/REDUCE3_INT-OMP.cpp | 3 +-- src/stream/DOT-OMP.cpp | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/basic/REDUCE3_INT-OMP.cpp b/src/basic/REDUCE3_INT-OMP.cpp index 8fe6e52e4..c9848ac98 100644 --- a/src/basic/REDUCE3_INT-OMP.cpp +++ b/src/basic/REDUCE3_INT-OMP.cpp @@ -136,8 +136,7 @@ void REDUCE3_INT::runOpenMPVariant(VariantID vid, size_t tune_idx) m_vmax = RAJA_MAX(m_vmax, static_cast(tvmax)); } - } - stopTimer(); + stopTimer(); } else { getCout() << "\n REDUCE3_INT : Unknown OpenMP tuning index = " << tune_idx << std::endl; diff --git a/src/stream/DOT-OMP.cpp b/src/stream/DOT-OMP.cpp index e1012b0fe..d7112336a 100644 --- a/src/stream/DOT-OMP.cpp +++ b/src/stream/DOT-OMP.cpp @@ -64,6 +64,8 @@ void DOT::runOpenMPVariant(VariantID vid, size_t tune_idx) #pragma omp parallel for reduction(+:dot) for (Index_type i = ibegin; i < iend; ++i ) { dot += dot_base_lam(i); + } + m_dot += dot; } From 1190af131bfe281aa899f6ae0a156cc8097b6631 Mon Sep 17 00:00:00 2001 From: Michael Richard Mckinsey Date: Wed, 10 Jul 2024 19:40:26 -0700 Subject: [PATCH 420/454] Make more readable --- src/common/KernelBase.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 248200e71..82b69dba4 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -632,12 +632,22 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, } )json"; - if(!ran_spot_config_check && ((!addToSpotConfig.empty()) || (!addToCaliConfig.empty()))) { + // Skip check if both empty + if ((!addToSpotConfig.empty() || !addToCaliConfig.empty()) && !ran_spot_config_check) { cali::ConfigManager cm; - std::string check_profile = "spot(" + addToSpotConfig + ")"; - if (!addToCaliConfig.empty()) { - check_profile += "," + addToCaliConfig; + std::string check_profile; + // If both not empty + if (!addToSpotConfig.empty() && !addToCaliConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")," + addToCaliConfig; + } + else if (!addToSpotConfig.empty()) { + check_profile = "spot(" + addToSpotConfig + ")"; } + // if !addToCaliConfig.empty() + else { + check_profile = addToCaliConfig; + } + std::string msg = cm.check(check_profile.c_str()); if(!msg.empty()) { std::cerr << "Problem with Cali Config: " << check_profile << "\n"; From 1f8d94ba7f42b7b793d75df0007d3761575ee9dd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 11 Jul 2024 13:58:21 -0700 Subject: [PATCH 421/454] Update RAJA branch --- src/algorithm/HISTOGRAM-Cuda.cpp | 2 +- src/algorithm/HISTOGRAM-Hip.cpp | 2 +- src/basic/MULTI_REDUCE-Cuda.cpp | 2 +- src/basic/MULTI_REDUCE-Hip.cpp | 2 +- tpl/RAJA | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 
eb656f693..e0852d2e8 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -216,7 +216,7 @@ void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< RAJA::cuda::MultiReduceTuning< - RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, RAJA::cuda::AtomicReplicationTuning< RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< RAJA::cuda::ConstantPreferredReplicationConcretizer>, diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 0f595164a..376ea9ad3 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -245,7 +245,7 @@ void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< RAJA::hip::MultiReduceTuning< - RAJA::hip::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, RAJA::hip::AtomicReplicationTuning< RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< RAJA::hip::ConstantPreferredReplicationConcretizer>, diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 3448067bf..605c9a3fc 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -144,7 +144,7 @@ void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< RAJA::cuda::MultiReduceTuning< - RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, RAJA::cuda::AtomicReplicationTuning< RAJA::cuda::SharedAtomicReplicationMaxPow2Concretizer< RAJA::cuda::ConstantPreferredReplicationConcretizer>, diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index b82949f5f..6fa09853c 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -144,7 +144,7 @@ void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< RAJA::hip::MultiReduceTuning< - RAJA::hip::multi_reduce_algorithm::init_host_combine_block_then_grid_atomic, + RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, RAJA::hip::AtomicReplicationTuning< RAJA::hip::SharedAtomicReplicationMaxPow2Concretizer< RAJA::hip::ConstantPreferredReplicationConcretizer>, diff --git a/tpl/RAJA b/tpl/RAJA index dfaab80f7..361b37fe2 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit dfaab80f75dc8f9fc872de233ffa13b104ebac55 +Subproject commit 361b37fe2c9330b6196cd4341aa9ba6de4609702 From 32c18949ee2907893799e5d0977b8b406efd5e66 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 11 Jul 2024 14:00:01 -0700 Subject: [PATCH 422/454] Add options for num_bins and bin assignment --- src/algorithm/HISTOGRAM.cpp | 15 ++++++--- src/basic/MULTI_REDUCE.cpp | 64 ++++++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 51125eaea..0a819a78e 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -13,6 +13,7 @@ #include "common/DataUtils.hpp" #include +#include namespace rajaperf { @@ -28,7 +29,8 @@ HISTOGRAM::HISTOGRAM(const 
RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  m_num_bins = 100;
+  const char* e_num_bins = getenv("RAJAPERF_MULTI_REDUCE_NUM_BINS");
+  m_num_bins = e_num_bins ? atoi(e_num_bins) : 10;
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
@@ -68,10 +70,13 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid);
 
-  bool init_even_sizes = false;
-  bool init_random_sizes = false;
-  bool init_all_one = true;
-  bool init_random_per_iterate = false;
+  const char* e_algorithm = getenv("RAJAPERF_MULTI_REDUCE_BIN_ASSIGNMENT");
+  const int algorithm = e_algorithm ? atoi(e_algorithm) : 0;
+  const bool init_random_per_iterate = algorithm == 0;
+  const bool init_random_sizes = algorithm == 1;
+  const bool init_even_sizes = algorithm == 2;
+  const bool init_all_one = algorithm == 3;
+
   if (init_even_sizes || init_random_sizes || init_all_one) {
     Real_ptr data = nullptr;
     if (init_even_sizes) {
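
To make the new RAJAPERF_MULTI_REDUCE_BIN_ASSIGNMENT modes concrete before the
matching MULTI_REDUCE changes below: modes 1-3 build a sorted cutoff array in
[0,1] and walk it once to assign each iterate a bin. A minimal stand-alone
sketch of that cutoff walk (assignBins and the std:: types are illustrative
stand-ins for the suite's Real_type/Index_type code, not suite API):

    #include <vector>

    // Assign each of n iterates a bin from sorted cutoffs in [0, 1], as in the
    // init_even_sizes / init_random_sizes / init_all_one paths.
    std::vector<int> assignBins(const std::vector<double>& cutoffs, int n)
    {
      std::vector<int> bins(n);
      const int num_bins = static_cast<int>(cutoffs.size());
      int bin = 0;
      for (int i = 0; i < n; ++i) {
        const double pos = static_cast<double>(i) / n;  // position in [0, 1)
        while (bin + 1 < num_bins && pos >= cutoffs[bin]) {
          bin += 1;  // advance to the first bin whose cutoff exceeds pos
        }
        bins[i] = bin;
      }
      return bins;
    }

Even cutoffs ((b+1)/num_bins) spread iterates evenly across bins; all-zero
cutoffs send every iterate to the last bin, which is the degenerate case the
init_all_one mode exercises; mode 0 instead draws a random bin per iterate.
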
diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp
index 435ed9ab4..0e179a7ec 100644
--- a/src/basic/MULTI_REDUCE.cpp
+++ b/src/basic/MULTI_REDUCE.cpp
@@ -12,6 +12,9 @@
 
 #include "common/DataUtils.hpp"
 
+#include <algorithm>
+#include <cstdlib>
+
 namespace rajaperf
 {
 namespace basic
@@ -26,7 +29,8 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  m_num_bins = 10;
+  const char* e_num_bins = getenv("RAJAPERF_MULTI_REDUCE_NUM_BINS");
+  m_num_bins = e_num_bins ? atoi(e_num_bins) : 10;
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
@@ -67,16 +71,60 @@ void MULTI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
   allocAndInitDataRandValue(m_data, getActualProblemSize(), vid);
   {
     auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid);
-    auto reset_data = scopedMoveData(m_data, getActualProblemSize(), vid);
 
-    for (Index_type i = 0; i < getActualProblemSize(); ++i) {
-      m_bins[i] = static_cast<Index_type>(m_data[i] * m_num_bins);
-      if (m_bins[i] >= m_num_bins) {
-        m_bins[i] = m_num_bins - 1;
+    const char* e_algorithm = getenv("RAJAPERF_MULTI_REDUCE_BIN_ASSIGNMENT");
+    const int algorithm = e_algorithm ? atoi(e_algorithm) : 0;
+    const bool init_random_per_iterate = algorithm == 0;
+    const bool init_random_sizes = algorithm == 1;
+    const bool init_even_sizes = algorithm == 2;
+    const bool init_all_one = algorithm == 3;
+
+    if (init_even_sizes || init_random_sizes || init_all_one) {
+      Real_ptr data = nullptr;
+      if (init_even_sizes) {
+        allocData(data, m_num_bins, Base_Seq);
+        for (Index_type b = 0; b < m_num_bins; ++b) {
+          data[b] = static_cast<Real_type>(b+1) / m_num_bins;
+        }
+      } else if (init_random_sizes) {
+        allocAndInitDataRandValue(data, m_num_bins, Base_Seq);
+        std::sort(data, data+m_num_bins);
+      } else if (init_all_one) {
+        allocData(data, m_num_bins, Base_Seq);
+        for (Index_type b = 0; b < m_num_bins; ++b) {
+          data[b] = static_cast<Real_type>(0);
+        }
       }
-      if (m_bins[i] < 0) {
-        m_bins[i] = 0;
+
+      Index_type actual_prob_size = getActualProblemSize();
+      Index_type bin = 0;
+      for (Index_type i = 0; i < actual_prob_size; ++i) {
+        Real_type pos = static_cast<Real_type>(i) / actual_prob_size;
+        while (bin+1 < m_num_bins && pos >= data[bin]) {
+          bin += 1;
+        }
+        m_bins[i] = bin;
       }
+
+      deallocData(data, Base_Seq);
+
+    } else if (init_random_per_iterate) {
+      Real_ptr data;
+      allocAndInitDataRandValue(data, getActualProblemSize(), Base_Seq);
+
+      for (Index_type i = 0; i < getActualProblemSize(); ++i) {
+        m_bins[i] = static_cast<Index_type>(data[i] * m_num_bins);
+        if (m_bins[i] >= m_num_bins) {
+          m_bins[i] = m_num_bins - 1;
+        }
+        if (m_bins[i] < 0) {
+          m_bins[i] = 0;
+        }
+      }
+
+      deallocData(data, Base_Seq);
+    } else {
+      throw 1;
+    }
   }
 }

From 7e78fd3724632fa51516d84a780f33a1ea982faf Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 12 Jul 2024 11:09:09 -0700
Subject: [PATCH 423/454] Update to use same defaults as RAJA

---
 src/algorithm/HISTOGRAM-Cuda.cpp |  8 ++++----
 src/algorithm/HISTOGRAM-Hip.cpp  |  8 ++++----
 src/algorithm/HISTOGRAM.hpp      | 14 ++++++++++----
 src/basic/MULTI_REDUCE-Cuda.cpp  |  8 ++++----
 src/basic/MULTI_REDUCE-Hip.cpp   |  8 ++++----
 src/basic/MULTI_REDUCE.hpp       | 14 ++++++++++----
 6 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp
index e0852d2e8..a0f5ea766 100644
--- a/src/algorithm/HISTOGRAM-Cuda.cpp
+++ b/src/algorithm/HISTOGRAM-Cuda.cpp
@@ -274,12 +274,12 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx)
     if (run_params.numValidGPUBlockSize() == 0u ||
         run_params.validGPUBlockSize(block_size)) {
 
-      seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) {
+      seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) {
 
         if (run_params.numValidAtomicReplication() == 0u ||
             run_params.validAtomicReplication(global_replication)) {
 
-          seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) {
+          seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) {
 
             if (tune_idx == t) {
 
@@ -325,12 +325,12 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid)
     if (run_params.numValidGPUBlockSize() == 0u ||
         run_params.validGPUBlockSize(block_size)) {
 
-      seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) {
+      seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) {
 
         if (run_params.numValidAtomicReplication() == 0u ||
             run_params.validAtomicReplication(global_replication)) {
 
-          seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) {
+          seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) {
 
             addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+
")_global("+std::to_string(global_replication)+ diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 376ea9ad3..331e2e745 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -303,12 +303,12 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { if (tune_idx == t) { @@ -354,12 +354,12 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ ")_global("+std::to_string(global_replication)+ diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index a060693d1..8d8fcd985 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -120,10 +120,16 @@ class HISTOGRAM : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_gpu_atomic_global_replication = 2048; // 512, 512 - // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using gpu_atomic_global_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2*1024, 4*1024, 8*1024, 16*1024>; - using gpu_atomic_shared_replications_type = integer::list_type<0, 1, 2, 4, 8, 16, 32>; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; Index_ptr m_bins; diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 605c9a3fc..e6a28b8d9 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -189,12 +189,12 @@ void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if 
(run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { if (tune_idx == t) { @@ -232,12 +232,12 @@ void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ ")_global("+std::to_string(global_replication)+ diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 6fa09853c..4f2eb32f5 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -189,12 +189,12 @@ void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { if (tune_idx == t) { @@ -232,12 +232,12 @@ void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(gpu_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || run_params.validAtomicReplication(global_replication)) { - seq_for(gpu_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ ")_global("+std::to_string(global_replication)+ diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 0b0b745de..3f022f261 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -112,10 +112,16 @@ class MULTI_REDUCE : public KernelBase private: static const size_t default_gpu_block_size = 256; using gpu_block_sizes_type = integer::make_gpu_block_size_list_type; - static const size_t default_gpu_atomic_replication = 2048; // 512, 512 - // using gpu_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using gpu_atomic_global_replications_type = integer::list_type<1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2*1024, 4*1024, 8*1024, 16*1024>; - using gpu_atomic_shared_replications_type = integer::list_type<0, 1, 2, 4, 8, 16, 32>; + + static const size_t default_cuda_atomic_global_replication = 2; + static const size_t default_cuda_atomic_shared_replication = 16; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using 
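
The seq_for calls above walk these compile-time lists and pair each entry with
a runtime tuning index. Reduced to essentials, the pattern looks like this (a
sketch with a hand-rolled list type, not RAJA's internal machinery):

    #include <cstddef>
    #include <iostream>
    #include <type_traits>

    template < size_t... Is >
    struct list_type {};

    // Invoke body once per compile-time value, in order.
    template < typename Body, size_t... Is >
    void seq_for(list_type<Is...>, Body&& body)
    {
      (body(std::integral_constant<size_t, Is>{}), ...);
    }

    int main()
    {
      size_t t = 0;
      const size_t tune_idx = 2;
      seq_for(list_type<1, 2, 4, 8>{}, [&](auto replication) {
        if (t == tune_idx) {  // run only the variant registered at this index
          std::cout << "running with replication " << replication() << "\n";
        }
        t += 1;
      });
      return 0;
    }
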
cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + + static const size_t default_hip_atomic_global_replication = 32; + static const size_t default_hip_atomic_shared_replication = 4; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type; + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; Index_ptr m_bins; From ffaaaab76d38d5bd3cb80dcb94a109133d5fc3dc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 12 Jul 2024 12:41:24 -0700 Subject: [PATCH 424/454] Update RAJA to develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 361b37fe2..c1cffa924 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 361b37fe2c9330b6196cd4341aa9ba6de4609702 +Subproject commit c1cffa924db3ab3e00de5fb91c8b54f9eabe1d96 From 8b9110b9f81240f5544def584223dfe391274b69 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 12 Jul 2024 14:03:59 -0700 Subject: [PATCH 425/454] Cast indices before use in pow2_mod --- src/algorithm/HISTOGRAM-Cuda.cpp | 4 ++-- src/algorithm/HISTOGRAM-Hip.cpp | 4 ++-- src/basic/MULTI_REDUCE-Cuda.cpp | 4 ++-- src/basic/MULTI_REDUCE-Hip.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index a0f5ea766..4b82a3c74 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -52,7 +52,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); } } @@ -64,7 +64,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; } if (block_sum != HISTOGRAM::Data_type(0)) { - Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; RAJA::atomicAdd(&global_counts[offset], block_sum); } } diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 331e2e745..1a85f5006 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -57,7 +57,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); } } @@ -69,7 +69,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, block_sum += shared_counts[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; } if (block_sum != HISTOGRAM::Data_type(0)) { - Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * 
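
On the casts this commit adds: threadIdx.x and blockIdx.x are unsigned int, so
passing them straight to a function template alongside an Index_type argument
deduces conflicting types. Braced-init into Index_type pins the deduction (the
power_of_2_mod shape below is an assumption for illustration, not RAJA's exact
definition):

    template < typename T >
    constexpr T power_of_2_mod(T val, T mod)
    {
      return val & (mod - 1);  // valid when mod is a power of 2
    }

    // power_of_2_mod(threadIdx.x, global_replication);              // error:
    //   T deduced as both unsigned int and Index_type
    // power_of_2_mod(Index_type{threadIdx.x}, global_replication);  // OK, T = Index_type
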
num_bins; RAJA::atomicAdd(&global_counts[offset], block_sum); } } diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index e6a28b8d9..1d82e2775 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -50,7 +50,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_values[offset], data[i]); } } @@ -62,7 +62,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; } if (block_sum != MULTI_REDUCE::Data_type(0)) { - Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; RAJA::atomicAdd(&global_values[offset], block_sum); } } diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 4f2eb32f5..9fcd62299 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -50,7 +50,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values { Index_type i = blockIdx.x * block_size + threadIdx.x; if (i < iend) { - Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(threadIdx.x, shared_replication); + Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_values[offset], data[i]); } } @@ -62,7 +62,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values block_sum += shared_values[bin * shared_replication + RAJA::power_of_2_mod(s, shared_replication)]; } if (block_sum != MULTI_REDUCE::Data_type(0)) { - Index_type offset = bin + RAJA::power_of_2_mod(blockIdx.x, global_replication) * num_bins; + Index_type offset = bin + RAJA::power_of_2_mod(Index_type{blockIdx.x}, global_replication) * num_bins; RAJA::atomicAdd(&global_values[offset], block_sum); } } From a0a8cc18a947a67e29672e05984b5a4af0257158 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 17:03:49 -0700 Subject: [PATCH 426/454] Split bytesPerRep into bytesReadPerRep, bytesWrittenPerRep, and bytesAtomicModifyWrittenPerRep Avoid negative itsPerRep in FIR --- src/algorithm/ATOMIC.cpp | 5 ++-- src/algorithm/MEMCPY.cpp | 4 ++- src/algorithm/MEMSET.cpp | 5 ++-- src/algorithm/REDUCE_SUM.cpp | 5 ++-- src/algorithm/SCAN.cpp | 4 ++- src/algorithm/SCAN.hpp | 7 +++-- src/algorithm/SORT.cpp | 4 ++- src/algorithm/SORTPAIRS.cpp | 4 ++- src/apps/CONVECTION3DPA.cpp | 9 +++--- src/apps/DEL_DOT_VEC_2D.cpp | 7 +++-- src/apps/DIFFUSION3DPA.cpp | 9 +++--- src/apps/EDGE3D.cpp | 7 ++--- src/apps/ENERGY.cpp | 23 ++++++++++----- src/apps/FIR-Cuda.cpp | 2 +- src/apps/FIR-Hip.cpp | 2 +- src/apps/FIR-OMP.cpp | 2 +- src/apps/FIR-OMPTarget.cpp | 2 +- src/apps/FIR-Seq.cpp | 2 +- src/apps/FIR-Sycl.cpp | 2 +- src/apps/FIR.cpp | 12 ++++---- src/apps/LTIMES.cpp | 8 +++-- src/apps/LTIMES_NOVIEW.cpp | 8 +++-- src/apps/MASS3DEA.cpp | 7 +++-- src/apps/MASS3DPA.cpp | 10 +++---- src/apps/MATVEC_3D_STENCIL.cpp | 9 +++--- src/apps/NODAL_ACCUMULATION_3D.cpp | 8 +++-- src/apps/PRESSURE.cpp | 7 +++-- 
src/apps/VOL3D.cpp | 5 ++-- src/apps/ZONAL_ACCUMULATION_3D.cpp | 7 +++-- src/basic/ARRAY_OF_PTRS.cpp | 4 ++- src/basic/COPY8.cpp | 4 ++- src/basic/DAXPY.cpp | 4 ++- src/basic/DAXPY_ATOMIC.cpp | 4 ++- src/basic/IF_QUAD.cpp | 4 ++- src/basic/INDEXLIST.cpp | 8 +++-- src/basic/INDEXLIST_3LOOP.cpp | 17 +++++++---- src/basic/INIT3.cpp | 4 ++- src/basic/INIT_VIEW1D.cpp | 4 ++- src/basic/INIT_VIEW1D_OFFSET.cpp | 4 ++- src/basic/MAT_MAT_SHARED.cpp | 5 ++-- src/basic/MULADDSUB.cpp | 4 ++- src/basic/NESTED_INIT.cpp | 4 ++- src/basic/PI_ATOMIC.cpp | 5 ++-- src/basic/PI_REDUCE.cpp | 5 ++-- src/basic/REDUCE3_INT.cpp | 6 ++-- src/basic/REDUCE_STRUCT.cpp | 5 +++- src/basic/TRAP_INT.cpp | 5 ++-- src/comm/HALO_EXCHANGE.cpp | 19 ++++++++---- src/comm/HALO_EXCHANGE_FUSED.cpp | 19 ++++++++---- src/comm/HALO_PACKING.cpp | 13 ++++++--- src/comm/HALO_PACKING_FUSED.cpp | 13 ++++++--- src/comm/HALO_SENDRECV.cpp | 5 ++-- src/common/Executor.cpp | 34 ++++++++++++++++++++-- src/common/KernelBase.cpp | 29 ++++++++++++++++-- src/common/KernelBase.hpp | 16 ++++++++-- src/lcals/DIFF_PREDICT.cpp | 4 ++- src/lcals/EOS.cpp | 6 ++-- src/lcals/FIRST_DIFF.cpp | 6 ++-- src/lcals/FIRST_MIN.cpp | 9 ++++-- src/lcals/FIRST_SUM.cpp | 5 ++-- src/lcals/GEN_LIN_RECUR.cpp | 7 +++-- src/lcals/HYDRO_1D.cpp | 6 ++-- src/lcals/HYDRO_2D.cpp | 12 ++++---- src/lcals/INT_PREDICT.cpp | 4 ++- src/lcals/PLANCKIAN.cpp | 4 ++- src/lcals/TRIDIAG_ELIM.cpp | 4 ++- src/polybench/POLYBENCH_2MM.cpp | 14 +++++---- src/polybench/POLYBENCH_3MM.cpp | 21 +++++++------ src/polybench/POLYBENCH_ADI.cpp | 7 +++-- src/polybench/POLYBENCH_ATAX.cpp | 13 +++++---- src/polybench/POLYBENCH_FDTD_2D.cpp | 25 ++++++++++------ src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 4 ++- src/polybench/POLYBENCH_GEMM.cpp | 7 +++-- src/polybench/POLYBENCH_GEMVER.cpp | 22 +++++++++----- src/polybench/POLYBENCH_GESUMMV.cpp | 6 ++-- src/polybench/POLYBENCH_HEAT_3D.cpp | 15 +++++----- src/polybench/POLYBENCH_JACOBI_1D.cpp | 15 +++++----- src/polybench/POLYBENCH_JACOBI_2D.cpp | 15 +++++----- src/polybench/POLYBENCH_MVT.cpp | 13 ++++++--- src/stream/ADD.cpp | 5 ++-- src/stream/COPY.cpp | 5 ++-- src/stream/DOT.cpp | 7 +++-- src/stream/MUL.cpp | 5 ++-- src/stream/TRIAD.cpp | 5 ++-- 84 files changed, 462 insertions(+), 239 deletions(-) diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp index b2054c937..8da1c2421 100644 --- a/src/algorithm/ATOMIC.cpp +++ b/src/algorithm/ATOMIC.cpp @@ -28,8 +28,9 @@ ATOMIC::ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setFLOPsPerRep(getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index 84203bf03..f8ced7ac7 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -28,7 +28,9 @@ MEMCPY::MEMCPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/MEMSET.cpp 
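
Reading the new accounting above: MEMCPY streams one value in and one out per
element, while ATOMIC's only traffic is the atomic update, which now lands in
its own counter rather than being folded into reads and writes. A small sketch
of how the three counters relate (ByteCounts and the type aliases are
illustrative; only the per-kernel numbers come from the hunks):

    #include <cstddef>

    using Index_type = long;    // stand-ins for the suite's typedefs
    using Real_type  = double;

    struct ByteCounts {
      std::size_t read = 0, written = 0, atomic = 0;
      std::size_t total() const { return read + written + atomic; }  // bandwidth basis
    };

    // MEMCPY (y[i] = x[i]), per the hunk above:
    ByteCounts memcpyBytes(Index_type n)
    {
      return { 1*sizeof(Real_type) * static_cast<std::size_t>(n),   // read x
               1*sizeof(Real_type) * static_cast<std::size_t>(n),   // write y
               0 };
    }

    // ATOMIC, per the hunk above: everything is the atomic read-modify-write.
    ByteCounts atomicBytes(Index_type n)
    {
      return { 0, 0, 1*sizeof(Real_type) * static_cast<std::size_t>(n) };
    }
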
b/src/algorithm/MEMSET.cpp index ece31a1ad..d35332d58 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -28,8 +28,9 @@ MEMSET::MEMSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Real_type) + 1*sizeof(Real_type)) + - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 1fb1b78f7..4aebb5b0f 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -28,8 +28,9 @@ REDUCE_SUM::REDUCE_SUM(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * (1+getActualProblemSize()) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index 30cb534df..a5c04abc4 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -28,7 +28,9 @@ SCAN::SCAN(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); checksum_scale_factor = 1e-2 * diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index 102e02981..f55381d21 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -10,9 +10,10 @@ /// SCAN kernel reference implementation: /// /// // exclusive scan -/// y[ibegin] = 0; -/// for (Index_type i = ibegin+1; i < iend; ++i) { -/// y[i] = y[i-1] + x[i-1]; +/// Real_type scan_var = 0; +/// for (Index_type i = ibegin; i < iend; ++i) { +/// y[i] = scan_var; +/// scan_var += x[i]; /// } /// diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index 55441375b..bc99df634 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -28,7 +28,9 @@ SORT::SORT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 0e903e116..6315970b2 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -28,7 +28,9 @@ SORTPAIRS::SORTPAIRS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); // touched data size, not actual number of stores and loads + 
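
A quick check of the rewritten SCAN reference above: the running-sum form
yields the usual exclusive scan. A self-contained version with made-up data
(std::vector<double> standing in for the suite's arrays):

    #include <cstddef>
    #include <vector>

    // Exclusive scan in the running-sum form of the updated reference.
    std::vector<double> exclusiveScan(const std::vector<double>& x)
    {
      std::vector<double> y(x.size());
      double scan_var = 0;
      for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = scan_var;   // sum of x[0..i-1]
        scan_var += x[i];
      }
      return y;            // x = {1, 2, 3} -> y = {0, 1, 3}
    }
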
setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); // not useful in this case due to O(n*log(n)) algorithm + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Sort); diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index b99238078..bbbd14db4 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -35,10 +35,11 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 3*CPA_Q1D*CPA_D1D*sizeof(Real_type) + - CPA_VDIM*CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE*sizeof(Real_type) + - CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) + - CPA_D1D*CPA_D1D*CPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 3*sizeof(Real_type) * CPA_Q1D*CPA_D1D + // b, bt, g + 2*sizeof(Real_type) * CPA_D1D*CPA_D1D*CPA_D1D*m_NE + // x, y + CPA_VDIM*sizeof(Real_type) * CPA_Q1D*CPA_Q1D*CPA_Q1D*m_NE ); // d + setBytesWrittenPerRep( 1*sizeof(Real_type) + CPA_D1D*CPA_D1D*CPA_D1D*m_NE ); // y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * ( 4 * CPA_D1D * CPA_Q1D * CPA_D1D * CPA_D1D + //2 diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 1d211daf3..b4c041e46 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -37,9 +37,10 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 0*sizeof(Real_type) ) * getItsPerRep() + - (0*sizeof(Real_type) + 4*sizeof(Real_type) ) * m_domain->n_real_nodes ) ; // touched data size, not actual number of stores and loads + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 4*sizeof(Real_type) * m_domain->n_real_nodes ); // 4 variables with 2d nodal stencil pattern: 4 touches per iterate + setBytesWrittenPerRep( 1*sizeof(Index_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(54 * m_domain->n_real_zones); setUsesFeature(Forall); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 0c6e57e2e..68cd57ad2 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -35,10 +35,11 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 2*DPA_Q1D*DPA_D1D*sizeof(Real_type) + - DPA_Q1D*DPA_Q1D*DPA_Q1D*SYM*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) + - DPA_D1D*DPA_D1D*DPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * DPA_Q1D*DPA_D1D + // b, g + 2*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE + // x, y + SYM*sizeof(Real_type) * DPA_Q1D*DPA_Q1D*DPA_Q1D*m_NE ); // d + setBytesWrittenPerRep( 1*sizeof(Real_type) * DPA_D1D*DPA_D1D*DPA_D1D*m_NE ); // y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (DPA_Q1D * DPA_D1D + 5 * DPA_D1D * DPA_D1D * DPA_Q1D * DPA_D1D + diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 0e8b7c3ea..3bd5dcce4 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -40,10 +40,9 @@ EDGE3D::EDGE3D(const RunParams& params) // touched data size, not actual number of stores and loads // see VOL3D.cpp - size_t reads_per_node = 3*sizeof(Real_type); - size_t writes_per_zone = 1*sizeof(Real_type); - setBytesPerRep( writes_per_zone * 
getItsPerRep() + - reads_per_node * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); constexpr size_t flops_k_loop = 15 + 6*flops_Jxx() diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index bf5e35c5e..7f480d00f 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -29,13 +29,22 @@ ENERGY::ENERGY(const RunParams& params) setItsPerRep( 6 * getActualProblemSize() ); setKernelsPerRep(6); // some branches are never taken due to the nature of the initialization of delvc - // the additional reads and writes that would be done if those branches were taken are noted in the comments - setBytesPerRep( (1*sizeof(Real_type) + 5*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 8 */ - (1*sizeof(Real_type) + 6*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 7*sizeof(Real_type)) * getActualProblemSize() + /* 1 + 12 */ - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); /* 1 + 8 */ + // the additional reads that would be done if those branches were taken are noted in the comments + setBytesReadPerRep((5*sizeof(Real_type) + + 1*sizeof(Real_type) + // 8 + 6*sizeof(Real_type) + + 2*sizeof(Real_type) + + 7*sizeof(Real_type) + // 12 + 1*sizeof(Real_type) // 8 + ) * getActualProblemSize() ); + setBytesWrittenPerRep((1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 1*sizeof(Real_type) + + 0*sizeof(Real_type) + ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((6 + 11 + // 1 sqrt 8 + diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp index 32ebd761b..9605d85b1 100644 --- a/src/apps/FIR-Cuda.cpp +++ b/src/apps/FIR-Cuda.cpp @@ -83,7 +83,7 @@ void FIR::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getCudaResource()}; diff --git a/src/apps/FIR-Hip.cpp b/src/apps/FIR-Hip.cpp index 2627b2c5e..a3272cb23 100644 --- a/src/apps/FIR-Hip.cpp +++ b/src/apps/FIR-Hip.cpp @@ -81,7 +81,7 @@ void FIR::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); auto res{getHipResource()}; diff --git a/src/apps/FIR-OMP.cpp b/src/apps/FIR-OMP.cpp index 5475e0061..5fcad1616 100644 --- a/src/apps/FIR-OMP.cpp +++ b/src/apps/FIR-OMP.cpp @@ -25,7 +25,7 @@ void FIR::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - m_coefflen; + const Index_type iend = getActualProblemSize(); FIR_COEFF; diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp index a7f476aa0..3ba913846 100644 --- a/src/apps/FIR-OMPTarget.cpp +++ b/src/apps/FIR-OMPTarget.cpp @@ -43,7 +43,7 @@ void FIR::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize() - 
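
The FIR edits in this commit change the boundary treatment: instead of trimming
the iteration space by the filter length, the input is allocated with
m_coefflen-1 extra entries (see the FIR.cpp hunk below), so every output
element sees a full filter window. In reference form (a sketch; the vectors
stand in for the suite's allocations):

    #include <vector>

    using Index_type = long;    // stand-ins for the suite's typedefs
    using Real_type  = double;

    // Reference FIR: out has length N, in has length N + coefflen - 1.
    void fir(const std::vector<Real_type>& in,
             const std::vector<Real_type>& coeff,
             std::vector<Real_type>& out)
    {
      const Index_type N = static_cast<Index_type>(out.size());
      const Index_type coefflen = static_cast<Index_type>(coeff.size());
      for (Index_type i = 0; i < N; ++i) {
        Real_type sum = 0.0;
        for (Index_type j = 0; j < coefflen; ++j) {
          sum += coeff[j] * in[i + j];   // in-bounds given the sizing above
        }
        out[i] = sum;
      }
    }
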
m_coefflen;
+  const Index_type iend = getActualProblemSize();
 
   FIR_DATA_SETUP;
 
diff --git a/src/apps/FIR-Seq.cpp b/src/apps/FIR-Seq.cpp
index 59594798e..001ffd194 100644
--- a/src/apps/FIR-Seq.cpp
+++ b/src/apps/FIR-Seq.cpp
@@ -23,7 +23,7 @@ void FIR::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
-  const Index_type iend = getActualProblemSize() - m_coefflen;
+  const Index_type iend = getActualProblemSize();
 
   FIR_COEFF;
 
diff --git a/src/apps/FIR-Sycl.cpp b/src/apps/FIR-Sycl.cpp
index 74b23f8d0..eee240a5f 100644
--- a/src/apps/FIR-Sycl.cpp
+++ b/src/apps/FIR-Sycl.cpp
@@ -42,7 +42,7 @@ void FIR::runSyclVariantImpl(VariantID vid)
 {
   const Index_type run_reps = getRunReps();
   const Index_type ibegin = 0;
-  const Index_type iend = getActualProblemSize() - m_coefflen;
+  const Index_type iend = getActualProblemSize();
 
   auto res{getSyclResource()};
   auto qu = res.get_queue();
diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp
index 96cc28296..f4a2de7e8 100644
--- a/src/apps/FIR.cpp
+++ b/src/apps/FIR.cpp
@@ -28,11 +28,13 @@ FIR::FIR(const RunParams& params)
 
   setActualProblemSize( getTargetProblemSize() );
 
-  setItsPerRep( getActualProblemSize() - m_coefflen );
+  setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
-  setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() );
-  setFLOPsPerRep((2 * m_coefflen) * (getActualProblemSize() - m_coefflen));
+  setBytesReadPerRep( m_coefflen*sizeof(Real_type) +
+                      1*sizeof(Real_type) * (getActualProblemSize() + m_coefflen-1) );
+  setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() );
+  setBytesAtomicModifyWrittenPerRep( 0 );
+  setFLOPsPerRep((2 * m_coefflen) * getActualProblemSize());
 
   checksum_scale_factor = 0.0001 *
                           ( static_cast<Checksum_type>(getDefaultProblemSize()) /
@@ -67,7 +69,7 @@ FIR::~FIR()
 
 void FIR::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 {
-  allocAndInitData(m_in, getActualProblemSize(), vid);
+  allocAndInitData(m_in, getActualProblemSize() + m_coefflen-1, vid);
   allocAndInitDataConst(m_out, getActualProblemSize(), 0.0, vid);
 }
 
diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp
index 83ba52774..f52eaea4f 100644
--- a/src/apps/LTIMES.cpp
+++ b/src/apps/LTIMES.cpp
@@ -47,9 +47,11 @@ LTIMES::LTIMES(const RunParams& params)
 
   setItsPerRep( getActualProblemSize() );
   setKernelsPerRep(1);
   // using total data size instead of writes and reads
-  setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_philen +
-                  (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_elllen +
-                  (0*sizeof(Real_type) +
1*sizeof(Real_type)) * m_psilen ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_philen + + 1*sizeof(Real_type) * m_elllen + + 1*sizeof(Real_type) * m_psilen ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_philen ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_num_z * m_num_g * m_num_m * m_num_d); checksum_scale_factor = 0.001 * diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 503ccd72d..9beaddba0 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -37,9 +37,10 @@ MASS3DEA::MASS3DEA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MEA_Q1D*MEA_D1D*sizeof(Real_type) + // B - MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE*sizeof(Real_type) + // D - ea_mat_entries*m_NE*sizeof(Real_type) ); // M_e + setBytesReadPerRep( 1*sizeof(Real_type) * MEA_Q1D*MEA_D1D + // B + 1*sizeof(Real_type) * MEA_Q1D*MEA_Q1D*MEA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * ea_mat_entries*m_NE ); // M_e + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * 7 * ea_mat_entries); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 4f918d49c..6f0058d63 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -35,11 +35,11 @@ MASS3DPA::MASS3DPA(const RunParams& params) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_D1D*sizeof(Real_type) + - MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) + - MPA_D1D*MPA_D1D*MPA_D1D*m_NE*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * MPA_Q1D*MPA_D1D + // B, Bt + 2*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE + // X, Y + 1*sizeof(Real_type) * MPA_Q1D*MPA_Q1D*MPA_Q1D*m_NE ); // D + setBytesWrittenPerRep( 1*sizeof(Real_type) * MPA_D1D*MPA_D1D*MPA_D1D*m_NE ); // Y + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_NE * (2 * MPA_D1D * MPA_D1D * MPA_D1D * MPA_Q1D + 2 * MPA_D1D * MPA_D1D * MPA_Q1D * MPA_Q1D + diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index 95215e90d..ea01e9ed4 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -69,10 +69,11 @@ MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) get_size_matrix(1, 1, 1) + get_size_matrix(0, 1, 1) + get_size_matrix(1, 1, 1) ; - setBytesPerRep( getItsPerRep()*sizeof(Index_type) + - b_accessed*sizeof(Real_type) + - x_accessed*sizeof(Real_type) + - m_accessed*sizeof(Real_type) ); + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 1*sizeof(Real_type) * x_accessed + + 1*sizeof(Real_type) * m_accessed ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * b_accessed ); + setBytesAtomicModifyWrittenPerRep( 0 ); const size_t multiplies = 27; const size_t adds = 26; diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index fc32eba04..124ef01e1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -39,9 +39,11 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads - setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 1*sizeof(Real_type) * 
getItsPerRep() + + 1*sizeof(Real_type) * m_domain->n_real_nodes); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_domain->n_real_nodes ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getItsPerRep()); checksum_scale_factor = 0.001 * diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index a37130072..ed1e00306 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -28,8 +28,11 @@ PRESSURE::PRESSURE(const RunParams& params) setItsPerRep( 2 * getActualProblemSize() ); setKernelsPerRep(2); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + - (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + + 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() + + 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((2 + 1 ) * getActualProblemSize()); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 45cd4b0a9..c56e77f3d 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -38,8 +38,9 @@ VOL3D::VOL3D(const RunParams& params) setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Real_type) + 3*sizeof(Real_type)) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesReadPerRep( 3*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); checksum_scale_factor = 0.001 * diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 5b0f4c20d..ce47e2057 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -39,9 +39,10 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); // touched data size, not actual number of stores and loads - setBytesPerRep( (0*sizeof(Index_type) + 1*sizeof(Index_type)) * getItsPerRep() + - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_domain->n_real_nodes); + setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + + 1*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(8 * getItsPerRep()); checksum_scale_factor = 0.001 * diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 17f6e5c1c..f95c7e384 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -30,7 +30,9 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + m_array_size*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( m_array_size*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(m_array_size * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 3f6e49044..ce8847032 100644 --- a/src/basic/COPY8.cpp 
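
One convention worth noting before the next hunks: for kernels built on
atomics, the whole read-modify-write is charged to the atomic counter, not to
reads or writes. The DAXPY_ATOMIC hunk below is the clearest instance (N stands
in for getActualProblemSize()):

    // y[i] += a * x[i] via atomicAdd, per the DAXPY_ATOMIC hunk below:
    //   setBytesReadPerRep( 1*sizeof(Real_type) * N );                 // x[i]
    //   setBytesWrittenPerRep( 0 );                                    // no plain stores
    //   setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * N );  // RMW on y[i]
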
+++ b/src/basic/COPY8.cpp @@ -28,7 +28,9 @@ COPY8::COPY8(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (8*sizeof(Real_type) + 8*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 8*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index c324a8df5..fafb9bb66 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -28,7 +28,9 @@ DAXPY::DAXPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index a311589f5..24ec906c4 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -28,7 +28,9 @@ DAXPY_ATOMIC::DAXPY_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 70c90adc5..58ccb9f58 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -28,7 +28,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type) + 3*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(11 * getActualProblemSize()); // 1 sqrt checksum_scale_factor = 0.0001 * diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index cb559c8b2..0336d0643 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -28,9 +28,11 @@ INDEXLIST::INDEXLIST(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 + // about 50% output - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Int_type) * getActualProblemSize() / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 49117dc66..1759f10b0 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -28,14 +28,19 @@ INDEXLIST_3LOOP::INDEXLIST_3LOOP(const RunParams& params) setItsPerRep( 3 * getActualProblemSize() + 1 ); setKernelsPerRep(3); - setBytesPerRep( (1*sizeof(Int_type) + 0*sizeof(Int_type)) * 
getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getActualProblemSize() + + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (1*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * (getActualProblemSize()+1) + - (1*sizeof(Int_type) + 0*sizeof(Int_type)) * getActualProblemSize() / 2 ); // about 50% output + 1*sizeof(Index_type) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Index_type) * getActualProblemSize() + + + 1*sizeof(Index_type) + + 1*sizeof(Index_type) * (getActualProblemSize()+1) + + + 1*sizeof(Int_type) * (getActualProblemSize()+1) / 2 ); // about 50% output + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 902de8ec6..1f0da97f3 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -28,7 +28,9 @@ INIT3::INIT3(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 3c101f6ce..b853507bd 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -28,7 +28,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index aa89b6112..323cc7f2e 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -28,7 +28,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index b1ad9d127..e448bfa8e 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -31,8 +31,9 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams ¶ms) setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( m_N*m_N*sizeof(Real_type) + - m_N*m_N*sizeof(Real_type) ); + setBytesReadPerRep( 2*sizeof(Real_type) * m_N*m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * m_N*m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); const Index_type no_tiles = (TL_SZ + m_N - 1) / TL_SZ; const Index_type no_blocks = RAJA_DIVIDE_CEILING_INT(m_N, TL_SZ); diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 09a310275..4ab19194c 100644 --- 
a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -28,7 +28,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index f827bd7f4..e76c59610 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -39,7 +39,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * getActualProblemSize()); setUsesFeature(Kernel); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 40321c919..5a6a5bc04 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -28,8 +28,9 @@ PI_ATOMIC::PI_ATOMIC(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 0 ); + setBytesWrittenPerRep( 0 ); + setBytesAtomicModifyWrittenPerRep( 1*sizeof(Real_type) ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 2ae27e762..a258ae8cd 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -28,8 +28,9 @@ PI_REDUCE::PI_REDUCE(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(6 * getActualProblemSize() + 1); setUsesFeature(Forall); diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 670209336..3be262b77 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -33,8 +33,10 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (3*sizeof(Int_type) + 3*sizeof(Int_type)) + - (0*sizeof(Int_type) + 1*sizeof(Int_type)) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Int_type) + + 1*sizeof(Int_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 3*sizeof(Int_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize() + 1); setUsesFeature(Forall); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 2fd5f9a3f..764e82f67 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -33,7 +33,10 @@ REDUCE_STRUCT::REDUCE_STRUCT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( 6*sizeof(Real_type) + 2*sizeof(Real_type)*getActualProblemSize()); + setBytesReadPerRep( 6*sizeof(Real_type) + + 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 
6*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize() + 2); diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 859a72815..09da695ea 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -28,8 +28,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(10 * getActualProblemSize()); // 1 sqrt setUsesFeature(Forall); diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp index abfed942b..00a3bc5c4 100644 --- a/src/comm/HALO_EXCHANGE.cpp +++ b/src/comm/HALO_EXCHANGE.cpp @@ -31,12 +31,19 @@ HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp index 11651755f..b691f0df7 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -31,12 +31,19 @@ HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + // recv - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // send + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() + // recv + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Workgroup); diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index 4c0326b1f..3287d1280 
100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -25,10 +25,15 @@ HALO_PACKING::HALO_PACKING(const RunParams& params) setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 * s_num_neighbors * m_num_vars ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index 2e683b90b..bbc58f581 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -25,10 +25,15 @@ HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 2 ); - setBytesPerRep( (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // pack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // pack - (0*sizeof(Int_type) + 1*sizeof(Int_type) ) * getItsPerRep() + // unpack - (1*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() ); // unpack + setBytesReadPerRep( 1*sizeof(Int_type) * getItsPerRep() + // pack + 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Int_type) * getItsPerRep() + // unpack + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() + // pack + + 1*sizeof(Real_type) * getItsPerRep() ); // unpack + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Workgroup); diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp index 3e0990248..a7e2c51cc 100644 --- a/src/comm/HALO_SENDRECV.cpp +++ b/src/comm/HALO_SENDRECV.cpp @@ -31,8 +31,9 @@ HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); setKernelsPerRep( 0 ); - setBytesPerRep( (0*sizeof(Real_type) + 1*sizeof(Real_type)) * getItsPerRep() + // send - (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() ); // recv + setBytesReadPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // send + setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); // recv + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index bd9d43392..a79b2d2f2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -522,15 +522,21 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const Index_type itsrep_width = 0; Index_type bytesrep_width = 0; Index_type flopsrep_width = 0; + Index_type bytesReadrep_width = 0; + Index_type bytesWrittenrep_width = 0; + Index_type bytesAtomicModifyWrittenrep_width = 0; Index_type dash_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { kercol_width = max(kercol_width, kernels[ik]->getName().size()); psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); reps_width = max(reps_width, 
kernels[ik]->getRunReps()); - itsrep_width = max(reps_width, kernels[ik]->getItsPerRep()); + itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); - flopsrep_width = max(bytesrep_width, kernels[ik]->getFLOPsPerRep()); + flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); + bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); + bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep()); + bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep()); } const string sepchr(" , "); @@ -574,6 +580,24 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const static_cast<Index_type>(frsize) ) + 3; dash_width += flopsrep_width + static_cast<Index_type>(sepchr.size()); + double brrsize = log10( static_cast<double>(bytesReadrep_width) ); + string bytesReadrep_head("BytesRead/rep"); + bytesReadrep_width = max( static_cast<Index_type>(bytesReadrep_head.size()), + static_cast<Index_type>(brrsize) ) + 3; + dash_width += bytesReadrep_width + static_cast<Index_type>(sepchr.size()); + + double bwrsize = log10( static_cast<double>(bytesWrittenrep_width) ); + string bytesWrittenrep_head("BytesWritten/rep"); + bytesWrittenrep_width = max( static_cast<Index_type>(bytesWrittenrep_head.size()), + static_cast<Index_type>(bwrsize) ) + 3; + dash_width += bytesWrittenrep_width + static_cast<Index_type>(sepchr.size()); + + double bamrrsize = log10( static_cast<double>(bytesAtomicModifyWrittenrep_width) ); + string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep"); + bytesAtomicModifyWrittenrep_width = max( static_cast<Index_type>(bytesAtomicModifyWrittenrep_head.size()), + static_cast<Index_type>(bamrrsize) ) + 3; + dash_width += bytesAtomicModifyWrittenrep_width + static_cast<Index_type>(sepchr.size()); + str << sepchr << kernels[ik]->getKernelsPerRep() << sepchr << kernels[ik]->getBytesPerRep() << sepchr << kernels[ik]->getFLOPsPerRep() + << sepchr << kernels[ik]->getBytesReadPerRep() + << sepchr << kernels[ik]->getBytesWrittenPerRep() + << sepchr << kernels[ik]->getBytesAtomicModifyWrittenPerRep() << endl; } diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 3d0e55302..2d00b948b 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -38,7 +38,9 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) its_per_rep = -1; kernels_per_rep = -1; - bytes_per_rep = -1; + bytes_read_per_rep = -1; + bytes_written_per_rep = -1; + bytes_atomic_modify_written_per_rep = -1; FLOPs_per_rep = -1; running_variant = NumVariants; @@ -69,6 +71,18 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS); + Bytes_Read_Rep_attr = cali_create_attribute("BytesRead/Rep", CALI_TYPE_DOUBLE, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Bytes_Written_Rep_attr = cali_create_attribute("BytesWritten/Rep", CALI_TYPE_DOUBLE, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); + Bytes_AtomicModifyWritten_Rep_attr = cali_create_attribute("BytesAtomicModifyWritten/Rep", CALI_TYPE_DOUBLE, + CALI_ATTR_ASVALUE | + CALI_ATTR_AGGREGATABLE | + CALI_ATTR_SKIP_EVENTS); Flops_Rep_attr = cali_create_attribute("Flops/Rep", CALI_TYPE_DOUBLE, CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | @@ -493,7 +507,9 @@ void KernelBase::print(std::ostream& os) const } os << "\t\t\t its_per_rep = " << its_per_rep << std::endl; os << "\t\t\t kernels_per_rep = " << kernels_per_rep << std::endl; - os << "\t\t\t bytes_per_rep = " << bytes_per_rep << std::endl; + os << "\t\t\t bytes_read_per_rep = " <<
bytes_read_per_rep << std::endl; + os << "\t\t\t bytes_written_per_rep = " << bytes_written_per_rep << std::endl; + os << "\t\t\t bytes_atomic_modify_written_per_rep = " << bytes_atomic_modify_written_per_rep << std::endl; os << "\t\t\t FLOPs_per_rep = " << FLOPs_per_rep << std::endl; os << "\t\t\t num_exec: " << std::endl; for (unsigned j = 0; j < NumVariants; ++j) { @@ -548,6 +564,9 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx) cali_set_double(Iters_Rep_attr,(double)getItsPerRep()); cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep()); cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep()); + cali_set_double(Bytes_Read_Rep_attr,(double)getBytesReadPerRep()); + cali_set_double(Bytes_Written_Rep_attr,(double)getBytesWrittenPerRep()); + cali_set_double(Bytes_AtomicModifyWritten_Rep_attr,(double)getBytesAtomicModifyWrittenPerRep()); cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep()); cali_set_double(BlockSize_attr, getBlockSize()); for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { @@ -590,6 +609,9 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" }, { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" }, + { "expr": "any(max#BytesRead/Rep)", "as": "BytesRead/Rep" }, + { "expr": "any(max#BytesWritten/Rep)", "as": "BytesWritten/Rep" }, + { "expr": "any(max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" }, { "expr": "any(max#BlockSize)", "as": "BlockSize" }, { "expr": "any(max#Forall)", "as": "FeatureForall" }, @@ -613,6 +635,9 @@ void KernelBase::setCaliperMgrVariantTuning(VariantID vid, { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" }, { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" }, { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" }, + { "expr": "any(any#max#BytesRead/Rep)", "as": "BytesRead/Rep" }, + { "expr": "any(any#max#BytesWritten/Rep)", "as": "BytesWritten/Rep" }, + { "expr": "any(any#max#BytesAtomicModifyWritten/Rep)", "as": "BytesAtomicModifyWritten/Rep" }, { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" }, { "expr": "any(any#max#BlockSize)", "as": "BlockSize" }, { "expr": "any(any#max#Forall)", "as": "FeatureForall" }, diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 4feda83d9..e778a6918 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -102,7 +102,9 @@ class KernelBase void setDefaultReps(Index_type reps) { default_reps = reps; } void setItsPerRep(Index_type its) { its_per_rep = its; }; void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; }; - void setBytesPerRep(Index_type bytes) { bytes_per_rep = bytes;} + void setBytesReadPerRep(Index_type bytes) { bytes_read_per_rep = bytes;} + void setBytesWrittenPerRep(Index_type bytes) { bytes_written_per_rep = bytes;} + void setBytesAtomicModifyWrittenPerRep(Index_type bytes) { bytes_atomic_modify_written_per_rep = bytes;} void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; } void setBlockSize(Index_type size) { kernel_block_size = size; } @@ -155,7 +157,10 @@ class KernelBase Index_type getDefaultReps() const { return default_reps; } Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; - Index_type getBytesPerRep() const { return bytes_per_rep; } + Index_type getBytesPerRep() const { return bytes_read_per_rep +
bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getBytesReadPerRep() const { return bytes_read_per_rep; } + Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; } + Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } double getBlockSize() const { return kernel_block_size; } @@ -549,7 +554,9 @@ class KernelBase // Index_type its_per_rep; Index_type kernels_per_rep; - Index_type bytes_per_rep; + Index_type bytes_read_per_rep; + Index_type bytes_written_per_rep; + Index_type bytes_atomic_modify_written_per_rep; Index_type FLOPs_per_rep; double kernel_block_size = nan(""); // Set default value for non GPU kernels @@ -568,6 +575,9 @@ class KernelBase cali_id_t Iters_Rep_attr; cali_id_t Kernels_Rep_attr; cali_id_t Bytes_Rep_attr; + cali_id_t Bytes_Read_Rep_attr; + cali_id_t Bytes_Written_Rep_attr; + cali_id_t Bytes_AtomicModifyWritten_Rep_attr; cali_id_t Flops_Rep_attr; cali_id_t BlockSize_attr; std::map Feature_attrs; diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 344300b56..40ff30713 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -28,7 +28,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (10*sizeof(Real_type) + 10*sizeof(Real_type)) * getActualProblemSize()); + setBytesReadPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 10*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(9 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index f9b6b0158..a9076c144 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -31,8 +31,10 @@ EOS::EOS(const RunParams& params) setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_array_length ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() + + 1*sizeof(Real_type) * m_array_length ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(16 * getActualProblemSize()); checksum_scale_factor = 0.0001 * diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 337c96939..aa5aaa31a 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -28,11 +28,11 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) m_N = getActualProblemSize()+1; - setItsPerRep( getActualProblemSize() ); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getActualProblemSize() + - (0*sizeof(Real_type) + 1*sizeof(Real_type)) * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 4cc3e276c..63a3be8df 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -33,9 +33,12 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setItsPerRep( 
getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Index_type) + 1*sizeof(Index_type)) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) + + 1*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 1*sizeof(Index_type) + + 1*sizeof(Real_type ) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature(Forall); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index a4765bfc6..a40bf533a 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -30,8 +30,9 @@ FIRST_SUM::FIRST_SUM(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_N-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * (m_N-1) ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * (getActualProblemSize()-1)); setUsesFeature(Forall); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 8a33be330..80b7f9b10 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -30,8 +30,11 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N + - (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_N ); + setBytesReadPerRep( 3*sizeof(Real_type ) * m_N + + 3*sizeof(Real_type ) * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((3 + 3 ) * getActualProblemSize()); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 316675f37..c4821788f 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -30,8 +30,10 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * getActualProblemSize() + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (getActualProblemSize()+1) ); + setBytesReadPerRep( 1*sizeof(Real_type ) * getActualProblemSize() + + 1*sizeof(Real_type ) * (getActualProblemSize()+1) ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(5 * getActualProblemSize()); checksum_scale_factor = 0.001 * diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index ad70508a8..5c698167a 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -40,11 +40,13 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setItsPerRep( 3 * getActualProblemSize() ); setKernelsPerRep(3); - setBytesPerRep( (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 0*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_array_length + - (2*sizeof(Real_type ) + 4*sizeof(Real_type )) * (m_kn-2) * (m_jn-2) ); + setBytesReadPerRep( 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * m_array_length + + 4*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) + + 2*sizeof(Real_type ) * (m_kn-2) * (m_jn-2) ); + 
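
// [editor's note] A minimal standalone sketch of the byte-accounting model
// these hunks introduce (hypothetical BytesModel struct, not the suite's
// KernelBase): setBytesPerRep() is split into bytes read, bytes written, and
// bytes atomically read-modify-written, and the combined figure is recovered
// by charging each atomic modify-write as one read plus one write, matching
// the getBytesPerRep() accessor added elsewhere in this patch.
#include <cassert>
#include <cstdint>

struct BytesModel {
  int64_t read = 0;           // bytes loaded per rep
  int64_t written = 0;        // bytes stored per rep
  int64_t atomic_modify = 0;  // bytes atomically read-modify-written per rep

  int64_t total() const {     // mirrors getBytesPerRep() in this patch series
    return read + written + 2 * atomic_modify;
  }
};

int main() {
  // STREAM TRIAD-style kernel over n doubles: a[i] = b[i] + s * c[i]
  const int64_t n = 1000000;
  BytesModel triad{2 * 8 * n, 1 * 8 * n, 0};
  assert(triad.total() == 3 * 8 * n);  // agrees with the old setBytesPerRep value
  return 0;
}
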
setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((14 + 26 + 4 ) * (m_jn-2)*(m_kn-2)); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 5fc2c7f2e..afb4a2ea9 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -28,7 +28,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 10*sizeof(Real_type )) * getActualProblemSize() ); + setBytesReadPerRep( 10*sizeof(Real_type ) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(17 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 85d067918..cf15e6a29 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -28,7 +28,9 @@ PLANCKIAN::PLANCKIAN(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type ) + 3*sizeof(Real_type )) * getActualProblemSize() ); + setBytesReadPerRep( 3*sizeof(Real_type ) * getActualProblemSize() ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(4 * getActualProblemSize()); // 1 exp setUsesFeature(Forall); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 6120eb2e0..cc8b0fa97 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -30,7 +30,9 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 3*sizeof(Real_type )) * (m_N-1) ); + setBytesReadPerRep( 3*sizeof(Real_type ) * (m_N-1) ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * (m_N-1) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * (getActualProblemSize()-1)); setUsesFeature(Forall); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 56830658d..99db797f7 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -44,13 +44,15 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setItsPerRep( m_ni*m_nj + m_ni*m_nl ); setKernelsPerRep(2); - setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk + + setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + + 1*sizeof(Real_type ) * m_nj * m_nk + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl ); + 1*sizeof(Real_type ) * m_ni * m_nj + + 1*sizeof(Real_type ) * m_nj * m_nl ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + + + 1*sizeof(Real_type ) * m_ni * m_nl ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(3 * m_ni*m_nj*m_nk + 2 * m_ni*m_nj*m_nl ); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index bbb6e072f..14651cafe 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -47,17 +47,20 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) setItsPerRep( m_ni*m_nj + m_nj*m_nl + m_ni*m_nl ); setKernelsPerRep(3); - setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni 
* m_nk + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk + + setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + + 1*sizeof(Real_type ) * m_nj * m_nk + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_nj * m_nl + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nm + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nl * m_nm + + 1*sizeof(Real_type ) * m_nj * m_nm + + 1*sizeof(Real_type ) * m_nl * m_nm + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nl + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nj + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nl ); + 1*sizeof(Real_type ) * m_ni * m_nj + + 1*sizeof(Real_type ) * m_nj * m_nl ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj + + + 1*sizeof(Real_type ) * m_nj * m_nl + + + 1*sizeof(Real_type ) * m_ni * m_nl ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_ni*m_nj*m_nk + 2 * m_nj*m_nl*m_nm + 2 * m_ni*m_nj*m_nl ); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 211f23be3..a323f1708 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -34,8 +34,11 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setActualProblemSize( (m_n-2) * (m_n-2) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) + - (3*sizeof(Real_type ) + 3*sizeof(Real_type )) * m_n * (m_n-2) ) ); + setBytesReadPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesWrittenPerRep((3*sizeof(Real_type ) * m_n * (m_n-2) + + 3*sizeof(Real_type ) * m_n * (m_n-2)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( (15 + 2) * (m_n-2)*(m_n-2) + (15 + 2) * (m_n-2)*(m_n-2) ) ); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index 71423d1ce..f1df0d99b 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -33,11 +33,14 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setItsPerRep( m_N + m_N ); setKernelsPerRep(2); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 1*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index eeac18818..f231ae655 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -43,18 +43,25 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) m_nx*(m_ny-1) + (m_nx-1)*(m_ny-1) ) ); setKernelsPerRep(m_tsteps * 4); - setBytesPerRep( m_tsteps * ( (0*sizeof(Real_type ) + 1*sizeof(Real_type )) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ny + + setBytesReadPerRep((1*sizeof(Real_type ) + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * 
m_nx * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + 1*sizeof(Real_type ) * m_nx * m_ny + - (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * (m_ny-1) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * (m_nx-1) * m_ny + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nx * (m_ny-1) ) ); + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1) + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + 1*sizeof(Real_type ) * m_nx * (m_ny-1)) * m_tsteps ); + setBytesWrittenPerRep((1*sizeof(Real_type ) * m_ny + + + 1*sizeof(Real_type ) * (m_nx-1) * m_ny + + + 1*sizeof(Real_type ) * m_nx * (m_ny-1) + + + 1*sizeof(Real_type ) * (m_nx-1) * (m_ny-1)) * m_tsteps ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 0 * m_ny + 3 * (m_nx-1)*m_ny + 3 * m_nx*(m_ny-1) + diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 3e5844805..f35841db8 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -33,7 +33,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setItsPerRep( m_N*m_N ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * m_N*m_N*m_N ); checksum_scale_factor = 1.0 * diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 1462eaf66..505bf5673 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -40,9 +40,10 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setItsPerRep( m_ni * m_nj ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * m_ni * m_nj + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_ni * m_nk + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_nj * m_nk ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_ni * m_nk + + 1*sizeof(Real_type ) * m_nj * m_nk ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_ni * m_nj); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((1 + 3 * m_nk) * m_ni*m_nj); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index ea331fdf3..bb06b90d1 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -39,16 +39,24 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) m_n + m_n*m_n ); setKernelsPerRep(4); - setBytesPerRep( (1*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (0*sizeof(Real_type ) + 4*sizeof(Real_type )) * m_n + + setBytesReadPerRep( 1*sizeof(Real_type ) * m_n * m_n + + 4*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n + + 2*sizeof(Real_type ) * m_n + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_n * m_n + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_n ); + 1*sizeof(Real_type ) * m_n * m_n + + 2*sizeof(Real_type ) * m_n ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_n * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n + + + 1*sizeof(Real_type ) * m_n ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(4 * m_n*m_n + 3 * m_n*m_n + 1 * 
m_n + diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 99efcc0b2..baa845003 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -36,8 +36,10 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setItsPerRep( m_N ); setKernelsPerRep(1); - setBytesPerRep( (2*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 1*sizeof(Real_type ) * m_N + + 2*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep((4 * m_N + 3 ) * m_N ); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 430aef043..b8662c488 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -35,14 +35,13 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep( m_tsteps * 2 ); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N * m_N - 12*(m_N-2) - 8) ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8) + + + 1*sizeof(Real_type ) * (m_N * m_N * m_N - 12*(m_N-2) - 8)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 15 * (m_N-2) * (m_N-2) * (m_N-2) + 15 * (m_N-2) * (m_N-2) * (m_N-2) ) ); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 40e2c2c04..c2c71208f 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setItsPerRep( m_tsteps * ( 2 * getActualProblemSize() ) ); setKernelsPerRep(m_tsteps * 2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - m_N ) ); + setBytesReadPerRep((1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 3 * (m_N-2) + 3 * (m_N-2) ) ); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index f1db49000..aca9f5434 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -34,14 +34,13 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setItsPerRep( m_tsteps * (2 * (m_N-2) * (m_N-2)) ); setKernelsPerRep(2); - setBytesPerRep( m_tsteps * ( (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) + - (1*sizeof(Real_type ) + 0*sizeof(Real_type )) * - (m_N-2) * (m_N-2) + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * - (m_N * m_N - 4) ) ); + 
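
// [editor's note] For the time-stepped Polybench stencils the new counters
// are still per rep, so they sum over both sweeps and all m_tsteps. A small
// standalone check for a JACOBI_1D-style pair of sweeps (helper names are
// hypothetical): each sweep reads the full array of N values and writes the
// N-2 interior values, and the hunk above multiplies both totals by tsteps.
#include <cstdint>
#include <cstdio>

int64_t jacobi1d_read(int64_t N, int64_t tsteps, int64_t elem) {
  return (1 * elem * N + 1 * elem * N) * tsteps;              // A-sweep + B-sweep loads
}
int64_t jacobi1d_written(int64_t N, int64_t tsteps, int64_t elem) {
  return (1 * elem * (N - 2) + 1 * elem * (N - 2)) * tsteps;  // interior stores only
}

int main() {
  const int64_t N = 1000002, tsteps = 16, elem = sizeof(double);
  std::printf("read/rep    = %lld\n", (long long)jacobi1d_read(N, tsteps, elem));
  std::printf("written/rep = %lld\n", (long long)jacobi1d_written(N, tsteps, elem));
  return 0;
}
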
setBytesReadPerRep((1*sizeof(Real_type ) * (m_N * m_N - 4) + + + 1*sizeof(Real_type ) * (m_N * m_N - 4)) * m_tsteps); + setBytesWrittenPerRep((1*sizeof(Real_type ) * (m_N-2) * (m_N-2) + + + 1*sizeof(Real_type ) * (m_N-2) * (m_N-2)) * m_tsteps); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep( m_tsteps * ( 5 * (m_N-2)*(m_N-2) + 5 * (m_N -2)*(m_N-2) ) ); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 2ae85367a..fe219e00c 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -33,10 +33,15 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setItsPerRep( 2 * m_N ); setKernelsPerRep(2); - setBytesPerRep( (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N + - (1*sizeof(Real_type ) + 2*sizeof(Real_type )) * m_N + - (0*sizeof(Real_type ) + 1*sizeof(Real_type )) * m_N * m_N ); + setBytesReadPerRep( 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N + + + 2*sizeof(Real_type ) * m_N + + 1*sizeof(Real_type ) * m_N * m_N ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * m_N + + + 1*sizeof(Real_type ) * m_N ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * m_N*m_N + 2 * m_N*m_N ); diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index e3b96ae62..510f39bb8 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -28,8 +28,9 @@ ADD::ADD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * - getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 997dbfecf..9cfce257a 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -28,8 +28,9 @@ COPY::COPY(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * - getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type ) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); setUsesFeature( Forall ); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 9a615b583..5249c8ebd 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -28,9 +28,10 @@ DOT::DOT(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) + - (0*sizeof(Real_type) + 2*sizeof(Real_type)) * - getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) + + 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); setUsesFeature( Forall ); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index eee8a69a4..eedea75c7 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -28,8 +28,9 @@ MUL::MUL(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 1*sizeof(Real_type)) * - getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * 
getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature( Forall ); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 472dcdbd8..da6386755 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -28,8 +28,9 @@ TRIAD::TRIAD(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Real_type) + 2*sizeof(Real_type)) * - getActualProblemSize() ); + setBytesReadPerRep( 2*sizeof(Real_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(2 * getActualProblemSize()); checksum_scale_factor = 0.001 * From 2f6e98bfb00813a152cf51e78d41cccabf7ea05e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 17:08:04 -0700 Subject: [PATCH 427/454] Don't count read in memset --- src/algorithm/MEMSET.cpp | 2 +- src/basic/INIT_VIEW1D.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index d35332d58..04ad4f52c 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -28,7 +28,7 @@ MEMSET::MEMSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesReadPerRep( 0 ); setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(0); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index b853507bd..eb24e8e8e 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -28,7 +28,7 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesReadPerRep( 0 ); setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 323cc7f2e..1eef8fc3d 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -28,7 +28,7 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesReadPerRep( 1*sizeof(Real_type) ); + setBytesReadPerRep( 0 ); setBytesWrittenPerRep( 1*sizeof(Real_type) * getActualProblemSize() ); setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); From 696acb5a8655f220e50ea76cda2882fd32774e95 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 17:12:59 -0700 Subject: [PATCH 428/454] Fix default problem sizes in ADI, HEAT_3D, JACOBI_1D, JACOBI_2D --- src/polybench/POLYBENCH_ADI.cpp | 4 ++-- src/polybench/POLYBENCH_HEAT_3D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_1D.cpp | 4 ++-- src/polybench/POLYBENCH_JACOBI_2D.cpp | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index a323f1708..340729c91 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -20,12 +20,12 @@ namespace polybench POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) : KernelBase(rajaperf::Polybench_ADI, params) { - Index_type n_default = 1000; + Index_type n_default = 1002; setDefaultProblemSize( (n_default-2) * 
(n_default-2) ); setDefaultReps(4); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ) + 2; m_tsteps = 4; setItsPerRep( m_tsteps * ( (m_n-2) + (m_n-2) ) ); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index b8662c488..989295c8a 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -22,12 +22,12 @@ namespace polybench POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) : KernelBase(rajaperf::Polybench_HEAT_3D, params) { - Index_type N_default = 100; + Index_type N_default = 102; setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); setDefaultReps(20); - m_N = std::cbrt( getTargetProblemSize() ) + 1; + m_N = std::cbrt( getTargetProblemSize() ) + 2; m_tsteps = 20; diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index c2c71208f..59a2520b5 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_1D, params) { - Index_type N_default = 1000000; + Index_type N_default = 1000002; setDefaultProblemSize( N_default-2 ); setDefaultReps(100); - m_N = getTargetProblemSize(); + m_N = getTargetProblemSize() + 2; m_tsteps = 16; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index aca9f5434..1ca329b1b 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -21,12 +21,12 @@ namespace polybench POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) : KernelBase(rajaperf::Polybench_JACOBI_2D, params) { - Index_type N_default = 1000; + Index_type N_default = 1002; - setDefaultProblemSize( N_default * N_default ); + setDefaultProblemSize( (N_default-2)*(N_default-2) ); setDefaultReps(50); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ) + 2; m_tsteps = 40; From e8bd6f0de01cfc986d329de8eb57cd5112d6079a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 20:15:22 -0700 Subject: [PATCH 429/454] Fixup problem size at 1 for some kernels --- src/lcals/TRIDIAG_ELIM.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index cc8b0fa97..9955bee66 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -26,7 +26,7 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - m_N = getActualProblemSize(); + m_N = getActualProblemSize() + 1; setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index f1df0d99b..dd03dfd8b 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -26,7 +26,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() )+1; + m_N = std::sqrt( getTargetProblemSize() ); setActualProblemSize( m_N * m_N ); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp 
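
// [editor's note] A standalone round-trip check of the two sizing fixes
// above, as of this point in the series (not suite code; Index_type is
// assumed to be a 64-bit integer): for the halo stencils the default problem
// size is the interior (N_default-2)^2, so m_N = sqrt(target) + 2 recovers
// N_default exactly, while for the dense kernels dropping the old "+ 1"
// keeps a target problem size of 1 at m_N = 1 instead of inflating the
// actual size to 4.
#include <cassert>
#include <cmath>
#include <cstdint>

using Index_type = int64_t;

int main() {
  // "Fix default problem sizes": JACOBI_2D-style sizing with a 2-wide halo.
  const Index_type N_default = 1002;
  const Index_type target = (N_default - 2) * (N_default - 2);
  const Index_type m_N = Index_type(std::sqrt(double(target))) + 2;
  assert(m_N == N_default);

  // "Fixup problem size at 1": MVT-style sizing without the old "+ 1".
  const Index_type small_target = 1;
  assert(Index_type(std::sqrt(double(small_target))) == 1);
  // previously sqrt(1) + 1 == 2, i.e. an actual size of 4 for a target of 1
  return 0;
}
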
index f35841db8..8152c28d5 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -26,7 +26,7 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ); setActualProblemSize( m_N * m_N ); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index bb06b90d1..135f5b5f4 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -26,7 +26,7 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setDefaultProblemSize( n_default * n_default ); setDefaultReps(20); - m_n = std::sqrt( getTargetProblemSize() ) + 1; + m_n = std::sqrt( getTargetProblemSize() ); m_alpha = 1.5; m_beta = 1.2; diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index baa845003..063550375 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -26,7 +26,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(120); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ); m_alpha = 0.62; m_beta = 1.002; diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index fe219e00c..03ab7bb8b 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -26,7 +26,7 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() ) + 1; + m_N = std::sqrt( getTargetProblemSize() ); setActualProblemSize( m_N * m_N ); From 892ab918427c5de46d0ffef0bd98432df4e5936d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 20:16:08 -0700 Subject: [PATCH 430/454] Scale intermediate problem size in 2MM, 3MM, and GEMM --- src/polybench/POLYBENCH_2MM.cpp | 4 ++-- src/polybench/POLYBENCH_3MM.cpp | 6 +++--- src/polybench/POLYBENCH_GEMM.cpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 99db797f7..fc8a8765e 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -31,9 +31,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) ni_default*nl_default ) ); setDefaultReps(2); - m_ni = std::sqrt( getTargetProblemSize() ) + 1; + m_ni = std::sqrt( getTargetProblemSize() ); m_nj = m_ni; - m_nk = nk_default; + m_nk = Index_type(double(nk_default)/ni_default*m_ni); m_nl = m_ni; m_alpha = 1.5; diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 14651cafe..4d4ee8e5f 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -35,11 +35,11 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(2); - m_ni = std::sqrt( getTargetProblemSize() ) + 1; + m_ni = std::sqrt( getTargetProblemSize() ); m_nj = m_ni; - m_nk = nk_default; + m_nk = Index_type(double(nk_default)/ni_default*m_ni); m_nl = m_ni; - m_nm = nm_default; + m_nm = Index_type(double(nm_default)/ni_default*m_ni); setActualProblemSize( std::max( std::max( m_ni*m_nj, m_nj*m_nl ), diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 505bf5673..adbbbabea 100644 --- 
a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -28,9 +28,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(4); - m_ni = std::sqrt( getTargetProblemSize() ) + 1; + m_ni = std::sqrt( getTargetProblemSize() ); m_nj = m_ni; - m_nk = nk_default; + m_nk = Index_type(double(nk_default)/ni_default*m_ni); m_alpha = 0.62; m_beta = 1.002; From 98f8851392113ef2711cb3046cc8d97d9b5ad25c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 21:08:25 -0700 Subject: [PATCH 431/454] divide based step function problem sizes improved --- src/apps/CONVECTION3DPA.cpp | 2 +- src/apps/DIFFUSION3DPA.cpp | 2 +- src/apps/LTIMES.cpp | 4 +--- src/apps/LTIMES_NOVIEW.cpp | 4 +--- src/apps/MASS3DEA.cpp | 2 +- src/apps/MASS3DPA.cpp | 2 +- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index bbbd14db4..8213c2c90 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -28,7 +28,7 @@ CONVECTION3DPA::CONVECTION3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*CPA_Q1D*CPA_Q1D*CPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (CPA_Q1D*CPA_Q1D*CPA_Q1D)/2) / (CPA_Q1D*CPA_Q1D*CPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*CPA_Q1D*CPA_Q1D*CPA_Q1D ); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 68cd57ad2..16cf307b5 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -28,7 +28,7 @@ DIFFUSION3DPA::DIFFUSION3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*DPA_Q1D*DPA_Q1D*DPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (DPA_Q1D*DPA_Q1D*DPA_Q1D)/2) / (DPA_Q1D*DPA_Q1D*DPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*DPA_Q1D*DPA_Q1D*DPA_Q1D ); diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index f52eaea4f..74444ba65 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -31,9 +31,7 @@ LTIMES::LTIMES(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); + m_num_z = std::max((getTargetProblemSize() + (m_num_d_default * m_num_g_default)/2) / (m_num_d_default * m_num_g_default), Index_type(1)); m_num_g = m_num_g_default; m_num_m = m_num_m_default; m_num_d = m_num_d_default; diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 941e9da9c..66bed63e5 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -31,9 +31,7 @@ LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); setDefaultReps(50); - m_num_z = std::max( getTargetProblemSize() / - (m_num_d_default * m_num_g_default), - Index_type(1) ); + m_num_z = std::max((getTargetProblemSize() + (m_num_d_default * m_num_g_default)/2) / (m_num_d_default * m_num_g_default), Index_type(1)); m_num_g = m_num_g_default; m_num_m = m_num_m_default; m_num_d = m_num_d_default; diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 9beaddba0..9689c35ae 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -30,7 +30,7 @@ MASS3DEA::MASS3DEA(const RunParams& params) const int 
ea_mat_entries = MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D*MEA_D1D; - m_NE = std::max(getTargetProblemSize()/(ea_mat_entries), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (ea_mat_entries)/2) / (ea_mat_entries), Index_type(1)); setActualProblemSize( m_NE*ea_mat_entries); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 6f0058d63..f60e64c3e 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -28,7 +28,7 @@ MASS3DPA::MASS3DPA(const RunParams& params) setDefaultProblemSize(m_NE_default*MPA_Q1D*MPA_Q1D*MPA_Q1D); setDefaultReps(50); - m_NE = std::max(getTargetProblemSize()/(MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); + m_NE = std::max((getTargetProblemSize() + (MPA_Q1D*MPA_Q1D*MPA_Q1D)/2) / (MPA_Q1D*MPA_Q1D*MPA_Q1D), Index_type(1)); setActualProblemSize( m_NE*MPA_Q1D*MPA_Q1D*MPA_Q1D ); From fd3d7649efdb92b4e84806fe9e6b073cca8b549b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 21:23:07 -0700 Subject: [PATCH 432/454] Better match problem size with square and cubic step functions --- src/apps/DEL_DOT_VEC_2D.cpp | 2 +- src/apps/EDGE3D.cpp | 2 +- src/apps/MATVEC_3D_STENCIL.cpp | 2 +- src/apps/NODAL_ACCUMULATION_3D.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- src/apps/ZONAL_ACCUMULATION_3D.cpp | 2 +- src/basic/MAT_MAT_SHARED.cpp | 2 +- src/basic/NESTED_INIT.cpp | 2 +- src/comm/HALO_base.cpp | 2 +- src/lcals/HYDRO_2D.cpp | 2 +- src/polybench/POLYBENCH_2MM.cpp | 2 +- src/polybench/POLYBENCH_3MM.cpp | 2 +- src/polybench/POLYBENCH_ADI.cpp | 2 +- src/polybench/POLYBENCH_ATAX.cpp | 2 +- src/polybench/POLYBENCH_FDTD_2D.cpp | 2 +- src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 2 +- src/polybench/POLYBENCH_GEMM.cpp | 2 +- src/polybench/POLYBENCH_GEMVER.cpp | 2 +- src/polybench/POLYBENCH_GESUMMV.cpp | 2 +- src/polybench/POLYBENCH_HEAT_3D.cpp | 2 +- src/polybench/POLYBENCH_JACOBI_2D.cpp | 2 +- src/polybench/POLYBENCH_MVT.cpp | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index b4c041e46..8c72474bc 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -28,7 +28,7 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setDefaultProblemSize(1000*1000); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::sqrt(getTargetProblemSize())+1; + Index_type rzmax = std::sqrt(getTargetProblemSize()) + 1 + std::sqrt(2)-1; m_domain = new ADomain(rzmax, /* ndims = */ 2); m_array_length = m_domain->nnalls; diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 3bd5dcce4..3aefe67f0 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -27,7 +27,7 @@ EDGE3D::EDGE3D(const RunParams& params) { setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(10); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index ea01e9ed4..2b8cb6978 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -28,7 +28,7 @@ MATVEC_3D_STENCIL::MATVEC_3D_STENCIL(const RunParams& params) setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); 
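
// [editor's note] The hunks above change element counts from truncating
// division, target/q, to (target + q/2)/q. A standalone illustration of the
// rounding behavior (helper name and the q value are hypothetical; q stands
// for the points per element, e.g. CPA_Q1D^3): adding half the divisor
// before dividing selects the multiple of q nearest the target instead of
// always rounding down, while the std::max guard still keeps at least one
// element.
#include <algorithm>
#include <cassert>
#include <cstdint>

using Index_type = int64_t;

Index_type elements_nearest(Index_type target, Index_type q) {
  return std::max((target + q / 2) / q, Index_type(1));
}

int main() {
  const Index_type q = 64;                      // e.g. 4^3 points per element
  assert(elements_nearest(2 * q - 1, q) == 2);  // old code: 127/64 == 1
  assert(elements_nearest(95, q) == 1);         // below the midpoint of [64, 128]
  assert(elements_nearest(96, q) == 2);         // at the midpoint, rounds up
  assert(elements_nearest(1, q) == 1);          // guarded minimum of one element
  return 0;
}
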
m_zonal_array_length = m_domain->lpz+1; diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 124ef01e1..5bbbbb986 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -28,7 +28,7 @@ NODAL_ACCUMULATION_3D::NODAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index c56e77f3d..5b0401dcf 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -28,7 +28,7 @@ VOL3D::VOL3D(const RunParams& params) setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_array_length = m_domain->nnalls; diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index ce47e2057..7f3faf44a 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -28,7 +28,7 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct setDefaultReps(100); - Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + Index_type rzmax = std::cbrt(getTargetProblemSize()) + 1 + std::cbrt(3)-1; m_domain = new ADomain(rzmax, /* ndims = */ 3); m_nodal_array_length = m_domain->nnalls; diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index e448bfa8e..61a85e898 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -24,7 +24,7 @@ MAT_MAT_SHARED::MAT_MAT_SHARED(const RunParams &params) setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); - m_N = std::max(Index_type(std::sqrt(getTargetProblemSize())), Index_type(1)); + m_N = std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; setActualProblemSize(m_N * m_N); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index e76c59610..67d1d017b 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -29,7 +29,7 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setDefaultProblemSize(m_n_init * m_n_init * m_n_init); setDefaultReps(1000); - auto n_final = std::cbrt( getTargetProblemSize() ); + auto n_final = std::cbrt( getTargetProblemSize() ) + std::cbrt(3)-1; m_ni = n_final; m_nj = n_final; m_nk = n_final; diff --git a/src/comm/HALO_base.cpp b/src/comm/HALO_base.cpp index f72e95179..6455a1e1b 100644 --- a/src/comm/HALO_base.cpp +++ b/src/comm/HALO_base.cpp @@ -30,7 +30,7 @@ HALO_base::HALO_base(KernelID kid, const RunParams& params) s_grid_dims_default[1] * s_grid_dims_default[2] ); - double cbrt_run_size = std::cbrt(getTargetProblemSize()); + double cbrt_run_size = std::cbrt(getTargetProblemSize()) + std::cbrt(3)-1; m_grid_dims[0] = cbrt_run_size; m_grid_dims[1] = cbrt_run_size; diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 5c698167a..d1ae233d0 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -33,7 +33,7 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setDefaultProblemSize(m_kn * m_jn); setDefaultReps(100); - m_jn = m_kn =
std::sqrt(getTargetProblemSize()) + std::sqrt(2)-1; m_array_length = m_kn * m_jn; setActualProblemSize( getTargetProblemSize() ); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index fc8a8765e..55cc577cc 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -31,7 +31,7 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) ni_default*nl_default ) ); setDefaultReps(2); - m_ni = std::sqrt( getTargetProblemSize() ); + m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_nj = m_ni; m_nk = Index_type(double(nk_default)/ni_default*m_ni); m_nl = m_ni; diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 4d4ee8e5f..eb8e63d66 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -35,7 +35,7 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(2); - m_ni = std::sqrt( getTargetProblemSize() ); + m_ni = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_nj = m_ni; m_nk = Index_type(double(nk_default)/ni_default*m_ni); m_nl = m_ni; diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 340729c91..1347975f2 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -25,7 +25,7 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setDefaultProblemSize( (n_default-2) * (n_default-2) ); setDefaultReps(4); - m_n = std::sqrt( getTargetProblemSize() ) + 2; + m_n = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 4; setItsPerRep( m_tsteps * ( (m_n-2) + (m_n-2) ) ); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index dd03dfd8b..e7cb48875 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -26,7 +26,7 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() ); + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index f231ae655..7f87fd3ef 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -31,7 +31,7 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) nx_default * (ny_default-1) ) ); setDefaultReps(8); - m_nx = std::sqrt( getTargetProblemSize() ) + 1; + m_nx = std::sqrt( getTargetProblemSize() ) + 1 + std::sqrt(2)-1; m_ny = m_nx; m_tsteps = 40; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index 8152c28d5..149ae87aa 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -26,7 +26,7 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(8); - m_N = std::sqrt( getTargetProblemSize() ); + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index adbbbabea..48769b42f 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -28,7 +28,7 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setDefaultProblemSize( ni_default * nj_default ); setDefaultReps(4); - m_ni = std::sqrt( getTargetProblemSize() ); + m_ni = 
std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_nj = m_ni; m_nk = Index_type(double(nk_default)/ni_default*m_ni); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 135f5b5f4..e0db7f361 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -26,7 +26,7 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setDefaultProblemSize( n_default * n_default ); setDefaultReps(20); - m_n = std::sqrt( getTargetProblemSize() ); + m_n = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 1.5; m_beta = 1.2; diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 063550375..7764c4036 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -26,7 +26,7 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(120); - m_N = std::sqrt( getTargetProblemSize() ); + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; m_alpha = 0.62; m_beta = 1.002; diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 989295c8a..1e4272534 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -27,7 +27,7 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setDefaultProblemSize( (N_default-2)*(N_default-2)*(N_default-2) ); setDefaultReps(20); - m_N = std::cbrt( getTargetProblemSize() ) + 2; + m_N = std::cbrt( getTargetProblemSize() ) + 2 + std::cbrt(3)-1; m_tsteps = 20; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 1ca329b1b..a3b077a1f 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -26,7 +26,7 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setDefaultProblemSize( (N_default-2)*(N_default-2) ); setDefaultReps(50); - m_N = std::sqrt( getTargetProblemSize() ) + 2; + m_N = std::sqrt( getTargetProblemSize() ) + 2 + std::sqrt(2)-1; m_tsteps = 40; diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 03ab7bb8b..e8da53a0c 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -26,7 +26,7 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setDefaultProblemSize( N_default * N_default ); setDefaultReps(100); - m_N = std::sqrt( getTargetProblemSize() ); + m_N = std::sqrt( getTargetProblemSize() ) + std::sqrt(2)-1; setActualProblemSize( m_N * m_N ); From 0f8942584f0afacf00ce85b1ad89b290de57dcb5 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 14 Jul 2024 21:23:28 -0700 Subject: [PATCH 433/454] Fix problem size in EDGE3D and VOL3D --- src/apps/EDGE3D.cpp | 2 +- src/apps/VOL3D.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/EDGE3D.cpp b/src/apps/EDGE3D.cpp index 3aefe67f0..d917bb321 100644 --- a/src/apps/EDGE3D.cpp +++ b/src/apps/EDGE3D.cpp @@ -33,7 +33,7 @@ EDGE3D::EDGE3D(const RunParams& params) m_array_length = m_domain->nnalls; size_t number_of_elements = m_domain->lpz+1 - m_domain->fpz; - setActualProblemSize( number_of_elements ); + setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( number_of_elements ); setKernelsPerRep(1); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 5b0401dcf..16951253d 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -33,7 +33,7 @@ VOL3D::VOL3D(const RunParams& params) m_array_length = m_domain->nnalls; - 
setActualProblemSize( m_domain->lpz+1 - m_domain->fpz ); + setActualProblemSize( m_domain->n_real_zones ); setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); setKernelsPerRep(1); From 0ab1dccba7e1ad7e188cc1d25022321b33ecaf55 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 15 Jul 2024 15:29:48 -0700 Subject: [PATCH 434/454] Apply suggestions from code review Co-authored-by: Michael McKinsey --- src/common/KernelBase.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 2d00b948b..09e78a6d0 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -75,7 +75,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS); - Bytes_Rep_Written_attr = cali_create_attribute("BytesWritten/Rep", CALI_TYPE_DOUBLE, + Bytes_Written_Rep_attr = cali_create_attribute("BytesWritten/Rep", CALI_TYPE_DOUBLE, CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS); @@ -564,9 +564,9 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx) cali_set_double(Iters_Rep_attr,(double)getItsPerRep()); cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep()); cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep()); - cali_set_double(BytesRead_Rep_attr,(double)getBytesReadPerRep()); - cali_set_double(BytesWritten_Rep_attr,(double)getBytesWrittenPerRep()); - cali_set_double(BytesAtomicModifyWritten_Rep_attr,(double)getBytesAtomicModifyWrittenPerRep()); + cali_set_double(Bytes_Read_Rep_attr,(double)getBytesReadPerRep()); + cali_set_double(Bytes_Written_Rep_attr,(double)getBytesWrittenPerRep()); + cali_set_double(Bytes_AtomicModifyWritten_Rep_attr,(double)getBytesAtomicModifyWrittenPerRep()); cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep()); cali_set_double(BlockSize_attr, getBlockSize()); for (unsigned i = 0; i < FeatureID::NumFeatures; ++i) { From 46db00e98a96f7daf830906fd3405df6d6ebb89e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 15 Jul 2024 16:37:20 -0700 Subject: [PATCH 435/454] Add mappings from reductions in multireductions --- src/algorithm/HISTOGRAM-Cuda.cpp | 79 +++++++++++++++++++------------ src/algorithm/HISTOGRAM-Hip.cpp | 81 ++++++++++++++++++++------------ src/algorithm/HISTOGRAM.hpp | 6 ++- src/basic/MULTI_REDUCE-Cuda.cpp | 79 +++++++++++++++++++------------ src/basic/MULTI_REDUCE-Hip.cpp | 79 +++++++++++++++++++------------ src/basic/MULTI_REDUCE.hpp | 6 ++- 6 files changed, 205 insertions(+), 125 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 4b82a3c74..3343b0e88 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -51,7 +51,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, { Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); } @@ -73,7 +73,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, Index_type i = blockIdx.x * block_size + threadIdx.x; Index_type warp = i / warp_size; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; RAJA::atomicAdd(&global_counts[offset], 
HISTOGRAM::Data_type(1)); } @@ -153,7 +153,8 @@ void HISTOGRAM::runCudaVariantLibrary(VariantID vid) template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -173,13 +174,16 @@ void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); startTimer(); @@ -214,6 +218,10 @@ void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) } else if ( vid == RAJA_CUDA ) { + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< RAJA::cuda::MultiReduceTuning< RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, @@ -233,7 +241,7 @@ void HISTOGRAM::runCudaVariantAtomicRuntime(VariantID vid) HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - RAJA::forall>( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { HISTOGRAM_BODY; @@ -274,27 +282,32 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { - seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + if (tune_idx == t) { - if (tune_idx == t) { + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); - setBlockSize(block_size); - runCudaVariantAtomicRuntime(vid); + } - } + t += 1; - t += 1; + }); - }); + } - } + }); }); @@ -325,20 +338,26 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + seq_for(cuda_atomic_global_replications_type{}, 
[&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { - seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ - ")_global("+std::to_string(global_replication)+ - ")_"+std::to_string(block_size)); + }); - }); + } - } + }); }); diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 1a85f5006..00b59984f 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -56,7 +56,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, { Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_counts[offset], HISTOGRAM::Data_type(1)); } @@ -78,7 +78,7 @@ __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, Index_type i = blockIdx.x * block_size + threadIdx.x; Index_type warp = i / warp_size; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; RAJA::atomicAdd(&global_counts[offset], HISTOGRAM::Data_type(1)); } @@ -182,7 +182,8 @@ void HISTOGRAM::runHipVariantLibrary(VariantID vid) template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -202,13 +203,16 @@ void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, counts, hcounts, num_bins, global_replication); startTimer(); @@ -243,6 +247,10 @@ void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) } else if ( vid == RAJA_HIP ) { + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + using multi_reduce_policy = 
RAJA::policy::hip::hip_multi_reduce_policy< RAJA::hip::MultiReduceTuning< RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, @@ -262,7 +270,7 @@ void HISTOGRAM::runHipVariantAtomicRuntime(VariantID vid) HISTOGRAM_INIT_COUNTS_RAJA(multi_reduce_policy); - RAJA::forall>( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { HISTOGRAM_BODY; @@ -296,34 +304,39 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) } - if ( vid == Base_HIP || vid == Lambda_HIP || vid == RAJA_HIP ) { + if ( vid == Base_HIP || vid == RAJA_HIP ) { seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { - if (tune_idx == t) { + if (tune_idx == t) { - setBlockSize(block_size); - runHipVariantAtomicRuntime(vid); + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); - } + } - t += 1; + t += 1; - }); + }); - } + } + + }); }); @@ -354,20 +367,26 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { - seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ - ")_global("+std::to_string(global_replication)+ - ")_"+std::to_string(block_size)); + }); - }); + } - } + }); }); diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 8d8fcd985..65cd27647 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -110,11 +110,13 @@ class HISTOGRAM : public KernelBase template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void runCudaVariantAtomicRuntime(VariantID vid); template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void runHipVariantAtomicRuntime(VariantID vid); private: diff --git 
a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 1d82e2775..a187003bb 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -49,7 +49,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values { Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_values[offset], data[i]); } @@ -71,7 +71,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values Index_type i = blockIdx.x * block_size + threadIdx.x; Index_type warp = i / warp_size; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; RAJA::atomicAdd(&global_values[offset], data[i]); } @@ -80,7 +80,8 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -100,13 +101,16 @@ void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - const Index_type grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + const Index_type max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + RAJAPERF_CUDA_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); startTimer(); @@ -142,6 +146,10 @@ void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) } else if ( vid == RAJA_CUDA ) { + using exec_policy = std::conditional_t, + RAJA::cuda_exec_occ_calc>; + using multi_reduce_policy = RAJA::policy::cuda::cuda_multi_reduce_policy< RAJA::cuda::MultiReduceTuning< RAJA::cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, @@ -161,7 +169,7 @@ void MULTI_REDUCE::runCudaVariantAtomicRuntime(VariantID vid) MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); - RAJA::forall>( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { MULTI_REDUCE_BODY; @@ -189,27 +197,32 @@ void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - if (run_params.numValidAtomicReplication() == 0u || - 
run_params.validAtomicReplication(global_replication)) { + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { - setBlockSize(block_size); - runCudaVariantAtomicRuntime(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); - t += 1; + } - }); + t += 1; - } + }); + + } + + }); }); @@ -232,20 +245,26 @@ void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ - ")_global("+std::to_string(global_replication)+ - ")_"+std::to_string(block_size)); + seq_for(cuda_atomic_shared_replications_type{}, [&](auto shared_replication) { - }); + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + "global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - } + }); + + } + + }); }); diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 9fcd62299..0977dd356 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -49,7 +49,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values { Index_type i = blockIdx.x * block_size + threadIdx.x; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] * shared_replication + RAJA::power_of_2_mod(Index_type{threadIdx.x}, shared_replication); RAJA::atomicAdd(&shared_values[offset], data[i]); } @@ -71,7 +71,7 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values Index_type i = blockIdx.x * block_size + threadIdx.x; Index_type warp = i / warp_size; - if (i < iend) { + for ( ; i < iend ; i += gridDim.x * block_size ) { Index_type offset = bins[i] + RAJA::power_of_2_mod(warp, global_replication) * num_bins; RAJA::atomicAdd(&global_values[offset], data[i]); } @@ -80,7 +80,8 @@ __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -100,13 +101,16 @@ void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) const Index_type max_shmem_per_block_in_bytes = func_attr.maxDynamicSharedSizeBytes; const Index_type max_shared_replication = max_shmem_per_block_in_bytes / sizeof(Data_type) / num_bins; - const Index_type grid_size = 
RAJA_DIVIDE_CEILING_INT(iend, block_size); - - const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); const Index_type shared_replication = RAJA::prev_pow2(std::min(preferred_shared_replication, max_shared_replication)); - const Index_type shmem = shared_replication * num_bins * sizeof(Data_type); + const Index_type max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS( + MappingHelper, func, block_size, shmem); + const Index_type normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + const Index_type grid_size = std::min(normal_grid_size, max_grid_size); + + const Index_type global_replication = RAJA::next_pow2(std::min(preferred_global_replication, grid_size)); + RAJAPERF_HIP_REDUCER_SETUP(Data_ptr, values, hvalues, num_bins, global_replication); startTimer(); @@ -142,6 +146,10 @@ void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) } else if ( vid == RAJA_HIP ) { + using exec_policy = std::conditional_t, + RAJA::hip_exec_occ_calc>; + using multi_reduce_policy = RAJA::policy::hip::hip_multi_reduce_policy< RAJA::hip::MultiReduceTuning< RAJA::hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, @@ -161,7 +169,7 @@ void MULTI_REDUCE::runHipVariantAtomicRuntime(VariantID vid) MULTI_REDUCE_INIT_VALUES_RAJA(multi_reduce_policy); - RAJA::forall>( res, + RAJA::forall( res, RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { MULTI_REDUCE_BODY; @@ -189,27 +197,32 @@ void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - if (tune_idx == t) { + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { - setBlockSize(block_size); - runHipVariantAtomicRuntime(vid); + if (tune_idx == t) { - } + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); - t += 1; + } - }); + t += 1; - } + }); + + } + + }); }); @@ -232,20 +245,26 @@ void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { + seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { - if (run_params.numValidAtomicReplication() == 0u || - run_params.validAtomicReplication(global_replication)) { + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { - seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { + if (run_params.numValidAtomicReplication() == 0u || + run_params.validAtomicReplication(global_replication)) { - addVariantTuningName(vid, "atomic_shared("+std::to_string(shared_replication)+ - ")_global("+std::to_string(global_replication)+ - ")_"+std::to_string(block_size)); + seq_for(hip_atomic_shared_replications_type{}, [&](auto shared_replication) { - }); + addVariantTuningName(vid, "atomic_" + "shared("+std::to_string(shared_replication)+")_"+ + 
"global("+std::to_string(global_replication)+")_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); - } + }); + + } + + }); }); diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 3f022f261..54d2f0467 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -102,11 +102,13 @@ class MULTI_REDUCE : public KernelBase void setHipTuningDefinitions(VariantID vid); template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void runCudaVariantAtomicRuntime(VariantID vid); template < Index_type block_size, Index_type preferred_global_replication, - Index_type preferred_shared_replication > + Index_type preferred_shared_replication, + typename MappingHelper > void runHipVariantAtomicRuntime(VariantID vid); private: From d4460e9e0707e2119a67c95276c8bdb31b7d694f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 15 Jul 2024 16:41:33 -0700 Subject: [PATCH 436/454] remove unused defaults they are handled in the main header now --- src/algorithm/HISTOGRAM-Cuda.cpp | 4 ---- src/algorithm/HISTOGRAM-Hip.cpp | 4 ---- src/basic/MULTI_REDUCE-Cuda.cpp | 4 ---- src/basic/MULTI_REDUCE-Hip.cpp | 4 ---- 4 files changed, 16 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 3343b0e88..6170634cb 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -26,10 +26,6 @@ namespace algorithm constexpr Index_type warp_size = 32; -constexpr Index_type default_shared_replication = 16; -constexpr Index_type default_global_replication = 2; - - template < Index_type block_size > __launch_bounds__(block_size) __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index 00b59984f..e51a7dd17 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -31,10 +31,6 @@ namespace algorithm constexpr Index_type warp_size = 64; -constexpr Index_type default_shared_replication = 4; -constexpr Index_type default_global_replication = 16; - - template < Index_type block_size > __launch_bounds__(block_size) __global__ void histogram_atomic_runtime(HISTOGRAM::Data_ptr global_counts, diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index a187003bb..6dd74bbb0 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -23,10 +23,6 @@ namespace basic constexpr Index_type warp_size = 32; -constexpr Index_type default_shared_replication = 16; -constexpr Index_type default_global_replication = 2; - - template < Index_type block_size > __launch_bounds__(block_size) __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index 0977dd356..a46ad9861 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -23,10 +23,6 @@ namespace basic constexpr Index_type warp_size = 64; -constexpr Index_type default_shared_replication = 4; -constexpr Index_type default_global_replication = 32; - - template < Index_type block_size > __launch_bounds__(block_size) __global__ void multi_reduce_atomic_runtime(MULTI_REDUCE::Data_ptr global_values, From 52de508a322e0f2e7ad2192fee172a514b307e15 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 15 Jul 2024 16:47:46 -0700 Subject: 
[PATCH 437/454] Fix type used in RAJAPERF_CUDA/HIP_GET_MAX_BLOCKS --- src/common/GPUUtils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index 06c34a219..f2189e0d6 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -233,13 +233,13 @@ using reducer_helpers = camp::list< // This will use the occupancy calculator if MappingHelper::direct is false #define RAJAPERF_CUDA_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \ MappingHelper::direct \ - ? std::numeric_limits<size_t>::max() \ + ? std::numeric_limits<Index_type>::max() \ : detail::getCudaOccupancyMaxBlocks( \ (func), (block_size), (shmem)); /// #define RAJAPERF_HIP_GET_MAX_BLOCKS(MappingHelper, func, block_size, shmem) \ MappingHelper::direct \ - ? std::numeric_limits<size_t>::max() \ + ? std::numeric_limits<Index_type>::max() \ : detail::getHipOccupancyMaxBlocks( \ (func), (block_size), (shmem)); From 3300a2bcfab66b9663106d2cbd6e3248a8bf48a2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Jul 2024 14:04:28 -0700 Subject: [PATCH 438/454] Update src/apps/ZONAL_ACCUMULATION_3D.cpp --- src/apps/ZONAL_ACCUMULATION_3D.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 7f3faf44a..993f65a07 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -40,7 +40,7 @@ ZONAL_ACCUMULATION_3D::ZONAL_ACCUMULATION_3D(const RunParams& params) setKernelsPerRep(1); // touched data size, not actual number of stores and loads setBytesReadPerRep( 1*sizeof(Index_type) * getItsPerRep() + - 1*sizeof(Real_type) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + 1*sizeof(Real_type) * m_domain->n_real_nodes ); setBytesWrittenPerRep( 1*sizeof(Real_type) * getItsPerRep() ); setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(8 * getItsPerRep()); From 24f3864fea19ed57d59d1911faf6f2b443373a06 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 15 Jul 2024 17:07:58 -0700 Subject: [PATCH 439/454] Update byte counters of multi-reduce kernels --- src/algorithm/HISTOGRAM.cpp | 6 ++++-- src/basic/MULTI_REDUCE.cpp | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 0a819a78e..ffc7e9c09 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -34,8 +34,10 @@ HISTOGRAM::HISTOGRAM(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Data_type) + 1*sizeof(Data_type))*m_num_bins + - (1*sizeof(Index_type) + 0*sizeof(Index_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 0e179a7ec..314f91f2d 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -34,9 +34,11 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); - setBytesPerRep( (1*sizeof(Data_type) + 1*sizeof(Data_type))*m_num_bins + - (1*sizeof(Data_type) + 0*sizeof(Data_type) + - 1*sizeof(Index_type) + 0*sizeof(Index_type)) * getActualProblemSize() ); + setBytesReadPerRep( 1*sizeof(Data_type) * m_num_bins + + 1*sizeof(Data_type) * 
getActualProblemSize() + + 1*sizeof(Index_type) * getActualProblemSize() ); + setBytesWrittenPerRep( 1*sizeof(Data_type) * m_num_bins ); + setBytesAtomicModifyWrittenPerRep( 0 ); setFLOPsPerRep(1 * getActualProblemSize()); setUsesFeature(Forall); From 79c69e5b78fad8b157319f9161a287ca88335c41 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Jul 2024 16:05:05 -0700 Subject: [PATCH 440/454] Update RAJA to develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index c1cffa924..fa8b8718d 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit c1cffa924db3ab3e00de5fb91c8b54f9eabe1d96 +Subproject commit fa8b8718d5a407668c597c6e97b8f1fadd55bd0f From adc1770738a31be631a1880db68f6d3d38aced7e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 08:22:14 -0700 Subject: [PATCH 441/454] Use cli --multi_reduce_num_bins instead of an environment variable. --- src/algorithm/HISTOGRAM.cpp | 3 +-- src/basic/MULTI_REDUCE.cpp | 3 +-- src/common/RunParams.cpp | 31 ++++++++++++++++++++++++++++++- src/common/RunParams.hpp | 5 +++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index ffc7e9c09..eaded40d7 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -29,8 +29,7 @@ HISTOGRAM::HISTOGRAM(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - const char* e_num_bins = getenv("RAJAPERF_MULTI_REDUCE_NUM_BINS"); - m_num_bins = e_num_bins ? atoi(e_num_bins) : 10; + m_num_bins = params.getMultiReduceNumBins(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 314f91f2d..9a143e7b9 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -29,8 +29,7 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - const char* e_num_bins = getenv("RAJAPERF_MULTI_REDUCE_NUM_BINS"); - m_num_bins = e_num_bins ? atoi(e_num_bins) : 10; + m_num_bins = params.getMultiReduceNumBins(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 6376b765f..2c5315ea5 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -38,6 +38,7 @@ RunParams::RunParams(int argc, char** argv) size(0.0), size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), + multi_reduce_num_bins(10), gpu_stream(1), gpu_block_sizes(), atomic_replications(), @@ -120,6 +121,7 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + str << "\n multi_reduce_num_bins = " << multi_reduce_num_bins; str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -441,6 +443,27 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--multi_reduce_num_bins") ) { + + i++; + if ( i < argc ) { + long long num_bins = ::atoll( argv[i] ); + long long min_num_bins = 1; + if ( num_bins < min_num_bins ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num_bins + << std::endl; + input_state = BadInput; + } else { + multi_reduce_num_bins = num_bins; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1213,7 +1236,7 @@ void RunParams::printHelpMessage(std::ostream& str) const << "\t\t -et default library (exclude default and library tunings)\n\n"; str << "\t Options for selecting kernel data used in kernels....\n" - << "\t ======================================================\n\n";; + << "\t ======================================================\n\n"; str << "\t --data_alignment, -align [default is RAJA::DATA_ALIGN]\n" << "\t (minimum memory alignment for host allocations)\n" @@ -1221,6 +1244,12 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t -align 4096 (allocates memory aligned to 4KiB boundaries)\n\n"; + str << "\t --multi_reduce_num_bins [default is 10]\n" + << "\t (number of bins used in multi-reduce kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_num_bins 100\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 8c24bea1c..5055752a7 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -123,6 +123,8 @@ class RunParams { Size_type getDataAlignment() const { return data_alignment; } + Index_type getMultiReduceNumBins() const { return multi_reduce_num_bins; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -255,10 +257,13 @@ class RunParams { double size_factor; /*!< default kernel size multipier (input option) */ Size_type data_alignment; + Index_type multi_reduce_num_bins; /*!< number of bins used in multi reduction kernels (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ std::vector atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ std::vector items_per_threads; /*!< Items per thread for gpu tunings to run (input option) */ + int mpi_size; /*!< Number of MPI ranks */ int mpi_rank; /*!< Rank of this MPI process */ std::array mpi_3d_division; /*!< Number of MPI ranks in each dimension of a 3D grid */ From 750f14bf6c15d1c15a30ae9c4fb32bbca092c89b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 09:09:34 -0700 Subject: [PATCH 442/454] Add cli --multi_reduce_bin_assignment_algorithm and remove environment variable --- src/algorithm/HISTOGRAM.cpp | 15 +++++++------ src/algorithm/HISTOGRAM.hpp | 1 + src/basic/MULTI_REDUCE.cpp | 15 +++++++------ 
src/basic/MULTI_REDUCE.hpp | 1 + src/common/RunParams.cpp | 43 +++++++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 33 ++++++++++++++++++++++++++++ 6 files changed, 96 insertions(+), 12 deletions(-) diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index eaded40d7..60ad2975e 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -30,6 +30,7 @@ HISTOGRAM::HISTOGRAM(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); @@ -71,12 +72,14 @@ void HISTOGRAM::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); - const char* e_algorithm = getenv("RAJAPERF_MULTI_REDUCE_BIN_ASSIGNMENT"); - const int algorithm = e_algorithm ? atoi(e_algorithm) : 0; - const bool init_random_per_iterate = algorithm == 0; - const bool init_random_sizes = algorithm == 1; - const bool init_even_sizes = algorithm == 2; - const bool init_all_one = algorithm == 3; + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); if (init_even_sizes || init_random_sizes || init_all_one) { Real_ptr data = nullptr; diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 65cd27647..58c7b3221 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -134,6 +134,7 @@ class HISTOGRAM : public KernelBase using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; Index_ptr m_bins; std::vector m_counts_init; std::vector m_counts_final; diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index 9a143e7b9..8fc6ee6c6 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -30,6 +30,7 @@ MULTI_REDUCE::MULTI_REDUCE(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); m_num_bins = params.getMultiReduceNumBins(); + m_bin_assignment_algorithm = params.getMultiReduceBinAssignmentAlgorithm(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); @@ -73,12 +74,14 @@ void MULTI_REDUCE::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { auto reset_bins = scopedMoveData(m_bins, getActualProblemSize(), vid); - const char* e_algorithm = getenv("RAJAPERF_MULTI_REDUCE_BIN_ASSIGNMENT"); - const int algorithm = e_algorithm ? 
atoi(e_algorithm) : 0; - const bool init_random_per_iterate = algorithm == 0; - const bool init_random_sizes = algorithm == 1; - const bool init_even_sizes = algorithm == 2; - const bool init_all_one = algorithm == 3; + const bool init_random_per_iterate = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Random); + const bool init_random_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsRandomSizes); + const bool init_even_sizes = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::RunsEvenSizes); + const bool init_all_one = + (m_bin_assignment_algorithm == RunParams::BinAssignmentAlgorithm::Single); if (init_even_sizes || init_random_sizes || init_all_one) { Real_ptr data = nullptr; diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 54d2f0467..9e265a2bc 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -126,6 +126,7 @@ class MULTI_REDUCE : public KernelBase using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; Index_type m_num_bins; + RunParams::BinAssignmentAlgorithm m_bin_assignment_algorithm; Index_ptr m_bins; Data_ptr m_data; std::vector m_values_init; diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 2c5315ea5..7972a6adb 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -39,6 +39,7 @@ RunParams::RunParams(int argc, char** argv) size_factor(0.0), data_alignment(RAJA::DATA_ALIGN), multi_reduce_num_bins(10), + multi_reduce_bin_assignment_algorithm(BinAssignmentAlgorithm::RunsRandomSizes), gpu_stream(1), gpu_block_sizes(), atomic_replications(), @@ -121,7 +122,10 @@ void RunParams::print(std::ostream& str) const str << "\n size = " << size; str << "\n size_factor = " << size_factor; str << "\n data_alignment = " << data_alignment; + str << "\n multi_reduce_num_bins = " << multi_reduce_num_bins; + str << "\n multi_reduce_bin_assignment_algorithm = " << BinAssignmentAlgorithmToStr(multi_reduce_bin_assignment_algorithm); + str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -464,6 +468,39 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--multi_reduce_bin_assignment_algorithm") ) { + + i++; + if ( i < argc ) { + + std::string bin_assignment_algorithm_name(argv[i]); + + if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Random; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsRandomSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::RunsEvenSizes; + } else if (bin_assignment_algorithm_name == BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single)) { + multi_reduce_bin_assignment_algorithm = BinAssignmentAlgorithm::Single; + } else { + getCout() << "\nBad input:" + << " must give " << opt << " one of the following values\n" + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Random) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsRandomSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::RunsEvenSizes) << ", " + << BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm::Single) + << std::endl; + input_state = BadInput; + invalid_npasses_combiner_input.emplace_back(bin_assignment_algorithm_name); + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (string)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1250,6 +1287,12 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --multi_reduce_num_bins 100\n\n"; + str << "\t --multi_reduce_bin_assignment_algorithm [default is RunsRandomSizes]\n" + << "\t (algorithm used to assign bins to iterates in multi-reduce kernels)\n" + << "\t Valid assignment algorithm names are 'Random', 'RunsRandomSizes', 'RunsEvenSizes', or 'Single'\n"; + str << "\t\t Example...\n" + << "\t\t --multi_reduce_bin_assignment_algorithm Random\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 5055752a7..fc622445b 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -97,6 +97,37 @@ class RunParams { } } + /*! + * \brief Enumeration for the bin assignment algorithm used in multi-reduce kernels + */ + enum struct BinAssignmentAlgorithm : int { + Random, /*!< random bin for each iterate */ + RunsRandomSizes, /*!< each bin in turn is repeated a random number of times, + Ex. 6 bins and 10 iterates [ 0 0 1 2 2 2 2 3 3 5] */ + RunsEvenSizes, /*!< each bin in turn is repeated the same number of times, + Ex. 6 bins and 10 iterates [ 0 0 1 1 2 2 3 3 4 5] */ + Single /*!< use bin 0 for each iterate */ + }; + + /*! 
+ * \brief Translate BinAssignmentAlgorithm enum value to string + */ + static std::string BinAssignmentAlgorithmToStr(BinAssignmentAlgorithm baa) + { + switch (baa) { + case BinAssignmentAlgorithm::Random: + return "Random"; + case BinAssignmentAlgorithm::RunsRandomSizes: + return "RunsRandomSizes"; + case BinAssignmentAlgorithm::RunsEvenSizes: + return "RunsEvenSizes"; + case BinAssignmentAlgorithm::Single: + return "Single"; + default: + return "Unknown"; + } + } + /*! * \brief Return state of input parsed to this point. */ @@ -124,6 +155,7 @@ class RunParams { Size_type getDataAlignment() const { return data_alignment; } Index_type getMultiReduceNumBins() const { return multi_reduce_num_bins; } + BinAssignmentAlgorithm getMultiReduceBinAssignmentAlgorithm() const { return multi_reduce_bin_assignment_algorithm; } int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } @@ -258,6 +290,7 @@ class RunParams { Size_type data_alignment; Index_type multi_reduce_num_bins; /*!< number of bins used in multi reduction kernels (input option) */ + BinAssignmentAlgorithm multi_reduce_bin_assignment_algorithm; /*!< algorithm used to assign bins to iterates used in multi reduction kernels (input option) */ int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ From 2d279b99d6302e7fc08c6b9253246b283740a115 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 09:50:23 -0700 Subject: [PATCH 443/454] Add cli --ltimes_num_[dgm] options --- src/apps/LTIMES.cpp | 15 +++---- src/apps/LTIMES.hpp | 5 --- src/apps/LTIMES_NOVIEW.cpp | 15 +++---- src/apps/LTIMES_NOVIEW.hpp | 5 --- src/common/RunParams.cpp | 88 ++++++++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 8 ++++ 6 files changed, 108 insertions(+), 28 deletions(-) diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index 74444ba65..798d44715 100644 --- a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -23,18 +23,15 @@ namespace apps LTIMES::LTIMES(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max((getTargetProblemSize() + (m_num_d_default * m_num_g_default)/2) / (m_num_d_default * m_num_g_default), Index_type(1)); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 0e94e8d0c..0e74f187f 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -138,11 +138,6 @@ class LTIMES : public KernelBase Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/apps/LTIMES_NOVIEW.cpp 
b/src/apps/LTIMES_NOVIEW.cpp index 66bed63e5..0e675d487 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -23,18 +23,15 @@ namespace apps LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) { - m_num_d_default = 64; - m_num_z_default = 488; - m_num_g_default = 32; - m_num_m_default = 25; + m_num_d = params.getLtimesNumD(); + m_num_g = params.getLtimesNumG(); + m_num_m = params.getLtimesNumM(); + Index_type num_z_default = std::max((Index_type{1000000} + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); - setDefaultProblemSize(m_num_d_default * m_num_g_default * m_num_z_default); + setDefaultProblemSize(m_num_d * m_num_g * num_z_default); setDefaultReps(50); - m_num_z = std::max((getTargetProblemSize() + (m_num_d_default * m_num_g_default)/2) / (m_num_d_default * m_num_g_default), Index_type(1)); - m_num_g = m_num_g_default; - m_num_m = m_num_m_default; - m_num_d = m_num_d_default; + m_num_z = std::max((getTargetProblemSize() + (m_num_d * m_num_g)/2) / (m_num_d * m_num_g), Index_type(1)); m_philen = m_num_m * m_num_g * m_num_z; m_elllen = m_num_d * m_num_m; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index ddd8c9ade..4829b8171 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -88,11 +88,6 @@ class LTIMES_NOVIEW : public KernelBase Real_ptr m_elldat; Real_ptr m_psidat; - Index_type m_num_d_default; - Index_type m_num_z_default; - Index_type m_num_g_default; - Index_type m_num_m_default; - Index_type m_num_d; Index_type m_num_z; Index_type m_num_g; diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 7972a6adb..a9869ba0c 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -40,6 +40,9 @@ RunParams::RunParams(int argc, char** argv) data_alignment(RAJA::DATA_ALIGN), multi_reduce_num_bins(10), multi_reduce_bin_assignment_algorithm(BinAssignmentAlgorithm::RunsRandomSizes), + ltimes_num_d(64), + ltimes_num_g(32), + ltimes_num_m(25), gpu_stream(1), gpu_block_sizes(), atomic_replications(), @@ -126,6 +129,10 @@ void RunParams::print(std::ostream& str) const str << "\n multi_reduce_num_bins = " << multi_reduce_num_bins; str << "\n multi_reduce_bin_assignment_algorithm = " << BinAssignmentAlgorithmToStr(multi_reduce_bin_assignment_algorithm); + str << "\n ltimes_num_d = " << ltimes_num_d; + str << "\n ltimes_num_g = " << ltimes_num_g; + str << "\n ltimes_num_m = " << ltimes_num_m; + str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -501,6 +508,69 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--ltimes_num_d") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_d = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_g") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_g = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--ltimes_num_m") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + ltimes_num_m = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1293,6 +1363,24 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --multi_reduce_bin_assignment_algorithm Random\n\n"; + str << "\t --ltimes_num_d [default is 64]\n" + << "\t (num_d used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_d 32\n\n"; + + str << "\t --ltimes_num_g [default is 32]\n" + << "\t (num_g used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_g 64\n\n"; + + str << "\t --ltimes_num_m [default is 25]\n" + << "\t (num_m used in ltimes kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --ltimes_num_m 100\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index fc622445b..1ea8a048c 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -157,6 +157,10 @@ class RunParams { Index_type getMultiReduceNumBins() const { return multi_reduce_num_bins; } BinAssignmentAlgorithm getMultiReduceBinAssignmentAlgorithm() const { return multi_reduce_bin_assignment_algorithm; } + Index_type getLtimesNumD() const { return ltimes_num_d; } + Index_type getLtimesNumG() const { return ltimes_num_g; } + Index_type getLtimesNumM() const { return ltimes_num_m; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -292,6 +296,10 @@ class RunParams { Index_type multi_reduce_num_bins; /*!< number of bins used in multi reduction kernels (input option) */ 
BinAssignmentAlgorithm multi_reduce_bin_assignment_algorithm; /*!< algorithm used to assign bins to iterates used in multi reduction kernels (input option) */ + Index_type ltimes_num_d; /*!< num_d used in ltimes kernels (input option) */ + Index_type ltimes_num_g; /*!< num_g used in ltimes kernels (input option) */ + Index_type ltimes_num_m; /*!< num_m used in ltimes kernels (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ std::vector atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ From a6d0dca5d9e0ad53296169bc285dea1147ce984c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 10:09:50 -0700 Subject: [PATCH 444/454] Add cli --array_of_ptrs_array_size option --- src/basic/ARRAY_OF_PTRS.cpp | 2 +- src/basic/ARRAY_OF_PTRS.hpp | 2 -- src/common/RunParams.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 8 ++++++++ 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index f95c7e384..9f0995d0d 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -26,7 +26,7 @@ ARRAY_OF_PTRS::ARRAY_OF_PTRS(const RunParams& params) setActualProblemSize( getTargetProblemSize() ); - m_array_size = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + m_array_size = params.getArrayOfPtrsArraySize(); setItsPerRep( getActualProblemSize() ); setKernelsPerRep(1); diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index 26ac4c78e..029353f45 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -24,8 +24,6 @@ #ifndef RAJAPerf_Basic_ARRAY_OF_PTRS_HPP #define RAJAPerf_Basic_ARRAY_OF_PTRS_HPP -#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 - #define ARRAY_OF_PTRS_DATA_SETUP_X_ARRAY \ for (Index_type a = 0; a < array_size; ++a) { \ x[a] = x_data + a * iend ; \ diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index a9869ba0c..31546bfa2 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -43,6 +43,7 @@ RunParams::RunParams(int argc, char** argv) ltimes_num_d(64), ltimes_num_g(32), ltimes_num_m(25), + array_of_ptrs_array_size(ARRAY_OF_PTRS_MAX_ARRAY_SIZE), gpu_stream(1), gpu_block_sizes(), atomic_replications(), @@ -133,6 +134,8 @@ void RunParams::print(std::ostream& str) const str << "\n ltimes_num_g = " << ltimes_num_g; str << "\n ltimes_num_m = " << ltimes_num_m; + str << "\n array_of_ptrs_array_size = " << array_of_ptrs_array_size; + str << "\n gpu stream = " << ((gpu_stream == 0) ? 
"0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -571,6 +574,33 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--array_of_ptrs_array_size") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + long long max_num = ARRAY_OF_PTRS_MAX_ARRAY_SIZE; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else if ( num > max_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at most " << max_num + << std::endl; + input_state = BadInput; + } else { + array_of_ptrs_array_size = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1381,6 +1411,13 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --ltimes_num_m 100\n\n"; + str << "\t --array_of_ptrs_array_size [default is " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << "]\n" + << "\t (array size used in ARRAY_OF_PTRS kernel)\n" + << "\t Must be greater than 0.\n" + << "\t Must be less than or equal to " << ARRAY_OF_PTRS_MAX_ARRAY_SIZE << ".\n"; + str << "\t\t Example...\n" + << "\t\t --array_of_ptrs_array_size 4\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 1ea8a048c..5b5bac637 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -18,6 +18,10 @@ #include "RAJAPerfSuite.hpp" #include "RPTypes.hpp" + +#define ARRAY_OF_PTRS_MAX_ARRAY_SIZE 26 + + namespace rajaperf { @@ -161,6 +165,8 @@ class RunParams { Index_type getLtimesNumG() const { return ltimes_num_g; } Index_type getLtimesNumM() const { return ltimes_num_m; } + Index_type getArrayOfPtrsArraySize() const { return array_of_ptrs_array_size; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -300,6 +306,8 @@ class RunParams { Index_type ltimes_num_g; /*!< num_g used in ltimes kernels (input option) */ Index_type ltimes_num_m; /*!< num_m used in ltimes kernels (input option) */ + Index_type array_of_ptrs_array_size; /*!< number of pointers used in ARRAY_OF_PTRS kernel (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ std::vector atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ From e870462a8c45c4e8f90aefd4a5e56273f9c1ca16 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 10:21:36 -0700 Subject: [PATCH 445/454] Add cli --halo_width and --halo_num_vars options --- src/comm/HALO_EXCHANGE.cpp | 2 +- src/comm/HALO_EXCHANGE_FUSED.cpp | 2 +- src/comm/HALO_PACKING.cpp | 2 +- src/comm/HALO_PACKING_FUSED.cpp | 2 +- src/comm/HALO_SENDRECV.cpp | 2 +- src/comm/HALO_base.cpp | 4 +-- src/comm/HALO_base.hpp | 2 -- src/common/RunParams.cpp | 59 ++++++++++++++++++++++++++++++++ src/common/RunParams.hpp | 6 ++++ 9 files changed, 71 insertions(+), 10 deletions(-) 
diff --git a/src/comm/HALO_EXCHANGE.cpp b/src/comm/HALO_EXCHANGE.cpp index 00a3bc5c4..bbca8851f 100644 --- a/src/comm/HALO_EXCHANGE.cpp +++ b/src/comm/HALO_EXCHANGE.cpp @@ -26,7 +26,7 @@ HALO_EXCHANGE::HALO_EXCHANGE(const RunParams& params) setDefaultReps(200); - m_num_vars = s_num_vars_default; + m_num_vars = params.getHaloNumVars(); m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); diff --git a/src/comm/HALO_EXCHANGE_FUSED.cpp b/src/comm/HALO_EXCHANGE_FUSED.cpp index b691f0df7..be76571a2 100644 --- a/src/comm/HALO_EXCHANGE_FUSED.cpp +++ b/src/comm/HALO_EXCHANGE_FUSED.cpp @@ -26,7 +26,7 @@ HALO_EXCHANGE_FUSED::HALO_EXCHANGE_FUSED(const RunParams& params) setDefaultReps(200); - m_num_vars = s_num_vars_default; + m_num_vars = params.getHaloNumVars(); m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index 3287d1280..f1569d3aa 100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -20,7 +20,7 @@ HALO_PACKING::HALO_PACKING(const RunParams& params) { setDefaultReps(200); - m_num_vars = s_num_vars_default; + m_num_vars = params.getHaloNumVars(); m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index bbc58f581..93d29dfbc 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -20,7 +20,7 @@ HALO_PACKING_FUSED::HALO_PACKING_FUSED(const RunParams& params) { setDefaultReps(200); - m_num_vars = s_num_vars_default; + m_num_vars = params.getHaloNumVars(); m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); diff --git a/src/comm/HALO_SENDRECV.cpp b/src/comm/HALO_SENDRECV.cpp index a7e2c51cc..0c57b2c3a 100644 --- a/src/comm/HALO_SENDRECV.cpp +++ b/src/comm/HALO_SENDRECV.cpp @@ -26,7 +26,7 @@ HALO_SENDRECV::HALO_SENDRECV(const RunParams& params) setDefaultReps(200); - m_num_vars = s_num_vars_default; + m_num_vars = params.getHaloNumVars(); m_var_size = m_grid_plus_halo_size ; setItsPerRep( m_num_vars * (m_var_size - getActualProblemSize()) ); diff --git a/src/comm/HALO_base.cpp b/src/comm/HALO_base.cpp index 6455a1e1b..84845114c 100644 --- a/src/comm/HALO_base.cpp +++ b/src/comm/HALO_base.cpp @@ -20,8 +20,6 @@ namespace comm { Index_type HALO_base::s_grid_dims_default[3] {100, 100, 100}; -Index_type HALO_base::s_halo_width_default = 1; -Index_type HALO_base::s_num_vars_default = 3; HALO_base::HALO_base(KernelID kid, const RunParams& params) : KernelBase(kid, params) @@ -35,7 +33,7 @@ HALO_base::HALO_base(KernelID kid, const RunParams& params) m_grid_dims[0] = cbrt_run_size; m_grid_dims[1] = cbrt_run_size; m_grid_dims[2] = cbrt_run_size; - m_halo_width = s_halo_width_default; + m_halo_width = params.getHaloWidth(); m_grid_plus_halo_dims[0] = m_grid_dims[0] + 2*m_halo_width; m_grid_plus_halo_dims[1] = m_grid_dims[1] + 2*m_halo_width; diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index 1c179966a..fea021a87 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -127,8 +127,6 @@ class HALO_base : public KernelBase static const int s_boundary_offsets[s_num_neighbors][3]; static Index_type s_grid_dims_default[3]; - static Index_type s_halo_width_default; - static Index_type s_num_vars_default; Index_type m_grid_dims[3]; Index_type m_halo_width; diff --git 
a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 31546bfa2..351db3514 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -44,6 +44,8 @@ RunParams::RunParams(int argc, char** argv) ltimes_num_g(32), ltimes_num_m(25), array_of_ptrs_array_size(ARRAY_OF_PTRS_MAX_ARRAY_SIZE), + halo_width(1), + halo_num_vars(3), gpu_stream(1), gpu_block_sizes(), atomic_replications(), @@ -136,6 +138,9 @@ void RunParams::print(std::ostream& str) const str << "\n array_of_ptrs_array_size = " << array_of_ptrs_array_size; + str << "\n halo_width = " << halo_width; + str << "\n halo_num_vars = " << halo_num_vars; + str << "\n gpu stream = " << ((gpu_stream == 0) ? "0" : "RAJA default"); str << "\n gpu_block_sizes = "; for (size_t j = 0; j < gpu_block_sizes.size(); ++j) { @@ -601,6 +606,48 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--halo_width") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_width = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + + } else if ( opt == std::string("--halo_num_vars") ) { + + i++; + if ( i < argc ) { + long long num = ::atoll( argv[i] ); + long long min_num = 1; + if ( num < min_num ) { + getCout() << "\nBad input:" + << " must give " << opt << " a value of at least " << min_num + << std::endl; + input_state = BadInput; + } else { + halo_num_vars = num; + } + } else { + getCout() << "\nBad input:" + << " must give " << opt << " a value (int)" + << std::endl; + input_state = BadInput; + } + } else if ( opt == std::string("--gpu_stream_0") ) { gpu_stream = 0; @@ -1418,6 +1465,18 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t\t Example...\n" << "\t\t --array_of_ptrs_array_size 4\n\n"; + str << "\t --halo_width [default is 1]\n" + << "\t (halo width used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_width 2\n\n"; + + str << "\t --halo_num_vars [default is 3]\n" + << "\t (num vars used in halo kernels)\n" + << "\t Must be greater than 0.\n"; + str << "\t\t Example...\n" + << "\t\t --halo_num_vars 10\n\n"; + str << "\t --seq-data-space, -sds [Default is Host]\n" << "\t (name of data space to use for sequential variants)\n" << "\t Valid data space names are 'Host' or 'CudaPinned'\n"; diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 5b5bac637..75a3e0e11 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -167,6 +167,9 @@ class RunParams { Index_type getArrayOfPtrsArraySize() const { return array_of_ptrs_array_size; } + Index_type getHaloWidth() const { return halo_width; } + Index_type getHaloNumVars() const { return halo_num_vars; } + int getGPUStream() const { return gpu_stream; } size_t numValidGPUBlockSize() const { return gpu_block_sizes.size(); } bool validGPUBlockSize(size_t block_size) const @@ -308,6 +311,9 @@ class RunParams { Index_type array_of_ptrs_array_size; /*!< number of pointers used in ARRAY_OF_PTRS kernel (input option) */ + Index_type halo_width; /*!< halo width used in halo kernels (input option) */ + Index_type halo_num_vars; /*!< num vars used in halo kernels (input option) */ + int gpu_stream; /*!< 0 -> use stream 0; 
anything else -> use raja default stream */ std::vector gpu_block_sizes; /*!< Block sizes for gpu tunings to run (input option) */ std::vector atomic_replications; /*!< Atomic replications for gpu tunings to run (input option) */ From 1efb7f2d9d1888e56d8803b6fe9bf374d2a78d5f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 11:11:36 -0700 Subject: [PATCH 446/454] Fix zero size array in seq_for When the type list Ts is empty, the braced initializer has no elements and seq_unused_array becomes a zero-length array, which is ill-formed C++. Seeding the list with a leading 0 guarantees at least one element. --- src/rajaperf_config.hpp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in index 45cafeff1..3d2588378 100644 --- a/src/rajaperf_config.hpp.in +++ b/src/rajaperf_config.hpp.in @@ -156,7 +156,7 @@ template inline void seq_for(camp::list const&, Func&& func) { // braced init lists are evaluated in order - int seq_unused_array[] = {(func(Ts{}), 0)...}; + int seq_unused_array[] = {0, (func(Ts{}), 0)...}; RAJAPERF_UNUSED_VAR(seq_unused_array); } From 2c1765f743b6d8217ebb72c4bd941818917ca645 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 11:14:31 -0700 Subject: [PATCH 447/454] Change handling of default gpu replications in multi-reductions Now the default gpu replication lists are empty. The default gpu tuning parameters are run if atomic replications are not set, and in that case the tuning values are not included in the tuning name. This allows easier comparison between backends as the tuning names are now the same in the default case. --- src/algorithm/HISTOGRAM-Cuda.cpp | 26 ++++++++++++++++++++++++++ src/algorithm/HISTOGRAM-Hip.cpp | 26 ++++++++++++++++++++++++++ src/algorithm/HISTOGRAM.hpp | 8 ++++---- src/basic/MULTI_REDUCE-Cuda.cpp | 26 ++++++++++++++++++++++++++ src/basic/MULTI_REDUCE-Hip.cpp | 26 ++++++++++++++++++++++++++ src/basic/MULTI_REDUCE.hpp | 8 ++++---- src/common/GPUUtils.hpp | 4 ++-- 7 files changed, 114 insertions(+), 10 deletions(-) diff --git a/src/algorithm/HISTOGRAM-Cuda.cpp b/src/algorithm/HISTOGRAM-Cuda.cpp index 6170634cb..0bc363ee3 100644 --- a/src/algorithm/HISTOGRAM-Cuda.cpp +++ b/src/algorithm/HISTOGRAM-Cuda.cpp @@ -280,6 +280,23 @@ void HISTOGRAM::runCudaVariant(VariantID vid, size_t tune_idx) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || @@ -336,6 +353,15 @@ void HISTOGRAM::setCudaTuningDefinitions(VariantID vid) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || diff --git a/src/algorithm/HISTOGRAM-Hip.cpp b/src/algorithm/HISTOGRAM-Hip.cpp index e51a7dd17..5a25bca5c 100644 --- a/src/algorithm/HISTOGRAM-Hip.cpp +++ b/src/algorithm/HISTOGRAM-Hip.cpp @@ -309,6 +309,23 @@ void HISTOGRAM::runHipVariant(VariantID vid, size_t tune_idx) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + seq_for(hip_atomic_global_replications_type{}, [&](auto
global_replication) { if (run_params.numValidAtomicReplication() == 0u || @@ -365,6 +382,15 @@ void HISTOGRAM::setHipTuningDefinitions(VariantID vid) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 65cd27647..58b335368 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -125,13 +125,13 @@ class HISTOGRAM : public KernelBase static const size_t default_cuda_atomic_global_replication = 2; static const size_t default_cuda_atomic_shared_replication = 16; - using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty static const size_t default_hip_atomic_global_replication = 32; static const size_t default_hip_atomic_shared_replication = 4; - using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty Index_type m_num_bins; Index_ptr m_bins; diff --git a/src/basic/MULTI_REDUCE-Cuda.cpp b/src/basic/MULTI_REDUCE-Cuda.cpp index 6dd74bbb0..fa52f9e99 100644 --- a/src/basic/MULTI_REDUCE-Cuda.cpp +++ b/src/basic/MULTI_REDUCE-Cuda.cpp @@ -195,6 +195,23 @@ void MULTI_REDUCE::runCudaVariant(VariantID vid, size_t tune_idx) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runCudaVariantAtomicRuntime(vid); + + } + + t += 1; + + } + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || @@ -243,6 +260,15 @@ void MULTI_REDUCE::setCudaTuningDefinitions(VariantID vid) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + seq_for(cuda_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || diff --git a/src/basic/MULTI_REDUCE-Hip.cpp b/src/basic/MULTI_REDUCE-Hip.cpp index a46ad9861..e2106a79e 100644 --- a/src/basic/MULTI_REDUCE-Hip.cpp +++ b/src/basic/MULTI_REDUCE-Hip.cpp @@ -195,6 +195,23 @@ void MULTI_REDUCE::runHipVariant(VariantID vid, size_t tune_idx) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + if (tune_idx == t) { + + setBlockSize(block_size); + runHipVariantAtomicRuntime(vid); + + } + + t += 1; + + } + 
seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || @@ -243,6 +260,15 @@ void MULTI_REDUCE::setHipTuningDefinitions(VariantID vid) seq_for(gpu_mapping::reducer_helpers{}, [&](auto mapping_helper) { + if (camp::size::value == 0 && + camp::size::value == 0 ) { + + addVariantTuningName(vid, "atomic_"+ + decltype(mapping_helper)::get_name()+"_"+ + std::to_string(block_size)); + + } + seq_for(hip_atomic_global_replications_type{}, [&](auto global_replication) { if (run_params.numValidAtomicReplication() == 0u || diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 54d2f0467..089fe0dad 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -117,13 +117,13 @@ class MULTI_REDUCE : public KernelBase static const size_t default_cuda_atomic_global_replication = 2; static const size_t default_cuda_atomic_shared_replication = 16; - using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + using cuda_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using cuda_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty static const size_t default_hip_atomic_global_replication = 32; static const size_t default_hip_atomic_shared_replication = 4; - using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type; - using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type; + using hip_atomic_global_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty + using hip_atomic_shared_replications_type = integer::make_atomic_replication_list_type<0>; // default list is empty Index_type m_num_bins; Index_ptr m_bins; diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp index f2189e0d6..dcf309ec9 100644 --- a/src/common/GPUUtils.hpp +++ b/src/common/GPUUtils.hpp @@ -156,7 +156,7 @@ using make_gpu_block_size_list_type = // If atomic_replications from the configuration is not empty it is those atomic_replications, // otherwise it is a list containing just default_atomic_replication. // Invalid entries are removed according to validity_checker in either case. -template < size_t default_atomic_replication, typename validity_checker = AllowAny > +template < size_t default_atomic_replication, typename validity_checker = PositiveOnly > using make_atomic_replication_list_type = typename detail::remove_invalid::value > 0), @@ -169,7 +169,7 @@ using make_atomic_replication_list_type = // If gpu_items_per_thread from the configuration is not empty it is those gpu_items_per_thread, // otherwise it is a list containing just default_gpu_items_per_thread. // Invalid entries are removed according to validity_checker in either case. -template < size_t default_gpu_items_per_thread, typename validity_checker = AllowAny > +template < size_t default_gpu_items_per_thread, typename validity_checker = PositiveOnly > using make_gpu_items_per_thread_list_type = typename detail::remove_invalid::value > 0), From a0f55e050688671bed50106975167ee52f5dbf33 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Jul 2024 11:58:05 -0700 Subject: [PATCH 448/454] Change handling of default items per thread in scans Now the default gpu items-per-thread lists are empty.
The default gpu tuning parameters are run if items per thread is not set, and in that case the tuning values are not included in the tuning name. This allows easier comparison between backends as the tuning names are now the same in the default case. --- src/algorithm/SCAN-Cuda.cpp | 35 ++++++++++++++++++++++++++++++----- src/algorithm/SCAN-Hip.cpp | 35 ++++++++++++++++++++++++++++++----- src/basic/INDEXLIST-Cuda.cpp | 35 ++++++++++++++++++++++++++++++----- src/basic/INDEXLIST-Hip.cpp | 35 ++++++++++++++++++++++++++++++----- 4 files changed, 120 insertions(+), 20 deletions(-) diff --git a/src/algorithm/SCAN-Cuda.cpp b/src/algorithm/SCAN-Cuda.cpp index 29715478e..977c91e24 100644 --- a/src/algorithm/SCAN-Cuda.cpp +++ b/src/algorithm/SCAN-Cuda.cpp @@ -27,7 +27,7 @@ namespace algorithm template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread::value, + detail::cuda::grid_scan_max_items_per_thread::value+1, integer::LessEqual::value>>; @@ -210,7 +210,24 @@ void SCAN::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runCudaVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { @@ -253,13 +270,21 @@ void SCAN::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)+ - "_itemsPerThread_"+std::to_string(items_per_thread)); + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); } diff --git a/src/algorithm/SCAN-Hip.cpp b/src/algorithm/SCAN-Hip.cpp index 9262b2be8..22f0bea57 100644 --- a/src/algorithm/SCAN-Hip.cpp +++ b/src/algorithm/SCAN-Hip.cpp @@ -32,7 +32,7 @@ namespace algorithm template < size_t block_size > using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread::value, + detail::hip::grid_scan_max_items_per_thread::value+1, integer::LessEqual::value>>; @@ -237,7 +237,24 @@ void SCAN::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runHipVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { @@ -279,13 +296,21 @@ void SCAN::setHipTuningDefinitions(VariantID vid) if
(run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)+ - "_itemsPerThread_"+std::to_string(items_per_thread)); + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); } diff --git a/src/basic/INDEXLIST-Cuda.cpp b/src/basic/INDEXLIST-Cuda.cpp index b331d5a7c..afcb54176 100644 --- a/src/basic/INDEXLIST-Cuda.cpp +++ b/src/basic/INDEXLIST-Cuda.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using cuda_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::cuda::grid_scan_default_items_per_thread::value, + detail::cuda::grid_scan_max_items_per_thread::value+1, integer::LessEqual::value>>; @@ -142,7 +142,24 @@ void INDEXLIST::runCudaVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runCudaVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { @@ -179,13 +196,21 @@ void INDEXLIST::setCudaTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(cuda_items_per_thread_type{}, [&](auto items_per_thread) { + using cuda_items_per_thread = cuda_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(cuda_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)+ - "_itemsPerThread_"+std::to_string(items_per_thread)); + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); } diff --git a/src/basic/INDEXLIST-Hip.cpp b/src/basic/INDEXLIST-Hip.cpp index baa164e3c..3be527d35 100644 --- a/src/basic/INDEXLIST-Hip.cpp +++ b/src/basic/INDEXLIST-Hip.cpp @@ -24,7 +24,7 @@ namespace basic template < size_t block_size > using hip_items_per_thread_type = integer::make_gpu_items_per_thread_list_type< - detail::hip::grid_scan_default_items_per_thread::value, + detail::hip::grid_scan_max_items_per_thread::value+1, integer::LessEqual::value>>; @@ -142,7 +142,24 @@ void INDEXLIST::runHipVariant(VariantID vid, size_t tune_idx) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + if (tune_idx == t) { + + runHipVariantImpl::value + >(vid); + + } + + t += 1; + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { if 
(run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { @@ -179,13 +196,21 @@ void INDEXLIST::setHipTuningDefinitions(VariantID vid) if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { - seq_for(hip_items_per_thread_type{}, [&](auto items_per_thread) { + using hip_items_per_thread = hip_items_per_thread_type; + + if (camp::size::value == 0) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + + } + + seq_for(hip_items_per_thread{}, [&](auto items_per_thread) { if (run_params.numValidItemsPerThread() == 0u || run_params.validItemsPerThread(block_size)) { - addVariantTuningName(vid, "block_"+std::to_string(block_size)+ - "_itemsPerThread_"+std::to_string(items_per_thread)); + addVariantTuningName(vid, "itemsPerThread<"+std::to_string(items_per_thread)+">_" + "block_"+std::to_string(block_size)); } From 5f63ec9cb7f774622a5ee2facf26c85d81b10735 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 5 Aug 2024 09:00:11 -0700 Subject: [PATCH 449/454] Pull in latest changes to RAJA and its submodules --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index fa8b8718d..378199aac 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit fa8b8718d5a407668c597c6e97b8f1fadd55bd0f +Subproject commit 378199aac342ee21c2ddfbcbb48413bd1dfac612 From 262a8f182e57c55e805c459a8ccb18fa11e9933d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Aug 2024 15:59:51 -0700 Subject: [PATCH 450/454] Update cray-mpich amdclang build script --- .../lc-builds/toss4_cray-mpich_amdclang.sh | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 0a8dea853..db9cafa5c 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -58,6 +58,19 @@ echo echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2" echo " -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\"" echo +echo "To work around some issues where *_FUSED kernels crash add these options" +echo " -DCMAKE_CXX_FLAGS=\"-fgpu-rdc\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"-fgpu-rdc\"" +echo +echo "To work around some issues where *_FUSED kernels perform poorly use this environment variable" +echo " env HSA_SCRATCH_SINGLE_LIMIT=4000000000" +echo +echo "To work around some issues where the build fails with a weird error about max or fmax add these options" +echo " -DCMAKE_CXX_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo " -DCMAKE_EXE_LINKER_FLAGS=\"--hip-version={hip_version:ex=6.1.2}\"" +echo + + rm -rf build_${BUILD_SUFFIX} >/dev/null mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} @@ -67,18 +80,27 @@ module load cmake/3.23.1 # unload rocm to avoid configuration problems where the loaded rocm and COMP_VER # are inconsistent causing the rocprim from the module to be used unexpectedly -module unload rocm +module unload rocm rocmcc +if [[ "${COMP_VER}" == *-magic ]]; then + ROCM_PATH="/usr/tce/packages/rocmcc/rocmcc-${COMP_VER}" + MPI_ROCM_PATH="/usr/tce/packages/cray-mpich/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}" +else + ROCM_PATH="/opt/rocm-${COMP_VER}" + MPI_ROCM_PATH=/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER} +fi cmake \ -DCMAKE_BUILD_TYPE=Release \ - 
-DMPI_C_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang" \ - -DMPI_CXX_COMPILER="/usr/tce/packages/cray-mpich-tce/cray-mpich-${MPI_VER}-rocmcc-${COMP_VER}/bin/mpiamdclang++" \ - -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ - -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ - -DHIP_PATH=/opt/rocm-${COMP_VER}/llvm/bin \ - -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang \ - -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/llvm/bin/amdclang++ \ + -DMPI_C_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang" \ + -DMPI_CXX_COMPILER="${MPI_ROCM_PATH}/bin/mpiamdclang++" \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/lib/cmake" \ + -DHIP_PLATFORM=amd \ + -DROCM_ROOT_DIR="${ROCM_PATH}" \ + -DHIP_ROOT_DIR="${ROCM_PATH}/hip" \ + -DHIP_PATH="${ROCM_PATH}/llvm/bin" \ + -DCMAKE_C_COMPILER="${ROCM_PATH}/llvm/bin/amdclang" \ + -DCMAKE_CXX_COMPILER="${ROCM_PATH}/llvm/bin/amdclang++" \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ From e5b3b2f4c69aa3ba441f51311ee90af61b3352cf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Aug 2024 16:00:38 -0700 Subject: [PATCH 451/454] Add --warmup-kernels -wk option This overrides the default warmup kernels and runs the kernels specified instead. --- src/common/Executor.cpp | 93 +++++++++++++++++++++++----------------- src/common/RunParams.cpp | 62 ++++++++++++++++++++++++++- src/common/RunParams.hpp | 4 ++ 3 files changed, 118 insertions(+), 41 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 1cf70f658..6a951334a 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -730,64 +730,77 @@ void Executor::runWarmupKernels() getCout() << "\n\nRun warmup kernels...\n"; // - // For kernels to be run, assemble a set of feature IDs + // Get warmup kernels to run from input // - std::set feature_ids; - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kernel = kernels[ik]; + std::set kernel_ids = run_params.getWarmupKernelIDsToRun(); + + if ( kernel_ids.empty() ) { - for (size_t fid = 0; fid < NumFeatures; ++fid) { - FeatureID tfid = static_cast(fid); - if (kernel->usesFeature(tfid) ) { - feature_ids.insert( tfid ); + // + // If no warmup kernels were given, choose a warmup kernel for each feature + // + + // + // For kernels to be run, assemble a set of feature IDs + // + std::set feature_ids; + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase* kernel = kernels[ik]; + + for (size_t fid = 0; fid < NumFeatures; ++fid) { + FeatureID tfid = static_cast(fid); + if (kernel->usesFeature(tfid) ) { + feature_ids.insert( tfid ); + } } - } - - } // iterate over kernels - // - // Map feature IDs to set of warmup kernel IDs - // - std::set kernel_ids; - for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) { + } // iterate over kernels - switch (*fid) { + // + // Map feature IDs to set of warmup kernel IDs + // + for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) { - case Forall: - case Kernel: - case Launch: - kernel_ids.insert(Basic_DAXPY); break; + switch (*fid) { - case Sort: - kernel_ids.insert(Algorithm_SORT); break; - - case Scan: - kernel_ids.insert(Basic_INDEXLIST_3LOOP); break; + case Forall: + case Kernel: + case Launch: + kernel_ids.insert(Basic_DAXPY); break; - case Workgroup: - kernel_ids.insert(Comm_HALO_PACKING_FUSED); break; + case Sort: + kernel_ids.insert(Algorithm_SORT); break; - case Reduction: - kernel_ids.insert(Basic_REDUCE3_INT); break; + case Scan: + 
kernel_ids.insert(Basic_INDEXLIST_3LOOP); break; - case Atomic: - kernel_ids.insert(Basic_PI_ATOMIC); break; + case Workgroup: + kernel_ids.insert(Comm_HALO_PACKING_FUSED); break; - case View: - break; + case Reduction: + kernel_ids.insert(Basic_REDUCE3_INT); break; -#ifdef RAJA_PERFSUITE_ENABLE_MPI - case MPI: - kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break; -#endif + case Atomic: + kernel_ids.insert(Basic_PI_ATOMIC); break; - default: - break; + case View: + break; + + #ifdef RAJA_PERFSUITE_ENABLE_MPI + case MPI: + kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break; + #endif + + default: + break; + + } } } + // // Run warmup kernels // diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 351db3514..dd62c6177 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -57,6 +57,8 @@ RunParams::RunParams(int argc, char** argv) checkrun_reps(1), reference_variant(), reference_vid(NumVariants), + warmup_kernel_input(), + invalid_warmup_kernel_input(), kernel_input(), invalid_kernel_input(), exclude_kernel_input(), @@ -195,6 +197,15 @@ void RunParams::print(std::ostream& str) const str << "\n hip MPI data space = " << getDataSpaceName(hipMPIDataSpace); str << "\n kokkos MPI data space = " << getDataSpaceName(kokkosMPIDataSpace); + str << "\n warmup_kernel_input = "; + for (size_t j = 0; j < warmup_kernel_input.size(); ++j) { + str << "\n\t" << warmup_kernel_input[j]; + } + str << "\n invalid_warmup_kernel_input = "; + for (size_t j = 0; j < invalid_warmup_kernel_input.size(); ++j) { + str << "\n\t" << invalid_warmup_kernel_input[j]; + } + str << "\n kernel_input = "; for (size_t j = 0; j < kernel_input.size(); ++j) { str << "\n\t" << kernel_input[j]; @@ -789,6 +800,22 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) input_state = BadInput; } + } else if ( opt == std::string("--warmup-kernels") || + opt == std::string("-wk") ) { + + bool done = false; + i++; + while ( i < argc && !done ) { + opt = std::string(argv[i]); + if ( opt.at(0) == '-' ) { + i--; + done = true; + } else { + warmup_kernel_input.push_back(opt); + ++i; + } + } + } else if ( opt == std::string("--kernels") || opt == std::string("-k") ) { @@ -1942,6 +1969,38 @@ void RunParams::processKernelInput() // // ================================================================ + run_warmup_kernels.clear(); + + if ( !warmup_kernel_input.empty() ) { + + // + // Need to parse input to determine which warmup kernels to run + // + + // + // Look for matching names of individual kernels in warmup_kernel_input. + // + for (auto it = warmup_kernel_input.begin(); it != warmup_kernel_input.end(); ++it) + { + bool found_it = false; + + for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { + KernelID tkid = static_cast(kid); + if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { + run_warmup_kernels.insert(tkid); + found_it = true; + } + } + + // Assemble invalid input for output message. 
+ if ( !found_it ) { + invalid_warmup_kernel_input.push_back(*it); + } + + } // iterate over kernel name input + + } + run_kernels.clear(); if ( kernel_input.empty() && feature_input.empty() ) { @@ -2091,7 +2150,8 @@ void RunParams::processKernelInput() // Set BadInput state based on invalid kernel input // - if ( !(invalid_kernel_input.empty()) || + if ( !(invalid_warmup_kernel_input.empty()) || + !(invalid_kernel_input.empty()) || !(invalid_exclude_kernel_input.empty()) ) { input_state = BadInput; } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index 75a3e0e11..46cd78f4f 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -252,6 +252,7 @@ class RunParams { bool getDisableWarmup() const { return disable_warmup; } + const std::set& getWarmupKernelIDsToRun() const { return run_warmup_kernels; } const std::set& getKernelIDsToRun() const { return run_kernels; } const std::set& getVariantIDsToRun() const { return run_variants; } VariantID getReferenceVariantID() const { return reference_vid; } @@ -360,6 +361,8 @@ class RunParams { // Arrays to hold input strings for valid/invalid input. Helpful for // debugging command line args. // + std::vector warmup_kernel_input; + std::vector invalid_warmup_kernel_input; std::vector kernel_input; std::vector invalid_kernel_input; std::vector exclude_kernel_input; @@ -390,6 +393,7 @@ class RunParams { bool disable_warmup; + std::set run_warmup_kernels; std::set run_kernels; std::set run_variants; From ab594143669ff20226cf88b2b9f57f1c77b06cbb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Aug 2024 16:33:04 -0700 Subject: [PATCH 452/454] support exclusion and groups in warmup kernels --- src/common/RunParams.cpp | 47 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index dd62c6177..450b6780c 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1977,24 +1977,63 @@ void RunParams::processKernelInput() // Need to parse input to determine which warmup kernels to run // + // Make list copy of warmup kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist warmup_kern_names(warmup_kernel_input.begin(), warmup_kernel_input.end()); + // - // Look for matching names of individual kernels in warmup_kernel_input. + // Search warmup_kern_names for matching group names. + // warmup_groups2run will contain names of groups to run. // - for (auto it = warmup_kernel_input.begin(); it != warmup_kernel_input.end(); ++it) + Svector warmup_groups2run; + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) + { + for (size_t ig = 0; ig < NumGroups; ++ig) { + const std::string& group_name = getGroupName(static_cast(ig)); + if ( group_name == *it ) { + warmup_groups2run.push_back(group_name); + } + } + } + + // + // If group name(s) found in warmup_kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from warmup_kern_names list. 
+ // + for (size_t ig = 0; ig < warmup_groups2run.size(); ++ig) { + const std::string& gname(warmup_groups2run[ig]); + + for (size_t kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + if ( getFullKernelName(tkid).find(gname) != std::string::npos && + exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } + } + + warmup_kern_names.remove(gname); + } + + // + // Look for matching names of individual kernels in remaining warmup_kern_names. + // + for (Slist::iterator it = warmup_kern_names.begin(); it != warmup_kern_names.end(); ++it) { bool found_it = false; for (size_t kid = 0; kid < NumKernels && !found_it; ++kid) { KernelID tkid = static_cast(kid); if ( getKernelName(tkid) == *it || getFullKernelName(tkid) == *it ) { - run_warmup_kernels.insert(tkid); + if (exclude_kernels.find(tkid) == exclude_kernels.end()) { + run_warmup_kernels.insert(tkid); + } found_it = true; } } // Assemble invalid input for output message. if ( !found_it ) { - invalid_warmup_kernel_input.push_back(*it); + invalid_kernel_input.push_back(*it); } } // iterate over kernel name input From d8ca744c48c09f592581cec0b208300a963ea0f7 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Aug 2024 16:33:18 -0700 Subject: [PATCH 453/454] Add help documentation for warmup kernels --- src/common/RunParams.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 450b6780c..1bbff3372 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -1315,6 +1315,15 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\t --disable-warmup (disable warmup kernels) [Default is run warmup kernels that are relevant to kernels selected to run]\n\n"; + str << "\t --warmup-kernels, -wk [Default is run warmup kernels that are relevant to kernels selected to run]\n" + << "\t (names of individual kernels and/or groups of kernels to warmup)\n" + << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n" + << "\t Kernel names are listed as _.\n"; + str << "\t\t Examples...\n" + << "\t\t --warmup-kernels Polybench (warmup all kernels in Polybench group)\n" + << "\t\t -wk INIT3 MULADDSUB (warmup INIT3 and MULADDSUB kernels)\n" + << "\t\t -wk INIT3 Apps (warmup INIT3 kernel and all kernels in Apps group)\n\n"; + str << "\t --kernels, -k [Default is run all]\n" << "\t (names of individual kernels and/or groups of kernels to run)\n" << "\t See '--print-kernels'/'-pk' option for list of valid kernel and group names.\n" From f03b34c71da0db4afe0e0edc79337dc734126043 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 7 Aug 2024 16:34:30 -0700 Subject: [PATCH 454/454] fix invalid warmup kernel output --- src/common/RunParams.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 1bbff3372..1665783a9 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -2042,7 +2042,7 @@ void RunParams::processKernelInput() // Assemble invalid input for output message. if ( !found_it ) { - invalid_kernel_input.push_back(*it); + invalid_warmup_kernel_input.push_back(*it); } } // iterate over kernel name input
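Taken together, patches 451 through 454 make warmup selection mirror the existing kernel selection, including group names and exclusions. A few usage sketches; the executable name is illustrative, while the flags and the kernel/group names come from the diffs and help text above:

    ./raja-perf.exe -wk Basic_DAXPY -k Apps        # warm up with DAXPY only, then run the Apps group
    ./raja-perf.exe --warmup-kernels Polybench     # warm up with every kernel in the Polybench group
    ./raja-perf.exe --disable-warmup -k INIT3      # run INIT3 with no warmup at all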