From 1594a507f379fd5b6d1fa46a2dffbd7b65d156fa Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 26 Apr 2022 12:07:04 -0500 Subject: [PATCH 01/81] adding inital infrastructure for MFMA test --- src/CMakeLists.txt | 3 + src/basic/CMakeLists.txt | 6 ++ src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 109 ++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 106 +++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 72 ++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 39 ++++++++ src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 66 +++++++++++++ src/basic/MAT_FUSED_MUL_ADD.cpp | 98 +++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD.hpp | 88 +++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 594 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Hip.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMP.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Seq.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bc1bf6b77..06a80317c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,6 +94,9 @@ blt_add_executable( basic/INIT_VIEW1D_OFFSET.cpp basic/INIT_VIEW1D_OFFSET-Seq.cpp basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + basic/MAT_FUSED_MUL_ADD.cpp + basic/MAT_FUSED_MUL_ADD-Seq.cpp + basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp basic/MAT_MAT_SHARED.cpp basic/MAT_MAT_SHARED-Seq.cpp basic/MAT_MAT_SHARED-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index ceeb1a502..8d2cee87b 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -56,6 +56,12 @@ blt_add_library( INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + MAT_FUSED_MUL_ADD.cpp + MAT_FUSED_MUL_ADD-Seq.cpp + MAT_FUSED_MUL_ADD-Hip.cpp + MAT_FUSED_MUL_ADD-Cuda.cpp + MAT_FUSED_MUL_ADD-OMP.cpp + MAT_FUSED_MUL_ADD-OMPTarget.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp new file mode 100644 index 000000000..33e157080 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitCudaDeviceData(A, m_A, N); \ + allocAndInitCudaDeviceData(B, m_B, N); \ + allocAndInitCudaDeviceData(D, m_D, N); \ + allocAndInitCudaDeviceData(Ae, m_Ae, Ne); \ + allocAndInitCudaDeviceData(Be, m_Be, Ne); \ + allocAndInitCudaDeviceData(De, m_De, Ne); + + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_A, A, N); \ + getCudaDeviceData(m_B, B, N); \ + getCudaDeviceData(m_D, D, N); \ + getCudaDeviceData(m_Ae, Ae, Ne); \ + getCudaDeviceData(m_Be, Be, Ne); \ + getCudaDeviceData(m_De, De, Ne); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(Ae); \ + deallocCudaDeviceData(Be); \ + deallocCudaDeviceData(De); + + +template < Index_type block_size > + __launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == Lambda_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp new file mode 100644 index 000000000..528b629df --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); \ + allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ + allocAndInitHipDeviceData(Be, m_Be, Ne); \ + allocAndInitHipDeviceData(De, m_De, Ne); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ + getHipDeviceData(m_Ae, Ae, Ne); \ + getHipDeviceData(m_Be, Be, Ne); \ + getHipDeviceData(m_De, De, Ne); \ + deallocHipDeviceData(A); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(D); \ + deallocHipDeviceData(Ae); \ + deallocHipDeviceData(Be); \ + deallocHipDeviceData(De); + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == Lambda_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == RAJA_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp new file mode 100644 index 000000000..0651e3ccd --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -0,0 +1,72 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp new file mode 100644 index 000000000..e6c5933eb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + + + void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; + break; + } + } + } + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp new file mode 100644 index 000000000..d1b11a9c1 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -0,0 +1,66 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // number of iterations + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { + + + startTimer(); + for (Index_type irep = 0; irep < run_reps; ++irep) { + } // irep + stopTimer(); + + break; + } + + case RAJA_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp new file mode 100644 index 000000000..35db9553b --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) + : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) +{ + m_N_default = 1000; + setDefaultProblemSize(m_N_default*m_N_default); + setDefaultReps(5); + + //If problem target size is not divisible by Ne, round up + m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ + Index_type(m_Ne)); + + setActualProblemSize(m_N*m_N); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep( m_N*m_N*sizeof(Real_type) + + m_N*m_N*sizeof(Real_type) ); + + //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 + setFLOPsPerRep(2*m_N*m_N*m_N - m_N*m_N); + + checksum_scale_factor = 1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + + + setVariantDefined(Base_Seq); + setVariantDefined(Lambda_Seq); + setVariantDefined(RAJA_Seq); + + setVariantDefined(Base_OpenMP); + setVariantDefined(Lambda_OpenMP); + setVariantDefined(RAJA_OpenMP); + + setVariantDefined(Base_CUDA); + setVariantDefined(Lambda_CUDA); + setVariantDefined(RAJA_CUDA); + + setVariantDefined(Base_HIP); + setVariantDefined(Lambda_HIP); + setVariantDefined(RAJA_HIP); +} + +MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} + +void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + //hard coded for 16 at the moment + const Index_type m_Me = m_Ne; + const Index_type m_Ne = m_Ne; + const Index_type m_Ke = m_Ne; + //global matrices + allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); + //element/batch matrices + 
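+  // Ae (Me x Ke), Be (Ke x Ne), and De (Me x Ne) each hold a single per-element
+  // tile; with Me = Ke = Ne = 16 these match the 16x16x16 tile shape targeted by
+  // the GPU matrix-core (MFMA) variants of this kernel.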
allocAndInitDataConst(m_Ae, m_Me * m_Ke, 1.0, vid); + allocAndInitDataConst(m_Be, m_Ke * m_Ne, 1.0, vid); + allocAndInitDataConst(m_De, m_Me * m_Ne, 0.0, vid); +} + +void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_Ae, m_N, checksum_scale_factor ); +} + +void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + deallocData(m_A); + deallocData(m_B); + deallocData(m_D); + deallocData(m_Ae); + deallocData(m_Be); + deallocData(m_De); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp new file mode 100644 index 000000000..caf777934 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -0,0 +1,88 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Compute D = A x B + C, where +// A: a M x K matrix +// B: a K x N matrix +// C, D: M x N matrices +// All square row-major matrices, C is a null matrix and ignored. +// for(int row = 0; row != m; ++row){ +// for(int col = 0; col != n; ++col){ +// +// float sum = 0.0; +// for (int kk = 0; kk < k; ++kk){ +// sum += A[row][kk] * B[kk][col]; +// } +// D[row][col] = sum; +// } +// } +// } + +#ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP +#define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP + +#include "RAJA/RAJA.hpp" +#include "common/KernelBase.hpp" + + +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr D = m_D; \ + Real_ptr Ae = m_Ae; \ + Real_ptr Be = m_Be; \ + Real_ptr De = m_De; + +namespace rajaperf { +class RunParams; + +namespace basic { + +class MAT_FUSED_MUL_ADD : public KernelBase { +public: + MAT_FUSED_MUL_ADD(const RunParams ¶ms); + + ~MAT_FUSED_MUL_ADD(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_A; + Real_ptr m_B; + Real_ptr m_D; + Real_ptr m_Ae; + Real_ptr m_Be; + Real_ptr m_De; + + Index_type m_N; + Index_type m_N_default; + static constexpr Index_type m_Ne = 16; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 1bc10c31d..ecfc2f797 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -25,6 +25,7 @@ #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_FUSED_MUL_ADD.hpp" 
#include "basic/MAT_MAT_SHARED.hpp" #include "basic/MULADDSUB.hpp" #include "basic/NESTED_INIT.hpp" @@ -158,6 +159,7 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MAT_FUSED_MUL_ADD"), std::string("Basic_MAT_MAT_SHARED"), std::string("Basic_MULADDSUB"), std::string("Basic_NESTED_INIT"), @@ -535,6 +537,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INIT_VIEW1D_OFFSET(run_params); break; } + case Basic_MAT_FUSED_MUL_ADD : { + kernel = new basic::MAT_FUSED_MUL_ADD(run_params); + break; + } case Basic_MAT_MAT_SHARED : { kernel = new basic::MAT_MAT_SHARED(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fad672137..ea082325e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -78,6 +78,7 @@ enum KernelID { Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, + Basic_MAT_FUSED_MUL_ADD, Basic_MAT_MAT_SHARED, Basic_MULADDSUB, Basic_NESTED_INIT, From e3d5aac37126b02c64d7bb9ab16ccb90d45a8353 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 13:17:00 -0500 Subject: [PATCH 02/81] working hip mfma with matrix core builtins --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 64 +++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 528b629df..e3c138ace 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,18 +21,19 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ + const Index_type NN = m_N * m_N; \ const Index_type Ne = m_Ne; \ - allocAndInitHipDeviceData(A, m_A, N); \ - allocAndInitHipDeviceData(B, m_B, N); \ - allocAndInitHipDeviceData(D, m_D, N); \ + allocAndInitHipDeviceData(A, m_A, NN); \ + allocAndInitHipDeviceData(B, m_B, NN); \ + allocAndInitHipDeviceData(D, m_D, NN); \ allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ allocAndInitHipDeviceData(Be, m_Be, Ne); \ allocAndInitHipDeviceData(De, m_De, Ne); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, N); \ - getHipDeviceData(m_B, B, N); \ - getHipDeviceData(m_D, D, N); \ + getHipDeviceData(m_A, A, NN); \ + getHipDeviceData(m_B, B, NN); \ + getHipDeviceData(m_D, D, NN); \ getHipDeviceData(m_Ae, Ae, Ne); \ getHipDeviceData(m_Be, Be, Ne); \ getHipDeviceData(m_De, De, Ne); \ @@ -45,31 +46,76 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, - Real_ptr D) { +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D){ + // compute a 16x16x16 matrix multiplication using a single wavefront. 
+#if defined(RP_USE_DOUBLE) + using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; + double4 result = {0}; +#elif defined(RP_USE_FLOAT) + using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; + float4 result = {0}; // zero out 4 vanilla VGPRs +#endif + Index_type a_idx = 16 * threadIdx.x + threadIdx.y; + Index_type b_idx = threadIdx.x + 16 * threadIdx.y; + + for(int i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; + +#ifdef __gfx90a__ +#if defined(RP_USE_DOUBLE) + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#elif defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif +#endif + a_idx += 4; // move four columns to the right + b_idx += 4*16; // move four rows down + } + + #pragma unroll 4 + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + + i * 16 + + threadIdx.y * 4 * 16; + D[d_idx] = result[i]; + } } + + template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + const Index_type NN = m_N * m_N; + //const Index_type Ne = m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDim(16, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_HIP) { + for(Index_type i = 0; i != NN; ++i){ m_A[i] = i; } + for(Index_type i = 0; i != NN; ++i){ m_B[i] = NN - 1 - i; } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D); + hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 332067b79185505f2385bef364a67bd722cff1a6 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 14:46:34 -0500 Subject: [PATCH 03/81] fixing incorrect default problem size --- src/basic/MAT_FUSED_MUL_ADD.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 35db9553b..5f2589f17 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,14 +20,15 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 1000; - setDefaultProblemSize(m_N_default*m_N_default); + m_N_default = 16; + setDefaultProblemSize(m_N_default); setDefaultReps(5); //If problem target size is not divisible by Ne, round up - m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ - Index_type(m_Ne)); +// m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ +// Index_type(m_Ne)); + m_N = 16; setActualProblemSize(m_N*m_N); setItsPerRep(getActualProblemSize()); From 8aa020bb282dffd2ba66e20229bb2f9727afdb78 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 15:01:00 -0500 Subject: [PATCH 04/81] fixing flop calc --- src/basic/MAT_FUSED_MUL_ADD.cpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 5f2589f17..fca7af98b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -34,11 +34,10 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) setItsPerRep(getActualProblemSize()); 
setKernelsPerRep(1); - setBytesPerRep( m_N*m_N*sizeof(Real_type) + - m_N*m_N*sizeof(Real_type) ); + setBytesPerRep( 2*m_N*m_N*sizeof(Real_type)); //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 - setFLOPsPerRep(2*m_N*m_N*m_N - m_N*m_N); + setFLOPsPerRep(2*m_N*m_N*m_N); checksum_scale_factor = 1e-6 * ( static_cast(getDefaultProblemSize()) / @@ -66,23 +65,16 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - - //hard coded for 16 at the moment - const Index_type m_Me = m_Ne; - const Index_type m_Ne = m_Ne; - const Index_type m_Ke = m_Ne; + //global matrices allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); - //element/batch matrices - allocAndInitDataConst(m_Ae, m_Me * m_Ke, 1.0, vid); - allocAndInitDataConst(m_Be, m_Ke * m_Ne, 1.0, vid); - allocAndInitDataConst(m_De, m_Me * m_Ne, 0.0, vid); + } void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Ae, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_N*m_N, checksum_scale_factor ); } void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -90,9 +82,7 @@ void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_A); deallocData(m_B); deallocData(m_D); - deallocData(m_Ae); - deallocData(m_Be); - deallocData(m_De); + } } // end namespace basic From bc9770a1f7b712df5a675a3e6d66db03986aa823 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 15:02:02 -0500 Subject: [PATCH 05/81] fixing some variable names --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 22 +++++++++------------- src/basic/MAT_FUSED_MUL_ADD.hpp | 8 +------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index e3c138ace..5e34fe993 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -22,27 +22,17 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ const Index_type NN = m_N * m_N; \ - const Index_type Ne = m_Ne; \ allocAndInitHipDeviceData(A, m_A, NN); \ allocAndInitHipDeviceData(B, m_B, NN); \ - allocAndInitHipDeviceData(D, m_D, NN); \ - allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ - allocAndInitHipDeviceData(Be, m_Be, Ne); \ - allocAndInitHipDeviceData(De, m_De, Ne); + allocAndInitHipDeviceData(D, m_D, NN); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ getHipDeviceData(m_A, A, NN); \ getHipDeviceData(m_B, B, NN); \ getHipDeviceData(m_D, D, NN); \ - getHipDeviceData(m_Ae, Ae, Ne); \ - getHipDeviceData(m_Be, Be, Ne); \ - getHipDeviceData(m_De, De, Ne); \ deallocHipDeviceData(A); \ deallocHipDeviceData(B); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(Ae); \ - deallocHipDeviceData(Be); \ - deallocHipDeviceData(De); + deallocHipDeviceData(D); template < Index_type block_size > __launch_bounds__(block_size) @@ -91,7 +81,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type N = m_N; const Index_type NN = m_N * m_N; - //const Index_type Ne = m_Ne; dim3 gridDim (1, 1, 1); dim3 blockDim(16, 4, 1); @@ -114,6 +103,13 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; +// 
for(int i = 0; i != A_h.size(); ++i){ +// printf("A_h[%d] = %f\n", i, A_h[i]); +// } + for(int i = 0; i != NN; ++i){ + printf("D[%d] = %f\n", i, m_D[i]); + } + } else if (vid == Lambda_HIP) { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index caf777934..e7c621408 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -33,10 +33,7 @@ #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ - Real_ptr D = m_D; \ - Real_ptr Ae = m_Ae; \ - Real_ptr Be = m_Be; \ - Real_ptr De = m_De; + Real_ptr D = m_D; namespace rajaperf { class RunParams; @@ -73,9 +70,6 @@ class MAT_FUSED_MUL_ADD : public KernelBase { Real_ptr m_A; Real_ptr m_B; Real_ptr m_D; - Real_ptr m_Ae; - Real_ptr m_Be; - Real_ptr m_De; Index_type m_N; Index_type m_N_default; From 774d74211d5eaa17e9ff06e6e191649b22a2872e Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 15:03:52 -0500 Subject: [PATCH 06/81] setting up problem size infrastructure correctly --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 35 ++++++++++++++++------------- src/basic/MAT_FUSED_MUL_ADD.cpp | 22 ++++++++---------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 5e34fe993..2238332cc 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,15 +21,16 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - const Index_type NN = m_N * m_N; \ - allocAndInitHipDeviceData(A, m_A, NN); \ - allocAndInitHipDeviceData(B, m_B, NN); \ - allocAndInitHipDeviceData(D, m_D, NN); + const Index_type Ne = m_Ne; \ + const Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, NN); \ - getHipDeviceData(m_B, B, NN); \ - getHipDeviceData(m_D, D, NN); \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ deallocHipDeviceData(A); \ deallocHipDeviceData(B); \ deallocHipDeviceData(D); @@ -80,7 +81,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type NN = m_N * m_N; + const Index_type Ne = m_Ne; + const Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); dim3 blockDim(16, 4, 1); @@ -89,8 +91,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) if (vid == Base_HIP) { - for(Index_type i = 0; i != NN; ++i){ m_A[i] = i; } - for(Index_type i = 0; i != NN; ++i){ m_B[i] = NN - 1 - i; } + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + //for(Index_type ii = 0; ii != 3; ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -103,12 +108,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; -// for(int i = 0; i != A_h.size(); ++i){ -// printf("A_h[%d] = %f\n", i, A_h[i]); -// } - for(int i = 0; i != NN; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); + for(int i = 0; i != N; ++i){ + printf("A[%d] = %f\n", i, m_A[i]); } +// for(int i = 0; i != NN; ++i){ +// printf("D[%d] = %f\n", i, m_D[i]); +// } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index fca7af98b..d6820cfd8 
100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,24 +20,20 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 16; + m_N_default = 1024; setDefaultProblemSize(m_N_default); setDefaultReps(5); - //If problem target size is not divisible by Ne, round up -// m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ -// Index_type(m_Ne)); - - m_N = 16; - setActualProblemSize(m_N*m_N); + //Make sure problem target size is divisible by Ne*Ne + m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); + setActualProblemSize(m_N); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 2*m_N*m_N*sizeof(Real_type)); + setBytesPerRep(2*m_N*sizeof(Real_type)); + setFLOPsPerRep(2*m_N*m_Ne); - //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 - setFLOPsPerRep(2*m_N*m_N*m_N); checksum_scale_factor = 1e-6 * ( static_cast(getDefaultProblemSize()) / @@ -67,9 +63,9 @@ MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { //global matrices - allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); - allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); - allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); + allocAndInitDataConst(m_A, m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N*m_N, 0.0, vid); } From 83700d6c7949681c69b9bce7298937a697899518 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 15:13:04 -0500 Subject: [PATCH 07/81] fixing results array storage --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 15 ++------------- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD.cpp | 5 ++--- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 33e157080..93503b2d7 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -24,26 +24,15 @@ namespace basic { const Index_type Ne = m_Ne; \ allocAndInitCudaDeviceData(A, m_A, N); \ allocAndInitCudaDeviceData(B, m_B, N); \ - allocAndInitCudaDeviceData(D, m_D, N); \ - allocAndInitCudaDeviceData(Ae, m_Ae, Ne); \ - allocAndInitCudaDeviceData(Be, m_Be, Ne); \ - allocAndInitCudaDeviceData(De, m_De, Ne); - + allocAndInitCudaDeviceData(D, m_D, N); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ getCudaDeviceData(m_A, A, N); \ getCudaDeviceData(m_B, B, N); \ getCudaDeviceData(m_D, D, N); \ - getCudaDeviceData(m_Ae, Ae, Ne); \ - getCudaDeviceData(m_Be, Be, Ne); \ - getCudaDeviceData(m_De, De, Ne); \ deallocCudaDeviceData(A); \ deallocCudaDeviceData(B); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(Ae); \ - deallocCudaDeviceData(Be); \ - deallocCudaDeviceData(De); - + deallocCudaDeviceData(D); template < Index_type block_size > __launch_bounds__(block_size) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2238332cc..6d348b313 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -108,12 +108,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; +// for(int i = 0; i != N; ++i){ +// printf("A[%d] = %f\n", i, m_A[i]); +// } for(int i = 0; i != N; ++i){ - printf("A[%d] = %f\n", i, m_A[i]); + printf("D[%d] = 
%f\n", i, m_D[i]); } -// for(int i = 0; i != NN; ++i){ -// printf("D[%d] = %f\n", i, m_D[i]); -// } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index d6820cfd8..391a03949 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -62,15 +62,14 @@ MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - //global matrices allocAndInitDataConst(m_A, m_N, 1.0, vid); allocAndInitDataConst(m_B, m_N, 1.0, vid); - allocAndInitDataConst(m_D, m_N*m_N, 0.0, vid); + allocAndInitDataConst(m_D, m_N, 0.0, vid); } void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_D, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_N, checksum_scale_factor ); } void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { From 1bbd47368bdfdaf2ab72da0c3026e5d4e295c838 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 16:04:01 -0500 Subject: [PATCH 08/81] finish multi matrix support --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 6d348b313..483160bc7 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -37,17 +37,20 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D){ +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ +constexpr Index_type Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ // compute a 16x16x16 matrix multiplication using a single wavefront. 
#if defined(RP_USE_DOUBLE) using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; double4 result = {0}; #elif defined(RP_USE_FLOAT) using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; - float4 result = {0}; // zero out 4 vanilla VGPRs + float4 result = {0}; #endif - Index_type a_idx = 16 * threadIdx.x + threadIdx.y; - Index_type b_idx = threadIdx.x + 16 * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(int i = 0; i < 4; ++i){ Real_type a = A[a_idx]; @@ -61,19 +64,20 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D #endif #endif a_idx += 4; // move four columns to the right - b_idx += 4*16; // move four rows down + b_idx += 4*Ne; // move four rows down } #pragma unroll 4 for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x - + i * 16 - + threadIdx.y * 4 * 16; + + i * Ne + + threadIdx.y * 4 * Ne + + ii*(Ne*Ne); D[d_idx] = result[i]; } } - +} template < size_t block_size > @@ -102,20 +106,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D); + A, B, D, N); hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; -// for(int i = 0; i != N; ++i){ -// printf("A[%d] = %f\n", i, m_A[i]); -// } - for(int i = 0; i != N; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); - } - - } else if (vid == Lambda_HIP) { From 55918498a2d6b9cbf18d271ca3ea941427710171 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 21:43:49 -0500 Subject: [PATCH 09/81] updating algorithm description --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 +++++----- src/basic/MAT_FUSED_MUL_ADD.cpp | 2 +- src/basic/MAT_FUSED_MUL_ADD.hpp | 34 +++++++++++++++++------------ 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 483160bc7..16fbd1e0a 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,8 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - const Index_type Ne = m_Ne; \ - const Index_type NeNe = m_Ne * m_Ne; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -85,18 +85,17 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; - const Index_type NeNe = m_Ne * m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(16, 4, 1); + dim3 blockDim(Ne, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_HIP) { for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - //for(Index_type ii = 0; ii != 3; ++ii){ for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 391a03949..957e09bfb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -24,7 +24,7 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) setDefaultProblemSize(m_N_default); setDefaultReps(5); - //Make 
sure problem target size is divisible by Ne*Ne + //Make sure problem target size is divisible by 16*16 m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); setActualProblemSize(m_N); diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index e7c621408..552590c1a 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -5,24 +5,30 @@ // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Compute D = A x B + C, where -// A: a M x K matrix -// B: a K x N matrix -// C, D: M x N matrices -// All square row-major matrices, C is a null matrix and ignored. -// for(int row = 0; row != m; ++row){ -// for(int col = 0; col != n; ++col){ -// -// float sum = 0.0; -// for (int kk = 0; kk < k; ++kk){ -// sum += A[row][kk] * B[kk][col]; -// } -// D[row][col] = sum; +// Inputs: +// A: N/(Ne*Ne) Ne x Ne matrices +// B: N/(Ne*Ne) Ne x Ne matrices +// Ouput: +// D: N/(Ne*Ne) Ne x Ne matrices +// All square row-major matrices, C is ignored. +// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(int row = 0; row != Ne; ++row){ +// for(int col = 0; col != Ne; ++col){ +// float dot = 0.0; +// int A_idx = row * Ne + ii*NeNe; +// int B_idx = col + ii*NeNe; +// for(int i = 0; i != Ne; ++i){ +// dot += A[A_idx] * B[B_idx]; +// ++A_idx; +// B_idx += Ne; // } +// D[row * Ne + col] = dot; // } // } - +// return D; +//} +//} #ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP #define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP From 4653c23650e5e546cd17a3291c69390ea9285c0c Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 22:31:07 -0500 Subject: [PATCH 10/81] adding a few more guard rails for fma builtins. kernel still only implemented for gfx908 and gfx90a --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 16fbd1e0a..9d1bd598f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -56,13 +56,20 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; -#ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) +#ifdef __gfx90a__ result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#else + result = {0}; //currenlty unimplemented +#endif //end __gfx90a__ #elif defined(RP_USE_FLOAT) +#ifdef __gfx90a__ || __gfx908__ result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); -#endif -#endif +#else + result = {0}; //uncurrently unimplemented +#endif //end __gfx90a__ or __gfx908__ +#endif //end FLOAT vs DOBULE + a_idx += 4; // move four columns to the right b_idx += 4*Ne; // move four rows down } From bd47d35131b13876c97f840ee0ba7db8f46b6efc Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 29 Apr 2022 16:59:38 -0500 Subject: [PATCH 11/81] adding some ifdefs to call a seperate mfma kernel if hardware support not availble. 
kernel is currently empty --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 44 ++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 9d1bd598f..29dad102c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,8 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -37,7 +37,7 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ @@ -60,13 +60,13 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ #ifdef __gfx90a__ result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); #else - result = {0}; //currenlty unimplemented + result = {0}; //should never get here. #endif //end __gfx90a__ #elif defined(RP_USE_FLOAT) -#ifdef __gfx90a__ || __gfx908__ +#if defined(__gfx90a__) || defined(__gfx908__) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); #else - result = {0}; //uncurrently unimplemented + result = {0}; //should never get here #endif //end __gfx90a__ or __gfx908__ #endif //end FLOAT vs DOBULE @@ -85,8 +85,13 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ } } } - - +//Reference for cases with no hardware support +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + return; +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { @@ -97,6 +102,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, 4, 1); + hipDeviceProp_t devProp; + hipError_t err = hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -110,7 +119,23 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + if(hipArch == "gfx90a") + //Both FP32 and FP64 supported + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + else if(hipArch == "gfx908"){ +#if defined(RP_USE_FLOAT) + //Only FP32 supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); +#elif defined(RP_USE_DOUBLE) + //FP64 not supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + } +#endif + else + //Otherwise no mfma hardware support hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, A, B, D, N); hipErrchk( hipGetLastError() ); @@ -118,7 +143,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From fde4b655a4f3178674dd2b6ec270ff9f596fc28f Mon Sep 17 00:00:00 
2001 From: Corbin Andrew Robeck Date: Mon, 2 May 2022 23:26:58 -0500 Subject: [PATCH 12/81] adding reference basic mat-mat kernel for comparison when no mfma instructions are available --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 29dad102c..bc757242d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -90,6 +90,15 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ + constexpr int Ne = 16; + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x] = sum; return; } template < size_t block_size > @@ -101,7 +110,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, 4, 1); + dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipError_t err = hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); @@ -121,12 +131,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { if(hipArch == "gfx90a") //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, N); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, N); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 @@ -143,6 +153,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 717438238fd4d7d816da03cb8f1eff6032928bb7 Mon Sep 17 00:00:00 2001 From: Corbin Andrew Robeck Date: Tue, 3 May 2022 13:26:58 -0500 Subject: [PATCH 13/81] adding problem size support to base hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index bc757242d..e47e00c97 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -91,6 +91,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; @@ -98,8 +99,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D for (int k = 0; k < Ne; ++k) { sum += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x] = sum; - return; + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) @@ -153,7 +154,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { 
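 // Note: the Lambda_HIP and RAJA_HIP branches below are still empty placeholder
 // loops; only the Base_HIP variant exercises the matrix-core and fallback
 // kernels so far.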
MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From f5fe86145b35cf2288ef096bfed205e15007a786 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 13:26:58 -0500 Subject: [PATCH 14/81] adding problem size support to base hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index bc757242d..e47e00c97 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -91,6 +91,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; @@ -98,8 +99,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D for (int k = 0; k < Ne; ++k) { sum += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x] = sum; - return; + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) @@ -153,7 +154,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 799e17c04d6edc7801f8101ad2993be040cc4855 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:18:42 -0500 Subject: [PATCH 15/81] updating top level cmake file for new mfma test --- src/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bc1bf6b77..06a80317c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,6 +94,9 @@ blt_add_executable( basic/INIT_VIEW1D_OFFSET.cpp basic/INIT_VIEW1D_OFFSET-Seq.cpp basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + basic/MAT_FUSED_MUL_ADD.cpp + basic/MAT_FUSED_MUL_ADD-Seq.cpp + basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp basic/MAT_MAT_SHARED.cpp basic/MAT_MAT_SHARED-Seq.cpp basic/MAT_MAT_SHARED-OMPTarget.cpp From 451dee25dbe34406ea0f70ca4fbb01b14f27a871 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:19:25 -0500 Subject: [PATCH 16/81] updating cmake file to add mfma test to basic list --- src/basic/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index ceeb1a502..8d2cee87b 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -56,6 +56,12 @@ blt_add_library( INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + MAT_FUSED_MUL_ADD.cpp + MAT_FUSED_MUL_ADD-Seq.cpp + MAT_FUSED_MUL_ADD-Hip.cpp + MAT_FUSED_MUL_ADD-Cuda.cpp + MAT_FUSED_MUL_ADD-OMP.cpp + MAT_FUSED_MUL_ADD-OMPTarget.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp From c015117cf544b6699dcf1d3c86860a96a9796e0c Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:19:53 -0500 Subject: [PATCH 17/81] adding mfma test to raja perf suite infrastructure --- src/common/RAJAPerfSuite.cpp | 6 ++++++ src/common/RAJAPerfSuite.hpp | 1 + 2 files changed, 7 insertions(+) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 1bc10c31d..ecfc2f797 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -25,6 +25,7 @@ #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_FUSED_MUL_ADD.hpp" #include "basic/MAT_MAT_SHARED.hpp" 
#include "basic/MULADDSUB.hpp" #include "basic/NESTED_INIT.hpp" @@ -158,6 +159,7 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MAT_FUSED_MUL_ADD"), std::string("Basic_MAT_MAT_SHARED"), std::string("Basic_MULADDSUB"), std::string("Basic_NESTED_INIT"), @@ -535,6 +537,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INIT_VIEW1D_OFFSET(run_params); break; } + case Basic_MAT_FUSED_MUL_ADD : { + kernel = new basic::MAT_FUSED_MUL_ADD(run_params); + break; + } case Basic_MAT_MAT_SHARED : { kernel = new basic::MAT_MAT_SHARED(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fad672137..ea082325e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -78,6 +78,7 @@ enum KernelID { Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, + Basic_MAT_FUSED_MUL_ADD, Basic_MAT_MAT_SHARED, Basic_MULADDSUB, Basic_NESTED_INIT, From 6a6ccd7c2ed45794f8d7df8eac9eaadb5d08893b Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:20:47 -0500 Subject: [PATCH 18/81] adding mfma base and header files --- src/basic/MAT_FUSED_MUL_ADD.cpp | 84 +++++++++++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD.hpp | 88 +++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.hpp diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp new file mode 100644 index 000000000..957e09bfb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) + : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) +{ + m_N_default = 1024; + setDefaultProblemSize(m_N_default); + setDefaultReps(5); + + //Make sure problem target size is divisible by 16*16 + m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); + setActualProblemSize(m_N); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep(2*m_N*sizeof(Real_type)); + setFLOPsPerRep(2*m_N*m_Ne); + + + checksum_scale_factor = 1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + + + setVariantDefined(Base_Seq); + setVariantDefined(Lambda_Seq); + setVariantDefined(RAJA_Seq); + + setVariantDefined(Base_OpenMP); + setVariantDefined(Lambda_OpenMP); + setVariantDefined(RAJA_OpenMP); + + setVariantDefined(Base_CUDA); + setVariantDefined(Lambda_CUDA); + setVariantDefined(RAJA_CUDA); + + setVariantDefined(Base_HIP); + setVariantDefined(Lambda_HIP); + setVariantDefined(RAJA_HIP); +} + +MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} + +void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + allocAndInitDataConst(m_A, m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N, 0.0, vid); + +} + +void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_D, m_N, checksum_scale_factor ); +} + +void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + deallocData(m_A); + deallocData(m_B); + deallocData(m_D); + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp new file mode 100644 index 000000000..552590c1a --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -0,0 +1,88 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Compute D = A x B + C, where +// Inputs: +// A: N/(Ne*Ne) Ne x Ne matrices +// B: N/(Ne*Ne) Ne x Ne matrices +// Ouput: +// D: N/(Ne*Ne) Ne x Ne matrices +// All square row-major matrices, C is ignored. 
+// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(int row = 0; row != Ne; ++row){ +// for(int col = 0; col != Ne; ++col){ +// float dot = 0.0; +// int A_idx = row * Ne + ii*NeNe; +// int B_idx = col + ii*NeNe; +// for(int i = 0; i != Ne; ++i){ +// dot += A[A_idx] * B[B_idx]; +// ++A_idx; +// B_idx += Ne; +// } +// D[row * Ne + col] = dot; +// } +// } +// return D; +//} +//} +#ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP +#define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP + +#include "RAJA/RAJA.hpp" +#include "common/KernelBase.hpp" + + +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr D = m_D; + +namespace rajaperf { +class RunParams; + +namespace basic { + +class MAT_FUSED_MUL_ADD : public KernelBase { +public: + MAT_FUSED_MUL_ADD(const RunParams ¶ms); + + ~MAT_FUSED_MUL_ADD(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_A; + Real_ptr m_B; + Real_ptr m_D; + + Index_type m_N; + Index_type m_N_default; + static constexpr Index_type m_Ne = 16; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard From 43c1c6e413b70a0584b8d2dff8c27c6cf8cb40c9 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:21:27 -0500 Subject: [PATCH 19/81] adding mfma seq variant skeleton --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Seq.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp new file mode 100644 index 000000000..d1b11a9c1 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -0,0 +1,66 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // number of iterations + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { + + + startTimer(); + for (Index_type irep = 0; irep < run_reps; ++irep) { + } // irep + stopTimer(); + + break; + } + + case RAJA_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } +} + +} // end namespace basic +} // end namespace rajaperf From 2e759818b5ba9faf3a406a0e930b3997af5e22aa Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:22:02 -0500 Subject: [PATCH 20/81] adding mfma omp and omp offload variant skeleton structure --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 72 +++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 39 ++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMP.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp new file mode 100644 index 000000000..0651e3ccd --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -0,0 +1,72 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp new file mode 100644 index 000000000..e6c5933eb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + + + void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; + break; + } + } + } + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP From 518121981473f3c3b8c15acfa90a0fafddb1f386 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:22:16 -0500 Subject: [PATCH 21/81] adding mfma cuda variant skeleton --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 98 ++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp new file mode 100644 index 000000000..93503b2d7 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitCudaDeviceData(A, m_A, N); \ + allocAndInitCudaDeviceData(B, m_B, N); \ + allocAndInitCudaDeviceData(D, m_D, N); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_A, A, N); \ + getCudaDeviceData(m_B, B, N); \ + getCudaDeviceData(m_D, D, N); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(D); + +template < Index_type block_size > + __launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == Lambda_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA From ef451af849ac9f4f02832a5fbe74c26f242f1655 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:23:13 -0500 Subject: [PATCH 22/81] add inital set of HIP mfma varaint with builtin matrix core instructions --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 190 ++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Hip.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp new file mode 100644 index 000000000..e47e00c97 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -0,0 +1,190 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ + deallocHipDeviceData(A); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(D); + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ +constexpr Index_type Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + // compute a 16x16x16 matrix multiplication using a single wavefront. +#if defined(RP_USE_DOUBLE) + using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; + double4 result = {0}; +#elif defined(RP_USE_FLOAT) + using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; + float4 result = {0}; +#endif + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); + + for(int i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; + +#if defined(RP_USE_DOUBLE) +#ifdef __gfx90a__ + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#else + result = {0}; //should never get here. 
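+      // Note: the FP64 MFMA builtin is only available on gfx90a; on other targets this
+      // branch just keeps the kernel compilable, and the host-side dispatch in
+      // runHipVariantImpl launches the generic mat_fused_mul_add kernel instead, so it
+      // should never be reached at run time.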
+#endif //end __gfx90a__ +#elif defined(RP_USE_FLOAT) +#if defined(__gfx90a__) || defined(__gfx908__) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#else + result = {0}; //should never get here +#endif //end __gfx90a__ or __gfx908__ +#endif //end FLOAT vs DOBULE + + a_idx += 4; // move four columns to the right + b_idx += 4*Ne; // move four rows down + } + + #pragma unroll 4 + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + + i * Ne + + threadIdx.y * 4 * Ne + + ii*(Ne*Ne); + + D[d_idx] = result[i]; + } +} +} +//Reference for cases with no hardware support +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} +} +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDim(Ne, Ne, 1); + hipDeviceProp_t devProp; + hipError_t err = hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_HIP) { + + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if(hipArch == "gfx90a") + //Both FP32 and FP64 supported + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + A, B, D, N); + else if(hipArch == "gfx908"){ +#if defined(RP_USE_FLOAT) + //Only FP32 supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + A, B, D, N); +#elif defined(RP_USE_DOUBLE) + //FP64 not supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + } +#endif + else + //Otherwise no mfma hardware support + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == RAJA_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP From 0bd36f4d1840215d091203034a3906f823c34259 Mon Sep 17 00:00:00 2001 From: CRobeck Date: 
Wed, 4 May 2022 20:49:22 -0500 Subject: [PATCH 23/81] add mat_fused_mul_add cuda variant kernel --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 30 ++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 93503b2d7..657e5eea6 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -34,29 +34,47 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(D); -template < Index_type block_size > - __launch_bounds__(block_size) -__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, - Real_ptr D) { +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } - template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_CUDA) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + mat_fused_mul_add<<>>(A, B, D, N); } stopTimer(); From 3c906a0125567299932bade1033d94dfc9a60610 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:57:02 -0500 Subject: [PATCH 24/81] filling in seq MAT_FUSED_MUL_ADD seq variant --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index d1b11a9c1..176ef0676 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -27,6 +27,14 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY + } + } + } + } // number of iterations stopTimer(); @@ -36,9 +44,20 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( #if defined(RUN_RAJA_SEQ) case Lambda_Seq: { + auto mat_fused_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; startTimer(); for (Index_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + mat_fused_lam(ii,row,col); + } + } + } + } // irep stopTimer(); @@ -46,8 +65,19 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } case RAJA_Seq: { + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + startTimer(); 
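      // RAJA_Seq: the nested RAJA::forall<RAJA::loop_exec> calls added below mirror the
      // Base_Seq triple loop; loop_exec executes each level sequentially on the host.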
for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); } // loop over kernel reps stopTimer(); From 464380d89a72dab159a71fd121e29d9917c146b5 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:57:34 -0500 Subject: [PATCH 25/81] cleaning up MAT_FUSED_MUL_ADD macros a bit --- src/basic/MAT_FUSED_MUL_ADD.hpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 552590c1a..583c74a03 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -13,17 +13,17 @@ // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. // for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ -// for(int row = 0; row != Ne; ++row){ -// for(int col = 0; col != Ne; ++col){ -// float dot = 0.0; -// int A_idx = row * Ne + ii*NeNe; -// int B_idx = col + ii*NeNe; -// for(int i = 0; i != Ne; ++i){ -// dot += A[A_idx] * B[B_idx]; +// for(Index_type row = 0; row != Ne; ++row){ +// for(Index_type col = 0; col != Ne; ++col){ +// Real_type dot = 0.0; +// Index_type A_idx = row * Ne; +// Index_type B_idx = col; +// for(Index_type i = 0; i != Ne; ++i){ +// sum += A[A_idx] * B[B_idx]; // ++A_idx; // B_idx += Ne; // } -// D[row * Ne + col] = dot; +// D[row * Ne + col + ii*(Ne*Ne)] = sum; // } // } // return D; @@ -35,12 +35,22 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" - #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ Real_ptr D = m_D; +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type sum = 0.0; \ + Index_type A_idx = row * Ne; \ + Index_type B_idx = col; \ + for(Index_type i = 0; i != Ne; ++i){ \ + sum += A[A_idx] * B[B_idx]; \ + A_idx++; \ + B_idx += Ne; \ + } \ + D[row * Ne + col + ii*(Ne*Ne)] = sum; + namespace rajaperf { class RunParams; From ee3c695807642cb00e4531498ad99acf0c157b3a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:58:17 -0500 Subject: [PATCH 26/81] making MAT_FUSED_MUL_ADD cuda and hip variants more consistent --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 14 +++++++------- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 15 ++++++++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 657e5eea6..1d531c12d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,16 +39,16 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; +constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int x = threadIdx.x + blockIdx.x * blockDim.x; - int y = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - float sum = 0; - for (int k = 0; k < Ne; ++k) { - sum += A[y*Ne + k] * B[k*Ne + x]; + Real_type dot = 0; + for (Real_type k = 0; k < Ne; ++k) { + dot += A[row*Ne + k] * B[k*Ne + col]; } - D[y*Ne + x + ii*(Ne*Ne)] = sum; + D[row*Ne + col + ii*(Ne*Ne)] = dot; } } template < size_t block_size > diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index e47e00c97..2a59f4559 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ 
b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -92,14 +92,14 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int x = threadIdx.x + blockIdx.x * blockDim.x; - int y = threadIdx.y + blockIdx.y * blockDim.y; + int col = threadIdx.x + blockIdx.x * blockDim.x; + int row = threadIdx.y + blockIdx.y * blockDim.y; - float sum = 0; + float dot = 0; for (int k = 0; k < Ne; ++k) { - sum += A[y*Ne + k] * B[k*Ne + x]; + dot += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x + ii*(Ne*Ne)] = sum; + D[row*Ne + col + ii*(Ne*Ne)] = dot; } } template < size_t block_size > @@ -154,6 +154,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + for(int i = 0; i != N; ++i){ + printf("D[%d] = %f\n", i, m_D[i]); + } + + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 24279eb6302647cb6355d6cf032d94cac806649a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 16:34:55 -0500 Subject: [PATCH 27/81] fixing bug in MAT_FUSED_MUL_ADD Cuda kernel --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 1d531c12d..ce386fca9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,13 +39,13 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ -constexpr int Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int col = threadIdx.x + blockIdx.x * blockDim.x; + int row = threadIdx.y + blockIdx.y * blockDim.y; - Real_type dot = 0; - for (Real_type k = 0; k < Ne; ++k) { + float dot = 0; + for (int k = 0; k < Ne; ++k) { dot += A[row*Ne + k] * B[k*Ne + col]; } D[row*Ne + col + ii*(Ne*Ne)] = dot; From 75c7a07828cca1f3dde13cc33229208d0ac27b3d Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 16:36:03 -0500 Subject: [PATCH 28/81] making MAT_FUSED_MUL_ADD naming consistent --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2a59f4559..eb7d5d5f9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -97,7 +97,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ float dot = 0; for (int k = 0; k < Ne; ++k) { - dot += A[y*Ne + k] * B[k*Ne + x]; + dot += A[row*Ne + k] * B[k*Ne + col]; } D[row*Ne + col + ii*(Ne*Ne)] = dot; } From 75875b9e44153b3e3ef42e3b8fe0d8226f71684f Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:03:49 -0500 Subject: [PATCH 29/81] updating mat fused body and ading raja lam variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 10 ++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 53 +++++++++++++++++----------- src/basic/MAT_FUSED_MUL_ADD.hpp | 18 ++++------ 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index ce386fca9..4f50efeda 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,14 +41,10 @@ __global__ 
void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int col = threadIdx.x + blockIdx.x * blockDim.x; - int row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - float dot = 0; - for (int k = 0; k < Ne; ++k) { - dot += A[row*Ne + k] * B[k*Ne + col]; - } - D[row*Ne + col + ii*(Ne*Ne)] = dot; + MAT_FUSED_MUL_ADD_BODY; } } template < size_t block_size > diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index eb7d5d5f9..63ddd02c0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -92,20 +92,28 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int col = threadIdx.x + blockIdx.x * blockDim.x; - int row = threadIdx.y + blockIdx.y * blockDim.y; - - float dot = 0; - for (int k = 0; k < Ne; ++k) { - dot += A[row*Ne + k] * B[k*Ne + col]; - } - D[row*Ne + col + ii*(Ne*Ne)] = dot; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + MAT_FUSED_MUL_ADD_BODY; } } +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(Index_type N, Lambda body) +{ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); +// const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; @@ -133,50 +141,53 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) if(hipArch == "gfx90a") //Both FP32 and FP64 supported hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, N); + A, B, D, iend); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, N); + A, B, D, iend); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, N); + A, B, D, iend); } #endif else //Otherwise no mfma hardware support hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, N); + A, B, D, iend); hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - for(int i = 0; i != N; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); - } - } else if (vid == Lambda_HIP) { - + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + hipLaunchKernelGGL((mat_fused_lam), + dim3(gridDim), dim3(blockDim), 0, 0, + iend, mat_fused_lamda); + hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; - 
startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - } // loop over kernel reps + stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 583c74a03..396702d84 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -19,7 +19,7 @@ // Index_type A_idx = row * Ne; // Index_type B_idx = col; // for(Index_type i = 0; i != Ne; ++i){ -// sum += A[A_idx] * B[B_idx]; +// dot += A[A_idx] * B[B_idx]; // ++A_idx; // B_idx += Ne; // } @@ -40,16 +40,12 @@ Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - Real_type sum = 0.0; \ - Index_type A_idx = row * Ne; \ - Index_type B_idx = col; \ - for(Index_type i = 0; i != Ne; ++i){ \ - sum += A[A_idx] * B[B_idx]; \ - A_idx++; \ - B_idx += Ne; \ - } \ - D[row * Ne + col + ii*(Ne*Ne)] = sum; +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k] * B[k*Ne + col]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 2a46e307a829d0466f868a9905d5209042f21913 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:30:16 -0500 Subject: [PATCH 30/81] add data set up to mat_fused_lam --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 63ddd02c0..7eacf2af0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -166,6 +166,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -183,6 +187,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == RAJA_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From a4328c8390f5dfe3c5372aca4fb631a2b3f55b1a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:56:26 -0500 Subject: [PATCH 31/81] moving MAT_FUSED_MUL_ADD array init into macro --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 ++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 +++-------- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 4 +++- src/basic/MAT_FUSED_MUL_ADD.hpp | 6 ++++++ 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 4f50efeda..98aeef388 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -60,11 +60,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; + if (vid == Base_CUDA) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 7eacf2af0..f7567376d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -128,12 +128,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP; + 
MAT_FUSED_MUL_ADD_DATA_INIT; + if (vid == Base_HIP) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -166,10 +164,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index 176ef0676..d142f38b0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -17,10 +17,12 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; switch (vid) { case Base_Seq: { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 396702d84..87580cb3b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -35,6 +35,12 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +} + #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ From be2430fc60eb92e64c270a557044265fb68d8703 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 16:11:20 -0500 Subject: [PATCH 32/81] fixing missing ; --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index d142f38b0..842966146 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -32,7 +32,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ - MAT_FUSED_MUL_ADD_BODY + MAT_FUSED_MUL_ADD_BODY; } } } From 1ff6f933dbc00e3209980845bee632df2db203bd Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 09:35:21 -0500 Subject: [PATCH 33/81] finish filling in RAJA_HIP MAT_FUSED_MUL_ADD variant --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index f7567376d..d5cc299e8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -184,10 +184,35 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == RAJA_HIP) { + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; - startTimer(); + startTimer(); + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = + 
RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<1, RAJA::hip_thread_y_direct, + RAJA::statement::For<0, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), + [=] RAJA_DEVICE (int ii, int col, int row) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; From 9dcd3d84290070e2904813065e65440f3d0c4f5a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 09:59:51 -0500 Subject: [PATCH 34/81] finish filling in RAJA_CUDA MAT_FUSED_MUL_ADD variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 46 ++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 98aeef388..4714fde59 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -47,6 +47,17 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ MAT_FUSED_MUL_ADD_BODY; } } +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(Index_type N, Lambda body) +{ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { @@ -80,22 +91,53 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + mat_fused_lam + <<>>(N, mat_fused_lamda); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; } else if (vid == RAJA_CUDA) { + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - } // loop over kernel reps + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), + [=] RAJA_DEVICE (int ii, int col, int row) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + } else { getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid << std::endl; From 66d9756b84e0af05320ba4d5282f8557a8c23c5a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 12:46:22 -0500 Subject: [PATCH 35/81] filling in MAT_FUSED_MUL_ADD OMP variants --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 34 +++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 92 +++++++++++++++++++++-- src/basic/MAT_FUSED_MUL_ADD.hpp | 24 ++---- 3 files changed, 125 insertions(+), 25 deletions(-) diff --git 
a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 0651e3ccd..22a83fcc1 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -31,6 +31,15 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + } stopTimer(); @@ -39,9 +48,22 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } case Lambda_OpenMP: { + auto mat_fused_base_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + mat_fused_base_lam(ii, row, col); + } + } + } + + } stopTimer(); @@ -49,8 +71,20 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } case RAJA_OpenMP: { + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); } // loop over kernel reps stopTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index e6c5933eb..801747766 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -19,18 +19,94 @@ namespace rajaperf { namespace basic { + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; - void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - const Index_type run_reps = getRunReps(); +#define MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ + allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); - switch (vid) { +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_A, A, N, hid, did); \ + getOpenMPDeviceData(m_B, B, N, hid, did); \ + getOpenMPDeviceData(m_D, D, N, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ + deallocOpenMPDeviceData(D, did); - default: { - getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; - break; - } - } +void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp target is_device_ptr(A, B, D) device( did ) + #pragma omp teams distribute 
parallel for schedule(static, 1) collapse(2) + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + } + + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + + + using EXEC_POL = + RAJA::KernelPolicy< + + RAJA::statement::For<0, RAJA::seq_exec, // ii + RAJA::statement::Collapse, // row, col + RAJA::statement::Lambda<0> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(ii_range, + row_range, + col_range), + [=] (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }); + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OMP Target variant id = " << vid << std::endl; + } +} } } // end namespace basic diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 87580cb3b..52ce76bd2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -12,23 +12,13 @@ // Ouput: // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. -// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ -// for(Index_type row = 0; row != Ne; ++row){ -// for(Index_type col = 0; col != Ne; ++col){ -// Real_type dot = 0.0; -// Index_type A_idx = row * Ne; -// Index_type B_idx = col; -// for(Index_type i = 0; i != Ne; ++i){ -// dot += A[A_idx] * B[B_idx]; -// ++A_idx; -// B_idx += Ne; -// } -// D[row * Ne + col + ii*(Ne*Ne)] = sum; -// } -// } -// return D; -//} -//} +//for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(Index_type row = 0; row != Ne; ++row){ +// for(Index_type col = 0; col != Ne; ++col){ +// MAT_FUSED_MUL_ADD_BODY; +// } +// } +//} #ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP #define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP From 73d745a749951ee3cb77239807b074b80931a6fc Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 10 May 2022 13:54:06 -0400 Subject: [PATCH 36/81] add MAT_FUSED_MUL_ADD_DATA_INIT --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 22a83fcc1..0611f5640 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -24,6 +24,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; switch (vid) { From 9c63d3bdd3fcdd604ff293edf6446106dc38637e Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 10 May 2022 13:54:58 -0400 Subject: [PATCH 37/81] add MAT_FUSED_MUL_ADD_DATA_INIT --- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 801747766..313b8e391 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -51,6 +51,8 @@ void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; + if ( vid == Base_OpenMPTarget ) { 
MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; From ca7b81021411be9e555d421c503746a6bd483bff Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 08:49:11 -0500 Subject: [PATCH 38/81] fixing some unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index d5cc299e8..72baf4f78 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,6 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -52,7 +50,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); - for(int i = 0; i < 4; ++i){ + for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; @@ -90,7 +88,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -101,7 +99,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -122,7 +120,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 blockDimBuiltin(Ne, Ne/4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; - hipError_t err = hipGetDeviceProperties(&devProp, 0); + hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); std::string hipArch = gcnArchName.substr(0, 6); @@ -184,8 +182,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == RAJA_HIP) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; @@ -210,7 +206,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > >; RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (int ii, int col, int row) { + [=] RAJA_DEVICE (Index_type ii, Index_type col, Index_type row) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From 02633eafd4f59347b3aab200d7cf39946aa237f3 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 11:35:47 -0500 Subject: [PATCH 39/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 67 +++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 72baf4f78..81d0a3085 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -33,53 +33,35 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(D); -template < Index_type block_size > -__launch_bounds__(block_size) -__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr 
D, - Index_type N){ -constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - // compute a 16x16x16 matrix multiplication using a single wavefront. -#if defined(RP_USE_DOUBLE) - using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; - double4 result = {0}; -#elif defined(RP_USE_FLOAT) - using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; - float4 result = {0}; -#endif - Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); - Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ + constexpr Index_type Ne = 16; + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. + using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; + real4 result = {0}; - for(Index_type i = 0; i < 4; ++i){ - Real_type a = A[a_idx]; - Real_type b = B[b_idx]; + int a_idx = Ne * threadIdx.x + threadIdx.y; + int b_idx = threadIdx.x + Ne * threadIdx.y; -#if defined(RP_USE_DOUBLE) + for(int i = 0; i < 4; ++i){ + double a = A[a_idx]; + double b = B[b_idx]; #ifdef __gfx90a__ - result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); -#else - result = {0}; //should never get here. -#endif //end __gfx90a__ +#if defined(RP_USE_DOUBLE) + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); #elif defined(RP_USE_FLOAT) -#if defined(__gfx90a__) || defined(__gfx908__) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); -#else - result = {0}; //should never get here -#endif //end __gfx90a__ or __gfx908__ -#endif //end FLOAT vs DOBULE - - a_idx += 4; // move four columns to the right - b_idx += 4*Ne; // move four rows down +#endif +#endif + a_idx += 4; // move two columns to the right + b_idx += 4*Ne; // move two rows down } #pragma unroll 4 - for(Index_type i = 0; i < 4; ++i){ - const Index_type d_idx = threadIdx.x - + i * Ne - + threadIdx.y * 4 * Ne - + ii*(Ne*Ne); - - D[d_idx] = result[i]; + for(int i = 0; i < 4; ++i){ + const int d_idx = threadIdx.x + + Ne * (threadIdx.y + 4 * i); + D[d_idx + ii*(Ne*Ne)] = result[i]; } } } @@ -117,7 +99,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDimBuiltin(16, 4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); @@ -136,12 +118,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { if(hipArch == "gfx90a") //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 @@ -158,7 +140,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); From 
f1fb1abdc0a0de3c5436a3d1082091ce38c9ac61 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 13:11:47 -0500 Subject: [PATCH 40/81] fix incorrect order of raja_hip segments --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 81d0a3085..59200a668 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -99,7 +99,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(16, 4, 1); + dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); @@ -167,9 +167,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::HipKernel< @@ -186,8 +187,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (Index_type ii, Index_type col, Index_type row) { + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From faf046fb242d328917f6f05895d0817a1d8d5f8d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 13:28:42 -0500 Subject: [PATCH 41/81] fix incorrect order of raja_cuda segments --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 4714fde59..cc71c9307 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -129,8 +129,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (int ii, int col, int row) { + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From 73bd85e6e7c2161e8cc233ef15c5823e0a866756 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 16 May 2022 16:02:14 -0400 Subject: [PATCH 42/81] update data types --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 59200a668..ed84c9ec8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -40,12 +40,12 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - int a_idx = Ne * threadIdx.x + threadIdx.y; - int b_idx = threadIdx.x + Ne * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y; + Index_type b_idx = threadIdx.x + Ne * threadIdx.y; - for(int i = 0; i < 4; ++i){ - double a = A[a_idx]; - double b = B[b_idx]; + for(Index_type i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; #ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) 
result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); @@ -58,8 +58,8 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re } #pragma unroll 4 - for(int i = 0; i < 4; ++i){ - const int d_idx = threadIdx.x + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); D[d_idx + ii*(Ne*Ne)] = result[i]; } From 4f7dac37a0506fe5b8ac4b3ddd985838371ceeea Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:41:21 -0500 Subject: [PATCH 43/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 52ce76bd2..bef4a8660 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -39,7 +39,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ #define MAT_FUSED_MUL_ADD_BODY \ Real_type dot = 0; \ for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k] * B[k*Ne + col]; \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ } \ D[row*Ne + col + ii*(Ne*Ne)] = dot; \ From 8e119d7053b5c6ad26a143c80dc0574413fa188e Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:42:03 -0500 Subject: [PATCH 44/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 59200a668..f53f01cdb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -33,15 +33,15 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(D); -__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. 
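   // Note: real4 below uses the GCC/Clang __vector_size__ attribute to declare a
   // packed vector of four Real_type values, i.e. the per-lane share of the 16x16
   // output tile that the MFMA builtins read and write. The kernel assumes a 16x4
   // thread block (a single 64-lane wavefront), so the 64 lanes together cover one
   // 16x16 tile per iteration.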
using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - int a_idx = Ne * threadIdx.x + threadIdx.y; - int b_idx = threadIdx.x + Ne * threadIdx.y; + int a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + int b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(int i = 0; i < 4; ++i){ double a = A[a_idx]; @@ -52,6 +52,12 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re #elif defined(RP_USE_FLOAT) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); #endif +#endif + +#ifdef __gfx908__ +#if defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif #endif a_idx += 4; // move two columns to the right b_idx += 4*Ne; // move two rows down @@ -69,17 +75,17 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, - Index_type N){ + const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); + Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); MAT_FUSED_MUL_ADD_BODY; } } template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void mat_fused_lam(Index_type N, Lambda body) +__global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ From e7ef2b421697d5d2e1d14490717a450073f514e1 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:51:03 -0500 Subject: [PATCH 45/81] fix one more indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index cfa3d4e4a..889e80359 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -40,8 +40,8 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - Index_type a_idx = Ne * threadIdx.x + threadIdx.y; - Index_type b_idx = threadIdx.x + Ne * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; From 78e259080db9b50220f153063f6474ce5e1718d8 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 09:36:39 -0500 Subject: [PATCH 46/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index cc71c9307..91b041382 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,8 +41,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x + 
ii*(Ne*Ne); + Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); MAT_FUSED_MUL_ADD_BODY; } From feadc6482b7aa2e98891cdc2026337568b078f47 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 14:14:05 -0500 Subject: [PATCH 47/81] adding tunings for builtin version of mfma hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 143 +++++++++++++++++++++++----- src/basic/MAT_FUSED_MUL_ADD.hpp | 1 + 2 files changed, 120 insertions(+), 24 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 889e80359..45fd0489f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -46,6 +46,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; + #ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); @@ -94,6 +95,41 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ body(ii,col,row); } } +void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) +{ + const Index_type run_reps = getRunReps(); +// const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDimBuiltin(Ne, 4, 1); + dim3 blockDim(Ne, Ne, 1); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { @@ -107,10 +143,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 gridDim (1, 1, 1); dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -122,25 +154,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if(hipArch == "gfx90a") - //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, iend); - else if(hipArch == "gfx908"){ -#if defined(RP_USE_FLOAT) - //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, iend); -#elif defined(RP_USE_DOUBLE) - //FP64 not supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, iend); - } -#endif - else - //Otherwise no mfma hardware support - hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, iend); + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, A, B, D, iend); hipErrchk( hipGetLastError() ); } stopTimer(); @@ -206,8 +220,89 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) << std::endl; } } +std::string getArch() +{ + hipDeviceProp_t devProp; + 
hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + return hipArch; +} +bool builtinSupported() +{ + std::string hipArch = getArch(); +#if defined(RP_USE_DOUBLE) + if (hipArch=="gfx90a") + return true; +#endif +#if defined(RP_USE_FLOAT) + if (hipArch=="gfx90a" || hipArch=="gfx908") + return true; +#endif +return false; +} +void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) +{ + bool builtin_supported = builtinSupported(); + + size_t t = 0; + if ( vid == Base_HIP && builtin_supported) { + + if (tune_idx == t) { + + runHipVariantBuiltin(vid); + + } + + t += 1; + } + if ( (vid == Base_HIP) || (vid == RAJA_HIP) || (vid == Lambda_HIP)){ + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0 || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + } + else { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MAT_FUSED_MUL_ADD::setHipTuningDefinitions(VariantID vid) +{ + bool builtin_supported = builtinSupported(); + if ( vid == Base_HIP ) { + + if (builtin_supported) { + addVariantTuningName(vid, "builtin"); + } + } + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + + }); + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index bef4a8660..74beb78d6 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -70,6 +70,7 @@ class MAT_FUSED_MUL_ADD : public KernelBase { void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + void runHipVariantBuiltin(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 900671ab1916b319f0fd81fa54a06b38410d25eb Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 08:09:45 -0500 Subject: [PATCH 48/81] fix one more indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 91b041382..b60b6eb95 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,7 +39,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); @@ -51,7 +51,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; From 162a8557f92903c154c9647d10969e3a0b7ac8b0 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 
08:13:42 -0500 Subject: [PATCH 49/81] clean up some comments and unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 45fd0489f..2d7e33f05 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -36,7 +36,6 @@ namespace basic { __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; @@ -134,14 +133,12 @@ template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); -// const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; From fbf2820d7a23dad6cd02de31a8a75a9194bcd4da Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:26:23 -0500 Subject: [PATCH 50/81] hopefully fixing final indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index b60b6eb95..6478a1aa4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,8 +41,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); - Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; } diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2d7e33f05..6a3506ed2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -78,8 +78,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); - Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; } } From e1ebcdd691173a3b5a4ce72a0a577379f223f61d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:29:00 -0500 Subject: [PATCH 51/81] removing some unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 6a3506ed2..10cca500f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -97,14 +97,12 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) { const Index_type run_reps = 
getRunReps(); -// const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; From f02da326c27bd0e057b76d4d3806fb58b6b241e0 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:53:10 -0500 Subject: [PATCH 52/81] cleaning up block/grid defs --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 +++++++----- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 12 ++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 6478a1aa4..11eab1b79 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -67,8 +67,12 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); + MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -104,9 +108,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; } else if (vid == RAJA_CUDA) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); - + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 10cca500f..fc71cc68b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -103,7 +103,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); + dim3 blockDimBuiltin(Ne, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -136,9 +136,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); - + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -156,8 +158,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == Lambda_HIP) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 59f6766ce1be979eca22a916c3dfcd5d6b75cb3a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 11:00:09 -0500 Subject: [PATCH 53/81] cleaning up block/grid defs further --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 5 +++-- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 11eab1b79..ad084045b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -67,8 +67,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - constexpr Index_type tile_size = 
gpu_block_size::sqrt(block_size); - dim3 blockDim(tile_size, tile_size); + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(1)); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index fc71cc68b..1df7b7ba2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -136,8 +136,9 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); - dim3 blockDim(tile_size, tile_size); + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(1)); From 89a8c007acdee2081acff3524835961d7b2a60e7 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:35:52 -0400 Subject: [PATCH 54/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index ad084045b..a78bc3487 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,11 +39,10 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - MAT_FUSED_MUL_ADD_BODY; } } @@ -51,7 +50,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -61,7 +60,6 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { - const Index_type run_reps = getRunReps(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; From 1156a665c3bb565d2487e92fbed9bc7ae03bb3a7 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:38:06 -0400 Subject: [PATCH 55/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 1df7b7ba2..607972654 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -76,7 +76,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ 
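  // Each thread computes one (row, col) entry of a 16x16 tile; the surrounding ii
  // loop is serial per thread and walks the batch of Ne x Ne matrices, with
  // MAT_FUSED_MUL_ADD_BODY doing the length-Ne dot product for matrix ii.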
Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -87,7 +87,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(const Index_type N, Lambda body) { - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; From fd9d80a085aaa13fd30db50e57482f954615e453 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:40:39 -0400 Subject: [PATCH 56/81] spacing --- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 313b8e391..4847a05c5 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -25,21 +25,21 @@ namespace basic { const size_t threads_per_team = 256; #define MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ - const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ - allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_A, A, N, hid, did); \ - getOpenMPDeviceData(m_B, B, N, hid, did); \ - getOpenMPDeviceData(m_D, D, N, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); \ + getOpenMPDeviceData(m_A, A, N, hid, did); \ + getOpenMPDeviceData(m_B, B, N, hid, did); \ + getOpenMPDeviceData(m_D, D, N, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ deallocOpenMPDeviceData(D, did); From c7b35a90b211be6418cb1842579a5fa0eeb42dbe Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:42:24 -0400 Subject: [PATCH 57/81] spacing --- src/basic/MAT_FUSED_MUL_ADD.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 74beb78d6..f5bee524d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -25,23 +25,23 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" -#define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } -#define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - 
Real_type dot = 0; \ - for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ - } \ - D[row*Ne + col + ii*(Ne*Ne)] = dot; \ +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 2d591b283ff7d62362e54db81c8057a06fcfe154 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 12:51:43 -0500 Subject: [PATCH 58/81] moving function to unnamed namespace --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 1df7b7ba2..495b6a6a3 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -216,6 +216,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) << std::endl; } } +namespace{ std::string getArch() { hipDeviceProp_t devProp; @@ -237,6 +238,7 @@ bool builtinSupported() #endif return false; } +} void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) { bool builtin_supported = builtinSupported(); From 18622fb1cc4ec750b2f8d3f08fcd331d483b82ae Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:17:18 -0500 Subject: [PATCH 59/81] fixing undefined var --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 0611f5640..6892b3ddb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -21,6 +21,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type run_reps = getRunReps(); const Index_type N = m_N; const Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; MAT_FUSED_MUL_ADD_DATA_SETUP; From 23b9277fab0e3975dd33288a419b1f8b267e37bc Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:40:25 -0500 Subject: [PATCH 60/81] fixing omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 6892b3ddb..9cc62bf30 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -19,6 +19,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); const Index_type N = m_N; const Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; @@ -32,9 +33,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A case Base_OpenMP: { startTimer(); + + Index_type ii_end = (N/(Ne*Ne); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != ii_end; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; From 5f46c0e88ffc25dad64b3207b8dbe62c46699811 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:40:52 -0500 Subject: [PATCH 61/81] fixing omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 9cc62bf30..b8610f1f4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -34,7 +34,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); - Index_type ii_end = (N/(Ne*Ne); + Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for for(Index_type ii = 0; ii != ii_end; ++ii){ From 8f0e5124e73d75dc7aede2dd0c6c0aeb8292fa1b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 14:14:15 -0500 Subject: [PATCH 62/81] fixing one more omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index b8610f1f4..5b501d33c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -58,9 +58,10 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A }; startTimer(); + Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != ii_end; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ mat_fused_base_lam(ii, row, col); From 8f0799651ee4b29cc609477c842110d247f3c657 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 20 May 2022 14:34:23 -0500 Subject: [PATCH 63/81] omp requires canonical loop forms --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 5b501d33c..5fe03bd90 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -23,7 +23,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type N = m_N; const Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - + const Index_type ii_end = (N/(Ne*Ne)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -33,13 +33,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A case Base_OpenMP: { startTimer(); - - Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != ii_end; ++ii){ - for(Index_type row = 0; row != Ne; ++row){ - for(Index_type col = 0; col != Ne; ++col){ + for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; } } @@ -58,12 +56,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A }; startTimer(); - Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != ii_end; ++ii){ - for(Index_type row = 0; row != Ne; ++row){ - for(Index_type col = 0; col != Ne; ++col){ + for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ mat_fused_base_lam(ii, row, col); } } From c94586fd856e77dd87091abc6421b218eb845261 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 21:09:19 -0400 Subject: [PATCH 64/81] update default problem size 
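The default problem size is the total number of matrix elements in the batch, not a
matrix dimension, so it is set from m_N_default*m_N_default. Since the batch is built
from independent 16x16 matrices, the size that actually runs is expected to be a
multiple of Ne*Ne = 256. A rough sketch of the resulting batch count (assuming the
suite rounds the requested size to that multiple):

    N      = 1000 * 1000;     // requested total elements
    N_Elem = N / (16 * 16);   // roughly 3906 independent 16x16 multiplies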
--- src/basic/MAT_FUSED_MUL_ADD.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 957e09bfb..f7e40d3a8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,8 +20,8 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 1024; - setDefaultProblemSize(m_N_default); + m_N_default = 1000; + setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); //Make sure problem target size is divisible by 16*16 From 43e277b838d4eb8e8f465a67188b617d64601ef1 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 6 Jun 2022 10:01:29 -0400 Subject: [PATCH 65/81] update spacing --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 01d5a428e..46cb6b2e3 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -227,14 +227,14 @@ std::string getArch() } bool builtinSupported() { - std::string hipArch = getArch(); + std::string hipArch = getArch(); #if defined(RP_USE_DOUBLE) - if (hipArch=="gfx90a") - return true; + if (hipArch=="gfx90a") + return true; #endif #if defined(RP_USE_FLOAT) - if (hipArch=="gfx90a" || hipArch=="gfx908") - return true; + if (hipArch=="gfx90a" || hipArch=="gfx908") + return true; #endif return false; } @@ -244,7 +244,7 @@ void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) bool builtin_supported = builtinSupported(); size_t t = 0; - if ( vid == Base_HIP && builtin_supported) { + if ( vid == Base_HIP && builtin_supported) { if (tune_idx == t) { @@ -283,14 +283,14 @@ void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) void MAT_FUSED_MUL_ADD::setHipTuningDefinitions(VariantID vid) { - bool builtin_supported = builtinSupported(); - if ( vid == Base_HIP ) { + bool builtin_supported = builtinSupported(); + if ( vid == Base_HIP ) { - if (builtin_supported) { - addVariantTuningName(vid, "builtin"); - } - } - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (builtin_supported) { + addVariantTuningName(vid, "builtin"); + } + } + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { From 44c36b4e01d3b60190d2481a6e4c834e74af8d8f Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 10:53:01 -0500 Subject: [PATCH 66/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index a78bc3487..c739a0b16 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -52,10 +52,10 @@ __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - body(ii,col,row); - } + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } } template < size_t block_size > void 
MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 46cb6b2e3..9dfb25af9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -89,9 +89,9 @@ __global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - body(ii,col,row); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); } } void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) @@ -172,7 +172,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((mat_fused_lam), dim3(gridDim), dim3(blockDim), 0, 0, iend, mat_fused_lamda); - hipErrchk( hipGetLastError() ); + hipErrchk( hipGetLastError() ); } stopTimer(); @@ -203,10 +203,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), - [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { - MAT_FUSED_MUL_ADD_BODY; - }); + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; From 8e3fbf68f64ef782eefcdfcd907f42f27750046f Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 13:43:29 -0500 Subject: [PATCH 67/81] formating clean up --- src/basic/MAT_FUSED_MUL_ADD.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index f5bee524d..5cb193822 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -27,13 +27,13 @@ #define MAT_FUSED_MUL_ADD_DATA_INIT \ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } #define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; #define MAT_FUSED_MUL_ADD_BODY \ From bd2d44c159d272210e61201d8ab0c80c3f93c76d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 20:10:32 -0500 Subject: [PATCH 68/81] move getHipArch into common header file as a static free function --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 ++--------- src/common/HipDataUtils.hpp | 9 ++++++++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 9dfb25af9..17368da20 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -217,17 +217,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } } namespace{ -std::string getArch() -{ - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); - return hipArch; -} + bool builtinSupported() { - std::string hipArch = getArch(); + std::string hipArch = getHipArch(); #if defined(RP_USE_DOUBLE) if (hipArch=="gfx90a") return true; 
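For context on the substr(0, 6) in getHipArch(): on recent ROCm releases the
gcnArchName device property typically carries target feature flags, e.g.
"gfx90a:sramecc+:xnack-", so only the leading six characters ("gfx90a", "gfx908")
are compared when deciding whether the MFMA builtins are available.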
diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index a3871d31e..e9855ae9c 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -22,7 +22,6 @@ #include "RAJA/policy/hip/raja_hiperrchk.hpp" - namespace rajaperf { @@ -178,6 +177,14 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } +static std::string getHipArch() +{ + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + return hipArch; +} } // closing brace for rajaperf namespace From 49084efa42584a9c037ced8efd74f34d7f83c15d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:12:39 -0500 Subject: [PATCH 69/81] update out loop for raja hip/cuda variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 +++--- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index c739a0b16..85fc8ab37 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -117,9 +117,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::loop_exec, - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 17368da20..a4eece3a8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -190,9 +190,9 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::loop_exec, - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> From 5169e6e4db3242fa36bbe7092bb84a7a89446f29 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:25:20 -0500 Subject: [PATCH 70/81] update unroll pragma to RAJAPERF_UNROLL --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index a4eece3a8..dbf926933 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -63,7 +63,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re b_idx += 4*Ne; // move two rows down } - #pragma unroll 4 + RAJAPERF_UNROLL(4) for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); From a400d9303028631b04b28eccab1a430909a0d340 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:42:07 -0500 Subject: 
[PATCH 71/81] fix RAJAPERF_UNROLL def --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index dbf926933..54a332f3b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -63,7 +63,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re b_idx += 4*Ne; // move two rows down } - RAJAPERF_UNROLL(4) + RAJA_UNROLL_COUNT(4) for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); From 5bc407fcd4e22b2a5622545436f04164f7528e91 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 10:15:14 -0500 Subject: [PATCH 72/81] fix type in cuda variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 85fc8ab37..0f51f8cb4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -117,7 +117,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::For<2, RAJA::cuda_block_z_loop, RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, From 3186df1388f3cfc13fd9967b530a25ae819e683a Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 20 Jun 2022 16:07:56 -0400 Subject: [PATCH 73/81] Update MAT_FUSED_MUL_ADD.hpp fix spacing --- src/basic/MAT_FUSED_MUL_ADD.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 5cb193822..8d92a2f13 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -25,23 +25,23 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" -#define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } -#define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - Real_type dot = 0; \ - for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ - } \ - D[row*Ne + col + ii*(Ne*Ne)] = dot; \ +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 9eb3ce4d13c1a1ed087736d9869dfaa7342c6dd4 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 15:05:42 -0500 Subject: [PATCH 74/81] cleaning up some variable naming and create a N_Elem var --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 10 ++++++---- 
src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 15 +++++++++------ src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 7 +++---- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 6 +++--- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD.hpp | 12 ++++++------ 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 0f51f8cb4..49b79f38d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -40,7 +40,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_Type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; @@ -51,7 +52,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_Type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; body(ii,col,row); @@ -62,8 +64,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; + const Index_Type N_Elem = N/(Ne*Ne); constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size); @@ -113,7 +115,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) startTimer(); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 54a332f3b..b8a0d286b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -35,7 +35,8 @@ namespace basic { __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + const Index_type N_Elem = N/(Ne*Ne); + for(Index_type ii = 0; ii != N_Elem; ++ii){ using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; @@ -77,7 +78,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; @@ -88,7 +90,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; body(ii,col,row); @@ 
-100,7 +103,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); dim3 gridDim (1, 1, 1); dim3 blockDimBuiltin(Ne, 4, 1); @@ -134,7 +137,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size); @@ -183,7 +186,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 5fe03bd90..868c5aa99 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -22,8 +22,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type iend = getActualProblemSize(); const Index_type N = m_N; const Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; - const Index_type ii_end = (N/(Ne*Ne)); + const Index_type N_Elem = (N/(Ne*Ne)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -35,7 +34,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type ii = 0; ii < N_Elem; ++ii){ for(Index_type row = 0; row < Ne; ++row){ for(Index_type col = 0; col < Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -58,7 +57,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type ii = 0; ii < N_Elem; ++ii){ for(Index_type row = 0; row < Ne; ++row){ for(Index_type col = 0; col < Ne; ++col){ mat_fused_base_lam(ii, row, col); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 4847a05c5..18e7a140e 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -29,7 +29,7 @@ namespace basic { int did = omp_get_default_device(); \ const Index_type N = m_N; \ constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ + const Index_type N_Elem = (N/(Ne*Ne); \ allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); @@ -61,7 +61,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(A, B, D) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -80,7 +80,7 @@ void 
MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); using EXEC_POL = diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index 842966146..14f783f66 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -18,7 +18,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( const Index_type run_reps = getRunReps(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -29,7 +29,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -52,7 +52,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (Index_type irep = 0; irep < run_reps; ++irep) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ mat_fused_lam(ii,row,col); @@ -69,7 +69,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( case RAJA_Seq: { RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 8d92a2f13..01aae2601 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -7,12 +7,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Compute D = A x B + C, where // Inputs: -// A: N/(Ne*Ne) Ne x Ne matrices -// B: N/(Ne*Ne) Ne x Ne matrices +// A: N_Elem x (Ne x Ne) matrices +// B: N_Elem x (Ne x Ne) matrices // Ouput: // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. 
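// Example (illustrative values only): with Ne = 16 and N = 65536 total elements,
// N_Elem = N/(Ne*Ne) = 256 independent 16x16 multiplies, and matrix ii occupies
// the flat row-major index range [ii*Ne*Ne, (ii+1)*Ne*Ne).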
-//for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +//for(Index_type ii = 0; ii != N_Elem; ++ii){ // for(Index_type row = 0; row != Ne; ++row){ // for(Index_type col = 0; col != Ne; ++col){ // MAT_FUSED_MUL_ADD_BODY; @@ -26,9 +26,9 @@ #include "common/KernelBase.hpp" #define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +for(Index_type ii = 0; ii != N_Elem; ++ii){ \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_A[i+(ii*Ne*Ne)] = i; } \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_B[i+(ii*Ne*Ne)] = (Ne*Ne) - 1 - i; } \ } #define MAT_FUSED_MUL_ADD_DATA_SETUP \ From f8c1f2c175abe0a398685396b39a8f40f18be220 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:15:33 -0400 Subject: [PATCH 75/81] add raja_hiperrchk header Co-authored-by: Jason Burmark --- src/common/HipDataUtils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index e9855ae9c..709e07525 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -22,6 +22,7 @@ #include "RAJA/policy/hip/raja_hiperrchk.hpp" + namespace rajaperf { From 13474345ef47b6fabf9bd6094a0c368e299255ed Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:16:37 -0400 Subject: [PATCH 76/81] update lone N_Elem var --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 868c5aa99..6b417faae 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -76,7 +76,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From cde5c950653113a479e3bcbe07f09526eb853d43 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 15:31:07 -0500 Subject: [PATCH 77/81] update lambda forall --- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/common/HipDataUtils.hpp | 4 ++-- src/stream/ADD-Hip.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 782e4099c..3ddd555b8 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -122,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 25810c19e..e3793b18c 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ 
-82,7 +82,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index a1e7a6465..65f258d8d 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -81,7 +81,7 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 6ded209a9..cf4b7d8e9 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index af3276a7d..bc9030d13 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -90,7 +90,7 @@ void INIT3::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 6f9d41924..b9f28795b 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -80,7 +80,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index ae98f56ab..2a0e06681 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -81,7 +81,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index cb9076b38..b60dcf4b1 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -90,7 +90,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 605696676..1ae18f3d9 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -87,7 +87,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) 
}; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index e9855ae9c..47a255c2a 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -29,7 +29,7 @@ namespace rajaperf * \brief Simple forall hip kernel that runs a lambda. */ template < typename Lambda > -__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; if (i < iend) { @@ -39,7 +39,7 @@ __global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda bod /// template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 5e53500c8..838893318 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -82,7 +82,7 @@ void ADD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index fe302a7fc..9dd9fdeed 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -80,7 +80,7 @@ void COPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 3e5e3f9f0..4894a3a83 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -80,7 +80,7 @@ void MUL::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, mul_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 740727530..c8a89308c 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -82,7 +82,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, triad_lambda); hipErrchk( hipGetLastError() ); From 5b9312325bc2d14672e743ccf481e1c3ed08ba95 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 22:23:30 -0500 Subject: [PATCH 78/81] roll back lambda_hip_forall naming --- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- 
src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/common/HipDataUtils.hpp | 4 ++-- src/stream/ADD-Hip.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 3ddd555b8..782e4099c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -122,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index e3793b18c..25810c19e 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -82,7 +82,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 65f258d8d..a1e7a6465 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -81,7 +81,7 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index cf4b7d8e9..6ded209a9 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index bc9030d13..af3276a7d 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -90,7 +90,7 @@ void INIT3::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index b9f28795b..6f9d41924 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -80,7 +80,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2a0e06681..ae98f56ab 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -81,7 +81,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) }; const size_t grid_size = 
RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index b60dcf4b1..cb9076b38 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -90,7 +90,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1ae18f3d9..605696676 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -87,7 +87,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 459b4016e..709e07525 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -30,7 +30,7 @@ namespace rajaperf * \brief Simple forall hip kernel that runs a lambda. */ template < typename Lambda > -__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; if (i < iend) { @@ -40,7 +40,7 @@ __global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda /// template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 838893318..5e53500c8 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -82,7 +82,7 @@ void ADD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 9dd9fdeed..fe302a7fc 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -80,7 +80,7 @@ void COPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 4894a3a83..3e5e3f9f0 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -80,7 +80,7 @@ void MUL::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, 
mul_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index c8a89308c..740727530 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -82,7 +82,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, triad_lambda); hipErrchk( hipGetLastError() ); From 39fbf158ccbcebe39a0f8d3ea4b74a7a1aea4128 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 22:26:17 -0500 Subject: [PATCH 79/81] get rid of an unused static function warning --- src/common/HipDataUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 709e07525..742f0c54b 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -178,7 +178,7 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } -static std::string getHipArch() +static inline std::string getHipArch() { hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); From 831e61cdf07142f27cbf0030c6fcef316c946457 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 23:00:43 -0500 Subject: [PATCH 80/81] make hipArch more consistent with device naming --- src/common/HipDataUtils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 742f0c54b..782fd7a1f 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -183,7 +183,8 @@ static inline std::string getHipArch() hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); + std::string hipArch = gcnArchName.substr(0, 7); + if(hipArch.back() == ':' ) hipArch.pop_back(); return hipArch; } From 4e6a7b33b5834479c06760638edc0ed3a729039b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 15 Jul 2022 14:03:01 -0500 Subject: [PATCH 81/81] fixing var naming issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 49b79f38d..87966915c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -21,10 +21,10 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ const Index_type N = m_N; \ - const Index_type Ne = m_Ne; \ allocAndInitCudaDeviceData(A, m_A, N); \ allocAndInitCudaDeviceData(B, m_B, N); \ - allocAndInitCudaDeviceData(D, m_D, N); + allocAndInitCudaDeviceData(D, m_D, N); + #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ getCudaDeviceData(m_A, A, N); \ @@ -40,7 +40,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; -const Index_Type N_Elem = N/(Ne*Ne); +const Index_type N_Elem = N/(Ne*Ne); for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -52,7 +52,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; -const Index_Type N_Elem = N/(Ne*Ne); +const Index_type N_Elem = N/(Ne*Ne); for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x +
blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -64,8 +64,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_Type N_Elem = N/(Ne*Ne); - constexpr Index_type Ne = m_Ne; + constexpr Index_type Ne = 16; + const Index_type N_Elem = N/(Ne*Ne); constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size);
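
Note on the MAT_FUSED_MUL_ADD loop structure touched by the patches above: every variant (Seq, OpenMP, OpenMP target, HIP, CUDA) walks N_Elem = N/(Ne*Ne) independent Ne x Ne row-major matrices and produces one entry of D per (row, col) pair. The MAT_FUSED_MUL_ADD_BODY macro itself is not shown in these hunks, so the host-side sketch below only assumes a plain row-major product, following the "Compute D = A x B + C ... C is ignored" description in MAT_FUSED_MUL_ADD.hpp; the Index_type/Real_type aliases are local stand-ins for the suite's typedefs and mat_fused_mul_add_ref is a hypothetical helper name, not part of the suite.

#include <cstddef>

// Local stand-ins for RAJAPerf's Index_type / Real_type typedefs (assumption).
using Index_type = std::ptrdiff_t;
using Real_type  = double;

// Host reference for the batched multiply: D = A x B for each Ne x Ne block,
// mirroring the ii/row/col loop nest used by the Seq and OpenMP variants.
void mat_fused_mul_add_ref(const Real_type* A, const Real_type* B, Real_type* D,
                           Index_type N, Index_type Ne)
{
  const Index_type N_Elem = N / (Ne * Ne);        // number of independent Ne x Ne matrices
  for (Index_type ii = 0; ii != N_Elem; ++ii) {
    const Real_type* a = A + ii * Ne * Ne;        // row-major Ne x Ne block of A
    const Real_type* b = B + ii * Ne * Ne;        // row-major Ne x Ne block of B
    Real_type*       d = D + ii * Ne * Ne;
    for (Index_type row = 0; row != Ne; ++row) {
      for (Index_type col = 0; col != Ne; ++col) {
        Real_type dot = 0.0;
        for (Index_type k = 0; k != Ne; ++k) {
          dot += a[row * Ne + k] * b[k * Ne + col];  // accumulate the (row, col) entry
        }
        d[row * Ne + col] = dot;                     // the C term is ignored, per the header comment
      }
    }
  }
}

In the CUDA/HIP variants shown above, the (row, col) pair instead comes from threadIdx/blockIdx, so only the ii loop and the per-entry accumulation remain inside the kernel.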
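
Note on the getHipArch() change in PATCH 80: devProp.gcnArchName on recent ROCm carries the full target string (architecture name plus feature flags), so taking seven characters and stripping a trailing ':' yields the bare gfx name for both 6- and 7-character architectures. The sketch below lifts that trimming logic into a standalone helper so it can be exercised without a HIP device; the input strings are illustrative examples, not output captured from a particular GPU.

#include <iostream>
#include <string>

// Same substr(0, 7) + trailing-colon trim as getHipArch() in HipDataUtils.hpp.
static std::string trimGcnArchName(const std::string& gcnArchName)
{
  std::string hipArch = gcnArchName.substr(0, 7);  // e.g. "gfx90a:" or "gfx908:"
  if (hipArch.back() == ':') hipArch.pop_back();   // drop the feature-flag separator
  return hipArch;
}

int main()
{
  std::cout << trimGcnArchName("gfx90a:sramecc+:xnack-") << "\n";  // prints gfx90a
  std::cout << trimGcnArchName("gfx908:sramecc+:xnack-") << "\n";  // prints gfx908
  return 0;
}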