From 1594a507f379fd5b6d1fa46a2dffbd7b65d156fa Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 26 Apr 2022 12:07:04 -0500 Subject: [PATCH 01/81] adding inital infrastructure for MFMA test --- src/CMakeLists.txt | 3 + src/basic/CMakeLists.txt | 6 ++ src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 109 ++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 106 +++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 72 ++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 39 ++++++++ src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 66 +++++++++++++ src/basic/MAT_FUSED_MUL_ADD.cpp | 98 +++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD.hpp | 88 +++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + 11 files changed, 594 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Hip.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMP.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Seq.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bc1bf6b77..06a80317c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,6 +94,9 @@ blt_add_executable( basic/INIT_VIEW1D_OFFSET.cpp basic/INIT_VIEW1D_OFFSET-Seq.cpp basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + basic/MAT_FUSED_MUL_ADD.cpp + basic/MAT_FUSED_MUL_ADD-Seq.cpp + basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp basic/MAT_MAT_SHARED.cpp basic/MAT_MAT_SHARED-Seq.cpp basic/MAT_MAT_SHARED-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index ceeb1a502..8d2cee87b 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -56,6 +56,12 @@ blt_add_library( INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + MAT_FUSED_MUL_ADD.cpp + MAT_FUSED_MUL_ADD-Seq.cpp + MAT_FUSED_MUL_ADD-Hip.cpp + MAT_FUSED_MUL_ADD-Cuda.cpp + MAT_FUSED_MUL_ADD-OMP.cpp + MAT_FUSED_MUL_ADD-OMPTarget.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp new file mode 100644 index 000000000..33e157080 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitCudaDeviceData(A, m_A, N); \ + allocAndInitCudaDeviceData(B, m_B, N); \ + allocAndInitCudaDeviceData(D, m_D, N); \ + allocAndInitCudaDeviceData(Ae, m_Ae, Ne); \ + allocAndInitCudaDeviceData(Be, m_Be, Ne); \ + allocAndInitCudaDeviceData(De, m_De, Ne); + + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_A, A, N); \ + getCudaDeviceData(m_B, B, N); \ + getCudaDeviceData(m_D, D, N); \ + getCudaDeviceData(m_Ae, Ae, Ne); \ + getCudaDeviceData(m_Be, Be, Ne); \ + getCudaDeviceData(m_De, De, Ne); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(Ae); \ + deallocCudaDeviceData(Be); \ + deallocCudaDeviceData(De); + + +template < Index_type block_size > + __launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == Lambda_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp new file mode 100644 index 000000000..528b629df --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); \ + allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ + allocAndInitHipDeviceData(Be, m_Be, Ne); \ + allocAndInitHipDeviceData(De, m_De, Ne); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ + getHipDeviceData(m_Ae, Ae, Ne); \ + getHipDeviceData(m_Be, Be, Ne); \ + getHipDeviceData(m_De, De, Ne); \ + deallocHipDeviceData(A); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(D); \ + deallocHipDeviceData(Ae); \ + deallocHipDeviceData(Be); \ + deallocHipDeviceData(De); + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == Lambda_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == RAJA_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp new file mode 100644 index 000000000..0651e3ccd --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -0,0 +1,72 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp new file mode 100644 index 000000000..e6c5933eb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + + + void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; + break; + } + } + } + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp new file mode 100644 index 000000000..d1b11a9c1 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -0,0 +1,66 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // number of iterations + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { + + + startTimer(); + for (Index_type irep = 0; irep < run_reps; ++irep) { + } // irep + stopTimer(); + + break; + } + + case RAJA_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp new file mode 100644 index 000000000..35db9553b --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) + : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) +{ + m_N_default = 1000; + setDefaultProblemSize(m_N_default*m_N_default); + setDefaultReps(5); + + //If problem target size is not divisible by Ne, round up + m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ + Index_type(m_Ne)); + + setActualProblemSize(m_N*m_N); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep( m_N*m_N*sizeof(Real_type) + + m_N*m_N*sizeof(Real_type) ); + + //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 + setFLOPsPerRep(2*m_N*m_N*m_N - m_N*m_N); + + checksum_scale_factor = 1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + + + setVariantDefined(Base_Seq); + setVariantDefined(Lambda_Seq); + setVariantDefined(RAJA_Seq); + + setVariantDefined(Base_OpenMP); + setVariantDefined(Lambda_OpenMP); + setVariantDefined(RAJA_OpenMP); + + setVariantDefined(Base_CUDA); + setVariantDefined(Lambda_CUDA); + setVariantDefined(RAJA_CUDA); + + setVariantDefined(Base_HIP); + setVariantDefined(Lambda_HIP); + setVariantDefined(RAJA_HIP); +} + +MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} + +void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + //hard coded for 16 at the moment + const Index_type m_Me = m_Ne; + const Index_type m_Ne = m_Ne; + const Index_type m_Ke = m_Ne; + //global matrices + allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); + //element/batch matrices + 
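+  // Ae (Me x Ke), Be (Ke x Ne), and De (Me x Ne) each hold a single per-element
+  // tile; with Me = Ke = Ne = 16 these match the 16x16x16 tile shape targeted by
+  // the GPU matrix-core (MFMA) variants of this kernel.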
allocAndInitDataConst(m_Ae, m_Me * m_Ke, 1.0, vid); + allocAndInitDataConst(m_Be, m_Ke * m_Ne, 1.0, vid); + allocAndInitDataConst(m_De, m_Me * m_Ne, 0.0, vid); +} + +void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_Ae, m_N, checksum_scale_factor ); +} + +void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + deallocData(m_A); + deallocData(m_B); + deallocData(m_D); + deallocData(m_Ae); + deallocData(m_Be); + deallocData(m_De); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp new file mode 100644 index 000000000..caf777934 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -0,0 +1,88 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Compute D = A x B + C, where +// A: a M x K matrix +// B: a K x N matrix +// C, D: M x N matrices +// All square row-major matrices, C is a null matrix and ignored. +// for(int row = 0; row != m; ++row){ +// for(int col = 0; col != n; ++col){ +// +// float sum = 0.0; +// for (int kk = 0; kk < k; ++kk){ +// sum += A[row][kk] * B[kk][col]; +// } +// D[row][col] = sum; +// } +// } +// } + +#ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP +#define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP + +#include "RAJA/RAJA.hpp" +#include "common/KernelBase.hpp" + + +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr D = m_D; \ + Real_ptr Ae = m_Ae; \ + Real_ptr Be = m_Be; \ + Real_ptr De = m_De; + +namespace rajaperf { +class RunParams; + +namespace basic { + +class MAT_FUSED_MUL_ADD : public KernelBase { +public: + MAT_FUSED_MUL_ADD(const RunParams ¶ms); + + ~MAT_FUSED_MUL_ADD(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_A; + Real_ptr m_B; + Real_ptr m_D; + Real_ptr m_Ae; + Real_ptr m_Be; + Real_ptr m_De; + + Index_type m_N; + Index_type m_N_default; + static constexpr Index_type m_Ne = 16; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 1bc10c31d..ecfc2f797 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -25,6 +25,7 @@ #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_FUSED_MUL_ADD.hpp" 
#include "basic/MAT_MAT_SHARED.hpp" #include "basic/MULADDSUB.hpp" #include "basic/NESTED_INIT.hpp" @@ -158,6 +159,7 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MAT_FUSED_MUL_ADD"), std::string("Basic_MAT_MAT_SHARED"), std::string("Basic_MULADDSUB"), std::string("Basic_NESTED_INIT"), @@ -535,6 +537,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INIT_VIEW1D_OFFSET(run_params); break; } + case Basic_MAT_FUSED_MUL_ADD : { + kernel = new basic::MAT_FUSED_MUL_ADD(run_params); + break; + } case Basic_MAT_MAT_SHARED : { kernel = new basic::MAT_MAT_SHARED(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fad672137..ea082325e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -78,6 +78,7 @@ enum KernelID { Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, + Basic_MAT_FUSED_MUL_ADD, Basic_MAT_MAT_SHARED, Basic_MULADDSUB, Basic_NESTED_INIT, From e3d5aac37126b02c64d7bb9ab16ccb90d45a8353 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 13:17:00 -0500 Subject: [PATCH 02/81] working hip mfma with matrix core builtins --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 64 +++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 528b629df..e3c138ace 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,18 +21,19 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ + const Index_type NN = m_N * m_N; \ const Index_type Ne = m_Ne; \ - allocAndInitHipDeviceData(A, m_A, N); \ - allocAndInitHipDeviceData(B, m_B, N); \ - allocAndInitHipDeviceData(D, m_D, N); \ + allocAndInitHipDeviceData(A, m_A, NN); \ + allocAndInitHipDeviceData(B, m_B, NN); \ + allocAndInitHipDeviceData(D, m_D, NN); \ allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ allocAndInitHipDeviceData(Be, m_Be, Ne); \ allocAndInitHipDeviceData(De, m_De, Ne); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, N); \ - getHipDeviceData(m_B, B, N); \ - getHipDeviceData(m_D, D, N); \ + getHipDeviceData(m_A, A, NN); \ + getHipDeviceData(m_B, B, NN); \ + getHipDeviceData(m_D, D, NN); \ getHipDeviceData(m_Ae, Ae, Ne); \ getHipDeviceData(m_Be, Be, Ne); \ getHipDeviceData(m_De, De, Ne); \ @@ -45,31 +46,76 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, - Real_ptr D) { +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D){ + // compute a 16x16x16 matrix multiplication using a single wavefront. 
+#if defined(RP_USE_DOUBLE) + using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; + double4 result = {0}; +#elif defined(RP_USE_FLOAT) + using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; + float4 result = {0}; // zero out 4 vanilla VGPRs +#endif + Index_type a_idx = 16 * threadIdx.x + threadIdx.y; + Index_type b_idx = threadIdx.x + 16 * threadIdx.y; + + for(int i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; + +#ifdef __gfx90a__ +#if defined(RP_USE_DOUBLE) + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#elif defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif +#endif + a_idx += 4; // move four columns to the right + b_idx += 4*16; // move four rows down + } + + #pragma unroll 4 + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + + i * 16 + + threadIdx.y * 4 * 16; + D[d_idx] = result[i]; + } } + + template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + const Index_type NN = m_N * m_N; + //const Index_type Ne = m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDim(16, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_HIP) { + for(Index_type i = 0; i != NN; ++i){ m_A[i] = i; } + for(Index_type i = 0; i != NN; ++i){ m_B[i] = NN - 1 - i; } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D); + hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 332067b79185505f2385bef364a67bd722cff1a6 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 14:46:34 -0500 Subject: [PATCH 03/81] fixing incorrect default problem size --- src/basic/MAT_FUSED_MUL_ADD.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 35db9553b..5f2589f17 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,14 +20,15 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 1000; - setDefaultProblemSize(m_N_default*m_N_default); + m_N_default = 16; + setDefaultProblemSize(m_N_default); setDefaultReps(5); //If problem target size is not divisible by Ne, round up - m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ - Index_type(m_Ne)); +// m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ +// Index_type(m_Ne)); + m_N = 16; setActualProblemSize(m_N*m_N); setItsPerRep(getActualProblemSize()); From 8aa020bb282dffd2ba66e20229bb2f9727afdb78 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 15:01:00 -0500 Subject: [PATCH 04/81] fixing flop calc --- src/basic/MAT_FUSED_MUL_ADD.cpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 5f2589f17..fca7af98b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -34,11 +34,10 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) setItsPerRep(getActualProblemSize()); 
setKernelsPerRep(1); - setBytesPerRep( m_N*m_N*sizeof(Real_type) + - m_N*m_N*sizeof(Real_type) ); + setBytesPerRep( 2*m_N*m_N*sizeof(Real_type)); //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 - setFLOPsPerRep(2*m_N*m_N*m_N - m_N*m_N); + setFLOPsPerRep(2*m_N*m_N*m_N); checksum_scale_factor = 1e-6 * ( static_cast(getDefaultProblemSize()) / @@ -66,23 +65,16 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - - //hard coded for 16 at the moment - const Index_type m_Me = m_Ne; - const Index_type m_Ne = m_Ne; - const Index_type m_Ke = m_Ne; + //global matrices allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); - //element/batch matrices - allocAndInitDataConst(m_Ae, m_Me * m_Ke, 1.0, vid); - allocAndInitDataConst(m_Be, m_Ke * m_Ne, 1.0, vid); - allocAndInitDataConst(m_De, m_Me * m_Ne, 0.0, vid); + } void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_Ae, m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_N*m_N, checksum_scale_factor ); } void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { @@ -90,9 +82,7 @@ void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_A); deallocData(m_B); deallocData(m_D); - deallocData(m_Ae); - deallocData(m_Be); - deallocData(m_De); + } } // end namespace basic From bc9770a1f7b712df5a675a3e6d66db03986aa823 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Wed, 27 Apr 2022 15:02:02 -0500 Subject: [PATCH 05/81] fixing some variable names --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 22 +++++++++------------- src/basic/MAT_FUSED_MUL_ADD.hpp | 8 +------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index e3c138ace..5e34fe993 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -22,27 +22,17 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ const Index_type NN = m_N * m_N; \ - const Index_type Ne = m_Ne; \ allocAndInitHipDeviceData(A, m_A, NN); \ allocAndInitHipDeviceData(B, m_B, NN); \ - allocAndInitHipDeviceData(D, m_D, NN); \ - allocAndInitHipDeviceData(Ae, m_Ae, Ne); \ - allocAndInitHipDeviceData(Be, m_Be, Ne); \ - allocAndInitHipDeviceData(De, m_De, Ne); + allocAndInitHipDeviceData(D, m_D, NN); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ getHipDeviceData(m_A, A, NN); \ getHipDeviceData(m_B, B, NN); \ getHipDeviceData(m_D, D, NN); \ - getHipDeviceData(m_Ae, Ae, Ne); \ - getHipDeviceData(m_Be, Be, Ne); \ - getHipDeviceData(m_De, De, Ne); \ deallocHipDeviceData(A); \ deallocHipDeviceData(B); \ - deallocHipDeviceData(D); \ - deallocHipDeviceData(Ae); \ - deallocHipDeviceData(Be); \ - deallocHipDeviceData(De); + deallocHipDeviceData(D); template < Index_type block_size > __launch_bounds__(block_size) @@ -91,7 +81,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type N = m_N; const Index_type NN = m_N * m_N; - //const Index_type Ne = m_Ne; dim3 gridDim (1, 1, 1); dim3 blockDim(16, 4, 1); @@ -114,6 +103,13 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; +// 
for(int i = 0; i != A_h.size(); ++i){ +// printf("A_h[%d] = %f\n", i, A_h[i]); +// } + for(int i = 0; i != NN; ++i){ + printf("D[%d] = %f\n", i, m_D[i]); + } + } else if (vid == Lambda_HIP) { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index caf777934..e7c621408 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -33,10 +33,7 @@ #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ - Real_ptr D = m_D; \ - Real_ptr Ae = m_Ae; \ - Real_ptr Be = m_Be; \ - Real_ptr De = m_De; + Real_ptr D = m_D; namespace rajaperf { class RunParams; @@ -73,9 +70,6 @@ class MAT_FUSED_MUL_ADD : public KernelBase { Real_ptr m_A; Real_ptr m_B; Real_ptr m_D; - Real_ptr m_Ae; - Real_ptr m_Be; - Real_ptr m_De; Index_type m_N; Index_type m_N_default; From 774d74211d5eaa17e9ff06e6e191649b22a2872e Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 15:03:52 -0500 Subject: [PATCH 06/81] setting up problem size infrastructure correctly --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 35 ++++++++++++++++------------- src/basic/MAT_FUSED_MUL_ADD.cpp | 22 ++++++++---------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 5e34fe993..2238332cc 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,15 +21,16 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - const Index_type NN = m_N * m_N; \ - allocAndInitHipDeviceData(A, m_A, NN); \ - allocAndInitHipDeviceData(B, m_B, NN); \ - allocAndInitHipDeviceData(D, m_D, NN); + const Index_type Ne = m_Ne; \ + const Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_A, A, NN); \ - getHipDeviceData(m_B, B, NN); \ - getHipDeviceData(m_D, D, NN); \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ deallocHipDeviceData(A); \ deallocHipDeviceData(B); \ deallocHipDeviceData(D); @@ -80,7 +81,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type NN = m_N * m_N; + const Index_type Ne = m_Ne; + const Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); dim3 blockDim(16, 4, 1); @@ -89,8 +91,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) if (vid == Base_HIP) { - for(Index_type i = 0; i != NN; ++i){ m_A[i] = i; } - for(Index_type i = 0; i != NN; ++i){ m_B[i] = NN - 1 - i; } + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + //for(Index_type ii = 0; ii != 3; ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -103,12 +108,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; -// for(int i = 0; i != A_h.size(); ++i){ -// printf("A_h[%d] = %f\n", i, A_h[i]); -// } - for(int i = 0; i != NN; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); + for(int i = 0; i != N; ++i){ + printf("A[%d] = %f\n", i, m_A[i]); } +// for(int i = 0; i != NN; ++i){ +// printf("D[%d] = %f\n", i, m_D[i]); +// } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index fca7af98b..d6820cfd8 
100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,24 +20,20 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 16; + m_N_default = 1024; setDefaultProblemSize(m_N_default); setDefaultReps(5); - //If problem target size is not divisible by Ne, round up -// m_N = std::max(Index_type(getTargetProblemSize())*(Index_type(getTargetProblemSize())/16), \ -// Index_type(m_Ne)); - - m_N = 16; - setActualProblemSize(m_N*m_N); + //Make sure problem target size is divisible by Ne*Ne + m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); + setActualProblemSize(m_N); setItsPerRep(getActualProblemSize()); setKernelsPerRep(1); - setBytesPerRep( 2*m_N*m_N*sizeof(Real_type)); + setBytesPerRep(2*m_N*sizeof(Real_type)); + setFLOPsPerRep(2*m_N*m_Ne); - //Square Mat-Mat product flops should be (2^N−1)N^2=2*N^3−N^2 - setFLOPsPerRep(2*m_N*m_N*m_N); checksum_scale_factor = 1e-6 * ( static_cast(getDefaultProblemSize()) / @@ -67,9 +63,9 @@ MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { //global matrices - allocAndInitDataConst(m_A, m_N * m_N, 1.0, vid); - allocAndInitDataConst(m_B, m_N * m_N, 1.0, vid); - allocAndInitDataConst(m_D, m_N * m_N, 0.0, vid); + allocAndInitDataConst(m_A, m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N*m_N, 0.0, vid); } From 83700d6c7949681c69b9bce7298937a697899518 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 15:13:04 -0500 Subject: [PATCH 07/81] fixing results array storage --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 15 ++------------- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD.cpp | 5 ++--- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 33e157080..93503b2d7 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -24,26 +24,15 @@ namespace basic { const Index_type Ne = m_Ne; \ allocAndInitCudaDeviceData(A, m_A, N); \ allocAndInitCudaDeviceData(B, m_B, N); \ - allocAndInitCudaDeviceData(D, m_D, N); \ - allocAndInitCudaDeviceData(Ae, m_Ae, Ne); \ - allocAndInitCudaDeviceData(Be, m_Be, Ne); \ - allocAndInitCudaDeviceData(De, m_De, Ne); - + allocAndInitCudaDeviceData(D, m_D, N); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ getCudaDeviceData(m_A, A, N); \ getCudaDeviceData(m_B, B, N); \ getCudaDeviceData(m_D, D, N); \ - getCudaDeviceData(m_Ae, Ae, Ne); \ - getCudaDeviceData(m_Be, Be, Ne); \ - getCudaDeviceData(m_De, De, Ne); \ deallocCudaDeviceData(A); \ deallocCudaDeviceData(B); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(Ae); \ - deallocCudaDeviceData(Be); \ - deallocCudaDeviceData(De); - + deallocCudaDeviceData(D); template < Index_type block_size > __launch_bounds__(block_size) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2238332cc..6d348b313 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -108,12 +108,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; +// for(int i = 0; i != N; ++i){ +// printf("A[%d] = %f\n", i, m_A[i]); +// } for(int i = 0; i != N; ++i){ - printf("A[%d] = %f\n", i, m_A[i]); + printf("D[%d] = 
%f\n", i, m_D[i]); } -// for(int i = 0; i != NN; ++i){ -// printf("D[%d] = %f\n", i, m_D[i]); -// } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index d6820cfd8..391a03949 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -62,15 +62,14 @@ MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - //global matrices allocAndInitDataConst(m_A, m_N, 1.0, vid); allocAndInitDataConst(m_B, m_N, 1.0, vid); - allocAndInitDataConst(m_D, m_N*m_N, 0.0, vid); + allocAndInitDataConst(m_D, m_N, 0.0, vid); } void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { - checksum[vid][tune_idx] += calcChecksum(m_D, m_N*m_N, checksum_scale_factor ); + checksum[vid][tune_idx] += calcChecksum(m_D, m_N, checksum_scale_factor ); } void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { From 1bbd47368bdfdaf2ab72da0c3026e5d4e295c838 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 16:04:01 -0500 Subject: [PATCH 08/81] finish multi matrix support --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 6d348b313..483160bc7 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -37,17 +37,20 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D){ +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ +constexpr Index_type Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ // compute a 16x16x16 matrix multiplication using a single wavefront. 
#if defined(RP_USE_DOUBLE) using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; double4 result = {0}; #elif defined(RP_USE_FLOAT) using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; - float4 result = {0}; // zero out 4 vanilla VGPRs + float4 result = {0}; #endif - Index_type a_idx = 16 * threadIdx.x + threadIdx.y; - Index_type b_idx = threadIdx.x + 16 * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(int i = 0; i < 4; ++i){ Real_type a = A[a_idx]; @@ -61,19 +64,20 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D #endif #endif a_idx += 4; // move four columns to the right - b_idx += 4*16; // move four rows down + b_idx += 4*Ne; // move four rows down } #pragma unroll 4 for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x - + i * 16 - + threadIdx.y * 4 * 16; + + i * Ne + + threadIdx.y * 4 * Ne + + ii*(Ne*Ne); D[d_idx] = result[i]; } } - +} template < size_t block_size > @@ -102,20 +106,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D); + A, B, D, N); hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; -// for(int i = 0; i != N; ++i){ -// printf("A[%d] = %f\n", i, m_A[i]); -// } - for(int i = 0; i != N; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); - } - - } else if (vid == Lambda_HIP) { From 55918498a2d6b9cbf18d271ca3ea941427710171 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 21:43:49 -0500 Subject: [PATCH 09/81] updating algorithm description --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 +++++----- src/basic/MAT_FUSED_MUL_ADD.cpp | 2 +- src/basic/MAT_FUSED_MUL_ADD.hpp | 34 +++++++++++++++++------------ 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 483160bc7..16fbd1e0a 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,8 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - const Index_type Ne = m_Ne; \ - const Index_type NeNe = m_Ne * m_Ne; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -85,18 +85,17 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; - const Index_type NeNe = m_Ne * m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(16, 4, 1); + dim3 blockDim(Ne, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_HIP) { for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - //for(Index_type ii = 0; ii != 3; ++ii){ for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } } diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 391a03949..957e09bfb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -24,7 +24,7 @@ MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) setDefaultProblemSize(m_N_default); setDefaultReps(5); - //Make 
sure problem target size is divisible by Ne*Ne + //Make sure problem target size is divisible by 16*16 m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); setActualProblemSize(m_N); diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index e7c621408..552590c1a 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -5,24 +5,30 @@ // // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Compute D = A x B + C, where -// A: a M x K matrix -// B: a K x N matrix -// C, D: M x N matrices -// All square row-major matrices, C is a null matrix and ignored. -// for(int row = 0; row != m; ++row){ -// for(int col = 0; col != n; ++col){ -// -// float sum = 0.0; -// for (int kk = 0; kk < k; ++kk){ -// sum += A[row][kk] * B[kk][col]; -// } -// D[row][col] = sum; +// Inputs: +// A: N/(Ne*Ne) Ne x Ne matrices +// B: N/(Ne*Ne) Ne x Ne matrices +// Ouput: +// D: N/(Ne*Ne) Ne x Ne matrices +// All square row-major matrices, C is ignored. +// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(int row = 0; row != Ne; ++row){ +// for(int col = 0; col != Ne; ++col){ +// float dot = 0.0; +// int A_idx = row * Ne + ii*NeNe; +// int B_idx = col + ii*NeNe; +// for(int i = 0; i != Ne; ++i){ +// dot += A[A_idx] * B[B_idx]; +// ++A_idx; +// B_idx += Ne; // } +// D[row * Ne + col] = dot; // } // } - +// return D; +//} +//} #ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP #define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP From 4653c23650e5e546cd17a3291c69390ea9285c0c Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 28 Apr 2022 22:31:07 -0500 Subject: [PATCH 10/81] adding a few more guard rails for fma builtins. kernel still only implemented for gfx908 and gfx90a --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 16fbd1e0a..9d1bd598f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -56,13 +56,20 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; -#ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) +#ifdef __gfx90a__ result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#else + result = {0}; //currenlty unimplemented +#endif //end __gfx90a__ #elif defined(RP_USE_FLOAT) +#ifdef __gfx90a__ || __gfx908__ result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); -#endif -#endif +#else + result = {0}; //uncurrently unimplemented +#endif //end __gfx90a__ or __gfx908__ +#endif //end FLOAT vs DOBULE + a_idx += 4; // move four columns to the right b_idx += 4*Ne; // move four rows down } From bd47d35131b13876c97f840ee0ba7db8f46b6efc Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 29 Apr 2022 16:59:38 -0500 Subject: [PATCH 11/81] adding some ifdefs to call a seperate mfma kernel if hardware support not availble. 
kernel is currently empty --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 44 ++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 9d1bd598f..29dad102c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,8 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -37,7 +37,7 @@ namespace basic { template < Index_type block_size > __launch_bounds__(block_size) -__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ @@ -60,13 +60,13 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ #ifdef __gfx90a__ result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); #else - result = {0}; //currenlty unimplemented + result = {0}; //should never get here. #endif //end __gfx90a__ #elif defined(RP_USE_FLOAT) -#ifdef __gfx90a__ || __gfx908__ +#if defined(__gfx90a__) || defined(__gfx908__) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); #else - result = {0}; //uncurrently unimplemented + result = {0}; //should never get here #endif //end __gfx90a__ or __gfx908__ #endif //end FLOAT vs DOBULE @@ -85,8 +85,13 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ } } } - - +//Reference for cases with no hardware support +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + return; +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { @@ -97,6 +102,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, 4, 1); + hipDeviceProp_t devProp; + hipError_t err = hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -110,7 +119,23 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + if(hipArch == "gfx90a") + //Both FP32 and FP64 supported + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + else if(hipArch == "gfx908"){ +#if defined(RP_USE_FLOAT) + //Only FP32 supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); +#elif defined(RP_USE_DOUBLE) + //FP64 not supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + } +#endif + else + //Otherwise no mfma hardware support hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, A, B, D, N); hipErrchk( hipGetLastError() ); @@ -118,7 +143,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From fde4b655a4f3178674dd2b6ec270ff9f596fc28f Mon Sep 17 00:00:00 
2001 From: Corbin Andrew Robeck Date: Mon, 2 May 2022 23:26:58 -0500 Subject: [PATCH 12/81] adding reference basic mat-mat kernel for comparison when no mfma instructions are available --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 29dad102c..bc757242d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -90,6 +90,15 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ + constexpr int Ne = 16; + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x] = sum; return; } template < size_t block_size > @@ -101,7 +110,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, 4, 1); + dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipError_t err = hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); @@ -121,12 +131,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { if(hipArch == "gfx90a") //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, N); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDim), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, N); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 @@ -143,6 +153,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 717438238fd4d7d816da03cb8f1eff6032928bb7 Mon Sep 17 00:00:00 2001 From: Corbin Andrew Robeck Date: Tue, 3 May 2022 13:26:58 -0500 Subject: [PATCH 13/81] adding problem size support to base hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index bc757242d..e47e00c97 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -91,6 +91,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; @@ -98,8 +99,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D for (int k = 0; k < Ne; ++k) { sum += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x] = sum; - return; + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) @@ -153,7 +154,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { 
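 // Note: the Lambda_HIP and RAJA_HIP branches below are still empty placeholder
 // loops; only the Base_HIP variant exercises the matrix-core and fallback
 // kernels so far.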
MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From f5fe86145b35cf2288ef096bfed205e15007a786 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 13:26:58 -0500 Subject: [PATCH 14/81] adding problem size support to base hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index bc757242d..e47e00c97 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -91,6 +91,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; @@ -98,8 +99,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D for (int k = 0; k < Ne; ++k) { sum += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x] = sum; - return; + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) @@ -153,7 +154,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 799e17c04d6edc7801f8101ad2993be040cc4855 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:18:42 -0500 Subject: [PATCH 15/81] updating top level cmake file for new mfma test --- src/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bc1bf6b77..06a80317c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,6 +94,9 @@ blt_add_executable( basic/INIT_VIEW1D_OFFSET.cpp basic/INIT_VIEW1D_OFFSET-Seq.cpp basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + basic/MAT_FUSED_MUL_ADD.cpp + basic/MAT_FUSED_MUL_ADD-Seq.cpp + basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp basic/MAT_MAT_SHARED.cpp basic/MAT_MAT_SHARED-Seq.cpp basic/MAT_MAT_SHARED-OMPTarget.cpp From 451dee25dbe34406ea0f70ca4fbb01b14f27a871 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:19:25 -0500 Subject: [PATCH 16/81] updating cmake file to add mfma test to basic list --- src/basic/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index ceeb1a502..8d2cee87b 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -56,6 +56,12 @@ blt_add_library( INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + MAT_FUSED_MUL_ADD.cpp + MAT_FUSED_MUL_ADD-Seq.cpp + MAT_FUSED_MUL_ADD-Hip.cpp + MAT_FUSED_MUL_ADD-Cuda.cpp + MAT_FUSED_MUL_ADD-OMP.cpp + MAT_FUSED_MUL_ADD-OMPTarget.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp From c015117cf544b6699dcf1d3c86860a96a9796e0c Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:19:53 -0500 Subject: [PATCH 17/81] adding mfma test to raja perf suite infrastructure --- src/common/RAJAPerfSuite.cpp | 6 ++++++ src/common/RAJAPerfSuite.hpp | 1 + 2 files changed, 7 insertions(+) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 1bc10c31d..ecfc2f797 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -25,6 +25,7 @@ #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_FUSED_MUL_ADD.hpp" #include "basic/MAT_MAT_SHARED.hpp" 
#include "basic/MULADDSUB.hpp" #include "basic/NESTED_INIT.hpp" @@ -158,6 +159,7 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MAT_FUSED_MUL_ADD"), std::string("Basic_MAT_MAT_SHARED"), std::string("Basic_MULADDSUB"), std::string("Basic_NESTED_INIT"), @@ -535,6 +537,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INIT_VIEW1D_OFFSET(run_params); break; } + case Basic_MAT_FUSED_MUL_ADD : { + kernel = new basic::MAT_FUSED_MUL_ADD(run_params); + break; + } case Basic_MAT_MAT_SHARED : { kernel = new basic::MAT_MAT_SHARED(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fad672137..ea082325e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -78,6 +78,7 @@ enum KernelID { Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, + Basic_MAT_FUSED_MUL_ADD, Basic_MAT_MAT_SHARED, Basic_MULADDSUB, Basic_NESTED_INIT, From 6a6ccd7c2ed45794f8d7df8eac9eaadb5d08893b Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:20:47 -0500 Subject: [PATCH 18/81] adding mfma base and header files --- src/basic/MAT_FUSED_MUL_ADD.cpp | 84 +++++++++++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD.hpp | 88 +++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD.hpp diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp new file mode 100644 index 000000000..957e09bfb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) + : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) +{ + m_N_default = 1024; + setDefaultProblemSize(m_N_default); + setDefaultReps(5); + + //Make sure problem target size is divisible by 16*16 + m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); + setActualProblemSize(m_N); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep(2*m_N*sizeof(Real_type)); + setFLOPsPerRep(2*m_N*m_Ne); + + + checksum_scale_factor = 1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + + + setVariantDefined(Base_Seq); + setVariantDefined(Lambda_Seq); + setVariantDefined(RAJA_Seq); + + setVariantDefined(Base_OpenMP); + setVariantDefined(Lambda_OpenMP); + setVariantDefined(RAJA_OpenMP); + + setVariantDefined(Base_CUDA); + setVariantDefined(Lambda_CUDA); + setVariantDefined(RAJA_CUDA); + + setVariantDefined(Base_HIP); + setVariantDefined(Lambda_HIP); + setVariantDefined(RAJA_HIP); +} + +MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} + +void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + allocAndInitDataConst(m_A, m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N, 0.0, vid); + +} + +void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_D, m_N, checksum_scale_factor ); +} + +void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + deallocData(m_A); + deallocData(m_B); + deallocData(m_D); + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp new file mode 100644 index 000000000..552590c1a --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -0,0 +1,88 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Compute D = A x B + C, where +// Inputs: +// A: N/(Ne*Ne) Ne x Ne matrices +// B: N/(Ne*Ne) Ne x Ne matrices +// Ouput: +// D: N/(Ne*Ne) Ne x Ne matrices +// All square row-major matrices, C is ignored. 
+// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(int row = 0; row != Ne; ++row){ +// for(int col = 0; col != Ne; ++col){ +// float dot = 0.0; +// int A_idx = row * Ne + ii*NeNe; +// int B_idx = col + ii*NeNe; +// for(int i = 0; i != Ne; ++i){ +// dot += A[A_idx] * B[B_idx]; +// ++A_idx; +// B_idx += Ne; +// } +// D[row * Ne + col] = dot; +// } +// } +// return D; +//} +//} +#ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP +#define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP + +#include "RAJA/RAJA.hpp" +#include "common/KernelBase.hpp" + + +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr D = m_D; + +namespace rajaperf { +class RunParams; + +namespace basic { + +class MAT_FUSED_MUL_ADD : public KernelBase { +public: + MAT_FUSED_MUL_ADD(const RunParams ¶ms); + + ~MAT_FUSED_MUL_ADD(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_A; + Real_ptr m_B; + Real_ptr m_D; + + Index_type m_N; + Index_type m_N_default; + static constexpr Index_type m_Ne = 16; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard From 43c1c6e413b70a0584b8d2dff8c27c6cf8cb40c9 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:21:27 -0500 Subject: [PATCH 19/81] adding mfma seq variant skeleton --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 66 +++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Seq.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp new file mode 100644 index 000000000..d1b11a9c1 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -0,0 +1,66 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // number of iterations + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { + + + startTimer(); + for (Index_type irep = 0; irep < run_reps; ++irep) { + } // irep + stopTimer(); + + break; + } + + case RAJA_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } +} + +} // end namespace basic +} // end namespace rajaperf From 2e759818b5ba9faf3a406a0e930b3997af5e22aa Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:22:02 -0500 Subject: [PATCH 20/81] adding mfma omp and omp offload variant skeleton structure --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 72 +++++++++++++++++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 39 ++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMP.cpp create mode 100644 src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp new file mode 100644 index 000000000..0651e3ccd --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -0,0 +1,72 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp new file mode 100644 index 000000000..e6c5933eb --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -0,0 +1,39 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + + + void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + const Index_type run_reps = getRunReps(); + + switch (vid) { + + default: { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; + break; + } + } + } + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP From 518121981473f3c3b8c15acfa90a0fafddb1f386 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:22:16 -0500 Subject: [PATCH 21/81] adding mfma cuda variant skeleton --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 98 ++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp new file mode 100644 index 000000000..93503b2d7 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ + const Index_type N = m_N; \ + const Index_type Ne = m_Ne; \ + allocAndInitCudaDeviceData(A, m_A, N); \ + allocAndInitCudaDeviceData(B, m_B, N); \ + allocAndInitCudaDeviceData(D, m_D, N); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_A, A, N); \ + getCudaDeviceData(m_B, B, N); \ + getCudaDeviceData(m_D, D, N); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(D); + +template < Index_type block_size > + __launch_bounds__(block_size) +__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, + Real_ptr D) { + +} + +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == Lambda_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA From ef451af849ac9f4f02832a5fbe74c26f242f1655 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 3 May 2022 14:23:13 -0500 Subject: [PATCH 22/81] add inital set of HIP mfma varaint with builtin matrix core instructions --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 190 ++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 src/basic/MAT_FUSED_MUL_ADD-Hip.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp new file mode 100644 index 000000000..e47e00c97 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -0,0 +1,190 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ + deallocHipDeviceData(A); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(D); + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ +constexpr Index_type Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + // compute a 16x16x16 matrix multiplication using a single wavefront. +#if defined(RP_USE_DOUBLE) + using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; + double4 result = {0}; +#elif defined(RP_USE_FLOAT) + using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; + float4 result = {0}; +#endif + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); + + for(int i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; + +#if defined(RP_USE_DOUBLE) +#ifdef __gfx90a__ + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#else + result = {0}; //should never get here. 
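+      // Note: the FP64 MFMA builtin is only available on gfx90a; on other targets this
+      // branch just keeps the kernel compilable, and the host-side dispatch in
+      // runHipVariantImpl launches the generic mat_fused_mul_add kernel instead, so it
+      // should never be reached at run time.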
+#endif //end __gfx90a__ +#elif defined(RP_USE_FLOAT) +#if defined(__gfx90a__) || defined(__gfx908__) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#else + result = {0}; //should never get here +#endif //end __gfx90a__ or __gfx908__ +#endif //end FLOAT vs DOBULE + + a_idx += 4; // move four columns to the right + b_idx += 4*Ne; // move four rows down + } + + #pragma unroll 4 + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + + i * Ne + + threadIdx.y * 4 * Ne + + ii*(Ne*Ne); + + D[d_idx] = result[i]; + } +} +} +//Reference for cases with no hardware support +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} +} +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDim(Ne, Ne, 1); + hipDeviceProp_t devProp; + hipError_t err = hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if (vid == Base_HIP) { + + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + if(hipArch == "gfx90a") + //Both FP32 and FP64 supported + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + A, B, D, N); + else if(hipArch == "gfx908"){ +#if defined(RP_USE_FLOAT) + //Only FP32 supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + A, B, D, N); +#elif defined(RP_USE_DOUBLE) + //FP64 not supported on MI100 + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + } +#endif + else + //Otherwise no mfma hardware support + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, + A, B, D, N); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == RAJA_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + } // loop over kernel reps + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP From 0bd36f4d1840215d091203034a3906f823c34259 Mon Sep 17 00:00:00 2001 From: CRobeck Date: 
Wed, 4 May 2022 20:49:22 -0500 Subject: [PATCH 23/81] add mat_fused_mul_add cuda variant kernel --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 30 ++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 93503b2d7..657e5eea6 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -34,29 +34,47 @@ namespace basic { deallocCudaDeviceData(B); \ deallocCudaDeviceData(D); -template < Index_type block_size > - __launch_bounds__(block_size) -__global__ void MAT_FUSED_MUL_ADD(Index_type N, Real_ptr A, Real_ptr B, - Real_ptr D) { +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + float sum = 0; + for (int k = 0; k < Ne; ++k) { + sum += A[y*Ne + k] * B[k*Ne + x]; + } + D[y*Ne + x + ii*(Ne*Ne)] = sum; +} } - template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; if (vid == Base_CUDA) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + mat_fused_mul_add<<>>(A, B, D, N); } stopTimer(); From 3c906a0125567299932bade1033d94dfc9a60610 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:57:02 -0500 Subject: [PATCH 24/81] filling in seq MAT_FUSED_MUL_ADD seq variant --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index d1b11a9c1..176ef0676 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -27,6 +27,14 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY + } + } + } + } // number of iterations stopTimer(); @@ -36,9 +44,20 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( #if defined(RUN_RAJA_SEQ) case Lambda_Seq: { + auto mat_fused_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; startTimer(); for (Index_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + mat_fused_lam(ii,row,col); + } + } + } + } // irep stopTimer(); @@ -46,8 +65,19 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } case RAJA_Seq: { + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + startTimer(); 
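      // RAJA_Seq: the nested RAJA::forall<RAJA::loop_exec> calls added below mirror the
      // Base_Seq triple loop; loop_exec executes each level sequentially on the host.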
for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); } // loop over kernel reps stopTimer(); From 464380d89a72dab159a71fd121e29d9917c146b5 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:57:34 -0500 Subject: [PATCH 25/81] cleaning up MAT_FUSED_MUL_ADD macros a bit --- src/basic/MAT_FUSED_MUL_ADD.hpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 552590c1a..583c74a03 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -13,17 +13,17 @@ // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. // for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ -// for(int row = 0; row != Ne; ++row){ -// for(int col = 0; col != Ne; ++col){ -// float dot = 0.0; -// int A_idx = row * Ne + ii*NeNe; -// int B_idx = col + ii*NeNe; -// for(int i = 0; i != Ne; ++i){ -// dot += A[A_idx] * B[B_idx]; +// for(Index_type row = 0; row != Ne; ++row){ +// for(Index_type col = 0; col != Ne; ++col){ +// Real_type dot = 0.0; +// Index_type A_idx = row * Ne; +// Index_type B_idx = col; +// for(Index_type i = 0; i != Ne; ++i){ +// sum += A[A_idx] * B[B_idx]; // ++A_idx; // B_idx += Ne; // } -// D[row * Ne + col] = dot; +// D[row * Ne + col + ii*(Ne*Ne)] = sum; // } // } // return D; @@ -35,12 +35,22 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" - #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ Real_ptr D = m_D; +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type sum = 0.0; \ + Index_type A_idx = row * Ne; \ + Index_type B_idx = col; \ + for(Index_type i = 0; i != Ne; ++i){ \ + sum += A[A_idx] * B[B_idx]; \ + A_idx++; \ + B_idx += Ne; \ + } \ + D[row * Ne + col + ii*(Ne*Ne)] = sum; + namespace rajaperf { class RunParams; From ee3c695807642cb00e4531498ad99acf0c157b3a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 13:58:17 -0500 Subject: [PATCH 26/81] making MAT_FUSED_MUL_ADD cuda and hip variants more consistent --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 14 +++++++------- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 15 ++++++++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 657e5eea6..1d531c12d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,16 +39,16 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; +constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int x = threadIdx.x + blockIdx.x * blockDim.x; - int y = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - float sum = 0; - for (int k = 0; k < Ne; ++k) { - sum += A[y*Ne + k] * B[k*Ne + x]; + Real_type dot = 0; + for (Real_type k = 0; k < Ne; ++k) { + dot += A[row*Ne + k] * B[k*Ne + col]; } - D[y*Ne + x + ii*(Ne*Ne)] = sum; + D[row*Ne + col + ii*(Ne*Ne)] = dot; } } template < size_t block_size > diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index e47e00c97..2a59f4559 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ 
b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -92,14 +92,14 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int x = threadIdx.x + blockIdx.x * blockDim.x; - int y = threadIdx.y + blockIdx.y * blockDim.y; + int col = threadIdx.x + blockIdx.x * blockDim.x; + int row = threadIdx.y + blockIdx.y * blockDim.y; - float sum = 0; + float dot = 0; for (int k = 0; k < Ne; ++k) { - sum += A[y*Ne + k] * B[k*Ne + x]; + dot += A[y*Ne + k] * B[k*Ne + x]; } - D[y*Ne + x + ii*(Ne*Ne)] = sum; + D[row*Ne + col + ii*(Ne*Ne)] = dot; } } template < size_t block_size > @@ -154,6 +154,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + for(int i = 0; i != N; ++i){ + printf("D[%d] = %f\n", i, m_D[i]); + } + + } else if (vid == Lambda_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 24279eb6302647cb6355d6cf032d94cac806649a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 16:34:55 -0500 Subject: [PATCH 27/81] fixing bug in MAT_FUSED_MUL_ADD Cuda kernel --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 1d531c12d..ce386fca9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,13 +39,13 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ -constexpr int Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + int col = threadIdx.x + blockIdx.x * blockDim.x; + int row = threadIdx.y + blockIdx.y * blockDim.y; - Real_type dot = 0; - for (Real_type k = 0; k < Ne; ++k) { + float dot = 0; + for (int k = 0; k < Ne; ++k) { dot += A[row*Ne + k] * B[k*Ne + col]; } D[row*Ne + col + ii*(Ne*Ne)] = dot; From 75c7a07828cca1f3dde13cc33229208d0ac27b3d Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 6 May 2022 16:36:03 -0500 Subject: [PATCH 28/81] making MAT_FUSED_MUL_ADD naming consistent --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2a59f4559..eb7d5d5f9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -97,7 +97,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ float dot = 0; for (int k = 0; k < Ne; ++k) { - dot += A[y*Ne + k] * B[k*Ne + x]; + dot += A[row*Ne + k] * B[k*Ne + col]; } D[row*Ne + col + ii*(Ne*Ne)] = dot; } From 75875b9e44153b3e3ef42e3b8fe0d8226f71684f Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:03:49 -0500 Subject: [PATCH 29/81] updating mat fused body and ading raja lam variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 10 ++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 53 +++++++++++++++++----------- src/basic/MAT_FUSED_MUL_ADD.hpp | 18 ++++------ 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index ce386fca9..4f50efeda 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,14 +41,10 @@ __global__ 
void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int col = threadIdx.x + blockIdx.x * blockDim.x; - int row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - float dot = 0; - for (int k = 0; k < Ne; ++k) { - dot += A[row*Ne + k] * B[k*Ne + col]; - } - D[row*Ne + col + ii*(Ne*Ne)] = dot; + MAT_FUSED_MUL_ADD_BODY; } } template < size_t block_size > diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index eb7d5d5f9..63ddd02c0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -92,20 +92,28 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - int col = threadIdx.x + blockIdx.x * blockDim.x; - int row = threadIdx.y + blockIdx.y * blockDim.y; - - float dot = 0; - for (int k = 0; k < Ne; ++k) { - dot += A[row*Ne + k] * B[k*Ne + col]; - } - D[row*Ne + col + ii*(Ne*Ne)] = dot; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + MAT_FUSED_MUL_ADD_BODY; } } +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(Index_type N, Lambda body) +{ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); +// const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; @@ -133,50 +141,53 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) if(hipArch == "gfx90a") //Both FP32 and FP64 supported hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, N); + A, B, D, iend); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, N); + A, B, D, iend); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, N); + A, B, D, iend); } #endif else //Otherwise no mfma hardware support hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, N); + A, B, D, iend); hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - for(int i = 0; i != N; ++i){ - printf("D[%d] = %f\n", i, m_D[i]); - } - } else if (vid == Lambda_HIP) { - + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + hipLaunchKernelGGL((mat_fused_lam), + dim3(gridDim), dim3(blockDim), 0, 0, + iend, mat_fused_lamda); + hipErrchk( hipGetLastError() ); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == RAJA_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; - 
startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - } // loop over kernel reps + stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 583c74a03..396702d84 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -19,7 +19,7 @@ // Index_type A_idx = row * Ne; // Index_type B_idx = col; // for(Index_type i = 0; i != Ne; ++i){ -// sum += A[A_idx] * B[B_idx]; +// dot += A[A_idx] * B[B_idx]; // ++A_idx; // B_idx += Ne; // } @@ -40,16 +40,12 @@ Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - Real_type sum = 0.0; \ - Index_type A_idx = row * Ne; \ - Index_type B_idx = col; \ - for(Index_type i = 0; i != Ne; ++i){ \ - sum += A[A_idx] * B[B_idx]; \ - A_idx++; \ - B_idx += Ne; \ - } \ - D[row * Ne + col + ii*(Ne*Ne)] = sum; +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k] * B[k*Ne + col]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 2a46e307a829d0466f868a9905d5209042f21913 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:30:16 -0500 Subject: [PATCH 30/81] add data set up to mat_fused_lam --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 63ddd02c0..7eacf2af0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -166,6 +166,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } + } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -183,6 +187,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == RAJA_HIP) { MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From a4328c8390f5dfe3c5372aca4fb631a2b3f55b1a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 15:56:26 -0500 Subject: [PATCH 31/81] moving MAT_FUSED_MUL_ADD array init into macro --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 ++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 +++-------- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 4 +++- src/basic/MAT_FUSED_MUL_ADD.hpp | 6 ++++++ 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 4f50efeda..98aeef388 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -60,11 +60,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; + if (vid == Base_CUDA) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 7eacf2af0..f7567376d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -128,12 +128,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP; + 
MAT_FUSED_MUL_ADD_DATA_INIT; + if (vid == Base_HIP) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); @@ -166,10 +164,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } - } + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index 176ef0676..d142f38b0 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -17,10 +17,12 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_type Ne = m_Ne; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; switch (vid) { case Base_Seq: { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 396702d84..87580cb3b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -35,6 +35,12 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +} + #define MAT_FUSED_MUL_ADD_DATA_SETUP \ Real_ptr A = m_A; \ Real_ptr B = m_B; \ From be2430fc60eb92e64c270a557044265fb68d8703 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Mon, 9 May 2022 16:11:20 -0500 Subject: [PATCH 32/81] fixing missing ; --- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index d142f38b0..842966146 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -32,7 +32,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ - MAT_FUSED_MUL_ADD_BODY + MAT_FUSED_MUL_ADD_BODY; } } } From 1ff6f933dbc00e3209980845bee632df2db203bd Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 09:35:21 -0500 Subject: [PATCH 33/81] finish filling in RAJA_HIP MAT_FUSED_MUL_ADD variant --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index f7567376d..d5cc299e8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -184,10 +184,35 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == RAJA_HIP) { + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; - startTimer(); + startTimer(); + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = + 
RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<1, RAJA::hip_thread_y_direct, + RAJA::statement::For<0, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), + [=] RAJA_DEVICE (int ii, int col, int row) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; From 9dcd3d84290070e2904813065e65440f3d0c4f5a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 09:59:51 -0500 Subject: [PATCH 34/81] finish filling in RAJA_CUDA MAT_FUSED_MUL_ADD variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 46 ++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 98aeef388..4714fde59 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -47,6 +47,17 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ MAT_FUSED_MUL_ADD_BODY; } } +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(Index_type N, Lambda body) +{ + constexpr int Ne = 16; +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { @@ -80,22 +91,53 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + mat_fused_lam + <<>>(N, mat_fused_lamda); } stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; } else if (vid == RAJA_CUDA) { + dim3 gridDim (1, 1, 1); + dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - } // loop over kernel reps + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::loop_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), + [=] RAJA_DEVICE (int ii, int col, int row) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + } else { getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid << std::endl; From 66d9756b84e0af05320ba4d5282f8557a8c23c5a Mon Sep 17 00:00:00 2001 From: CRobeck Date: Tue, 10 May 2022 12:46:22 -0500 Subject: [PATCH 35/81] filling in MAT_FUSED_MUL_ADD OMP variants --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 34 +++++++++ src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 92 +++++++++++++++++++++-- src/basic/MAT_FUSED_MUL_ADD.hpp | 24 ++---- 3 files changed, 125 insertions(+), 25 deletions(-) diff --git 
a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 0651e3ccd..22a83fcc1 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -31,6 +31,15 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + } stopTimer(); @@ -39,9 +48,22 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } case Lambda_OpenMP: { + auto mat_fused_base_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + mat_fused_base_lam(ii, row, col); + } + } + } + + } stopTimer(); @@ -49,8 +71,20 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A } case RAJA_OpenMP: { + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); } // loop over kernel reps stopTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index e6c5933eb..801747766 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -19,18 +19,94 @@ namespace rajaperf { namespace basic { + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; - void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { - const Index_type run_reps = getRunReps(); +#define MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ + allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); - switch (vid) { +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_A, A, N, hid, did); \ + getOpenMPDeviceData(m_B, B, N, hid, did); \ + getOpenMPDeviceData(m_D, D, N, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ + deallocOpenMPDeviceData(D, did); - default: { - getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OpenMPTarget variant id = " << vid << std::endl; - break; - } - } +void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp target is_device_ptr(A, B, D) device( did ) + #pragma omp teams distribute 
parallel for schedule(static, 1) collapse(2) + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + } + + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + + + using EXEC_POL = + RAJA::KernelPolicy< + + RAJA::statement::For<0, RAJA::seq_exec, // ii + RAJA::statement::Collapse, // row, col + RAJA::statement::Lambda<0> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(ii_range, + row_range, + col_range), + [=] (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }); + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OMP Target variant id = " << vid << std::endl; + } +} } } // end namespace basic diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 87580cb3b..52ce76bd2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -12,23 +12,13 @@ // Ouput: // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. -// for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ -// for(Index_type row = 0; row != Ne; ++row){ -// for(Index_type col = 0; col != Ne; ++col){ -// Real_type dot = 0.0; -// Index_type A_idx = row * Ne; -// Index_type B_idx = col; -// for(Index_type i = 0; i != Ne; ++i){ -// dot += A[A_idx] * B[B_idx]; -// ++A_idx; -// B_idx += Ne; -// } -// D[row * Ne + col + ii*(Ne*Ne)] = sum; -// } -// } -// return D; -//} -//} +//for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +// for(Index_type row = 0; row != Ne; ++row){ +// for(Index_type col = 0; col != Ne; ++col){ +// MAT_FUSED_MUL_ADD_BODY; +// } +// } +//} #ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP #define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP From 73d745a749951ee3cb77239807b074b80931a6fc Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 10 May 2022 13:54:06 -0400 Subject: [PATCH 36/81] add MAT_FUSED_MUL_ADD_DATA_INIT --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 22a83fcc1..0611f5640 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -24,6 +24,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; switch (vid) { From 9c63d3bdd3fcdd604ff293edf6446106dc38637e Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Tue, 10 May 2022 13:54:58 -0400 Subject: [PATCH 37/81] add MAT_FUSED_MUL_ADD_DATA_INIT --- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 801747766..313b8e391 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -51,6 +51,8 @@ void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN MAT_FUSED_MUL_ADD_DATA_SETUP; + MAT_FUSED_MUL_ADD_DATA_INIT; + if ( vid == Base_OpenMPTarget ) { 
MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; From ca7b81021411be9e555d421c503746a6bd483bff Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 08:49:11 -0500 Subject: [PATCH 38/81] fixing some unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index d5cc299e8..72baf4f78 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -21,8 +21,6 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ allocAndInitHipDeviceData(A, m_A, N); \ allocAndInitHipDeviceData(B, m_B, N); \ allocAndInitHipDeviceData(D, m_D, N); @@ -52,7 +50,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); - for(int i = 0; i < 4; ++i){ + for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; @@ -90,7 +88,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -101,7 +99,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -122,7 +120,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 blockDimBuiltin(Ne, Ne/4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; - hipError_t err = hipGetDeviceProperties(&devProp, 0); + hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); std::string hipArch = gcnArchName.substr(0, 6); @@ -184,8 +182,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == RAJA_HIP) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; @@ -210,7 +206,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > >; RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (int ii, int col, int row) { + [=] RAJA_DEVICE (Index_type ii, Index_type col, Index_type row) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From 02633eafd4f59347b3aab200d7cf39946aa237f3 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 11:35:47 -0500 Subject: [PATCH 39/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 67 +++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 72baf4f78..81d0a3085 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -33,53 +33,35 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(D); -template < Index_type block_size > -__launch_bounds__(block_size) -__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr 
D, - Index_type N){ -constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - // compute a 16x16x16 matrix multiplication using a single wavefront. -#if defined(RP_USE_DOUBLE) - using double4 = __attribute__((__vector_size__(4 * sizeof(double)))) double; - double4 result = {0}; -#elif defined(RP_USE_FLOAT) - using float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; - float4 result = {0}; -#endif - Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); - Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ + constexpr Index_type Ne = 16; + for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. + using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; + real4 result = {0}; - for(Index_type i = 0; i < 4; ++i){ - Real_type a = A[a_idx]; - Real_type b = B[b_idx]; + int a_idx = Ne * threadIdx.x + threadIdx.y; + int b_idx = threadIdx.x + Ne * threadIdx.y; -#if defined(RP_USE_DOUBLE) + for(int i = 0; i < 4; ++i){ + double a = A[a_idx]; + double b = B[b_idx]; #ifdef __gfx90a__ - result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); -#else - result = {0}; //should never get here. -#endif //end __gfx90a__ +#if defined(RP_USE_DOUBLE) + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); #elif defined(RP_USE_FLOAT) -#if defined(__gfx90a__) || defined(__gfx908__) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); -#else - result = {0}; //should never get here -#endif //end __gfx90a__ or __gfx908__ -#endif //end FLOAT vs DOBULE - - a_idx += 4; // move four columns to the right - b_idx += 4*Ne; // move four rows down +#endif +#endif + a_idx += 4; // move two columns to the right + b_idx += 4*Ne; // move two rows down } #pragma unroll 4 - for(Index_type i = 0; i < 4; ++i){ - const Index_type d_idx = threadIdx.x - + i * Ne - + threadIdx.y * 4 * Ne - + ii*(Ne*Ne); - - D[d_idx] = result[i]; + for(int i = 0; i < 4; ++i){ + const int d_idx = threadIdx.x + + Ne * (threadIdx.y + 4 * i); + D[d_idx + ii*(Ne*Ne)] = result[i]; } } } @@ -117,7 +99,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, Ne/4, 1); + dim3 blockDimBuiltin(16, 4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); @@ -136,12 +118,12 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { if(hipArch == "gfx90a") //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); else if(hipArch == "gfx908"){ #if defined(RP_USE_FLOAT) //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); #elif defined(RP_USE_DOUBLE) //FP64 not supported on MI100 @@ -158,7 +140,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; - } else if (vid == Lambda_HIP) { dim3 gridDim (1, 1, 1); dim3 blockDim(Ne, Ne, 1); From 
f1fb1abdc0a0de3c5436a3d1082091ce38c9ac61 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 13:11:47 -0500 Subject: [PATCH 40/81] fix incorrect order of raja_hip segments --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 81d0a3085..59200a668 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -99,7 +99,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(16, 4, 1); + dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); @@ -167,9 +167,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); + RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::HipKernel< @@ -186,8 +187,8 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (Index_type ii, Index_type col, Index_type row) { + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From faf046fb242d328917f6f05895d0817a1d8d5f8d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 16 May 2022 13:28:42 -0500 Subject: [PATCH 41/81] fix incorrect order of raja_cuda segments --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 4714fde59..cc71c9307 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -129,8 +129,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(ii_range, col_range, row_range), - [=] RAJA_DEVICE (int ii, int col, int row) { + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { MAT_FUSED_MUL_ADD_BODY; }); stopTimer(); From 73bd85e6e7c2161e8cc233ef15c5823e0a866756 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 16 May 2022 16:02:14 -0400 Subject: [PATCH 42/81] update data types --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 59200a668..ed84c9ec8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -40,12 +40,12 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - int a_idx = Ne * threadIdx.x + threadIdx.y; - int b_idx = threadIdx.x + Ne * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y; + Index_type b_idx = threadIdx.x + Ne * threadIdx.y; - for(int i = 0; i < 4; ++i){ - double a = A[a_idx]; - double b = B[b_idx]; + for(Index_type i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; #ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) 
result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); @@ -58,8 +58,8 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re } #pragma unroll 4 - for(int i = 0; i < 4; ++i){ - const int d_idx = threadIdx.x + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); D[d_idx + ii*(Ne*Ne)] = result[i]; } From 4f7dac37a0506fe5b8ac4b3ddd985838371ceeea Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:41:21 -0500 Subject: [PATCH 43/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 52ce76bd2..bef4a8660 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -39,7 +39,7 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ #define MAT_FUSED_MUL_ADD_BODY \ Real_type dot = 0; \ for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k] * B[k*Ne + col]; \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ } \ D[row*Ne + col + ii*(Ne*Ne)] = dot; \ From 8e119d7053b5c6ad26a143c80dc0574413fa188e Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:42:03 -0500 Subject: [PATCH 44/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 59200a668..f53f01cdb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -33,15 +33,15 @@ namespace basic { deallocHipDeviceData(B); \ deallocHipDeviceData(D); -__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. 
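   // Note: real4 below uses the GCC/Clang __vector_size__ attribute to declare a
   // packed vector of four Real_type values, i.e. the per-lane share of the 16x16
   // output tile that the MFMA builtins read and write. The kernel assumes a 16x4
   // thread block (a single 64-lane wavefront), so the 64 lanes together cover one
   // 16x16 tile per iteration.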
using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - int a_idx = Ne * threadIdx.x + threadIdx.y; - int b_idx = threadIdx.x + Ne * threadIdx.y; + int a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + int b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(int i = 0; i < 4; ++i){ double a = A[a_idx]; @@ -52,6 +52,12 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re #elif defined(RP_USE_FLOAT) result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); #endif +#endif + +#ifdef __gfx908__ +#if defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif #endif a_idx += 4; // move two columns to the right b_idx += 4*Ne; // move two rows down @@ -69,17 +75,17 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, - Index_type N){ + const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); + Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); MAT_FUSED_MUL_ADD_BODY; } } template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void mat_fused_lam(Index_type N, Lambda body) +__global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ From e7ef2b421697d5d2e1d14490717a450073f514e1 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 08:51:03 -0500 Subject: [PATCH 45/81] fix one more indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index cfa3d4e4a..889e80359 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -40,8 +40,8 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; - Index_type a_idx = Ne * threadIdx.x + threadIdx.y; - Index_type b_idx = threadIdx.x + Ne * threadIdx.y; + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; From 78e259080db9b50220f153063f6474ce5e1718d8 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 09:36:39 -0500 Subject: [PATCH 46/81] fix indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index cc71c9307..91b041382 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,8 +41,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr int Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + Index_type col = threadIdx.x + blockIdx.x * blockDim.x + 
ii*(Ne*Ne); + Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); MAT_FUSED_MUL_ADD_BODY; } From feadc6482b7aa2e98891cdc2026337568b078f47 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 19 May 2022 14:14:05 -0500 Subject: [PATCH 47/81] adding tunings for builtin version of mfma hip kernel --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 143 +++++++++++++++++++++++----- src/basic/MAT_FUSED_MUL_ADD.hpp | 1 + 2 files changed, 120 insertions(+), 24 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 889e80359..45fd0489f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -46,6 +46,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re for(Index_type i = 0; i < 4; ++i){ Real_type a = A[a_idx]; Real_type b = B[b_idx]; + #ifdef __gfx90a__ #if defined(RP_USE_DOUBLE) result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); @@ -94,6 +95,41 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ body(ii,col,row); } } +void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) +{ + const Index_type run_reps = getRunReps(); +// const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; + + dim3 gridDim (1, 1, 1); + dim3 blockDimBuiltin(Ne, 4, 1); + dim3 blockDim(Ne, Ne, 1); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { @@ -107,10 +143,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) dim3 gridDim (1, 1, 1); dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -122,25 +154,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - if(hipArch == "gfx90a") - //Both FP32 and FP64 supported - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, iend); - else if(hipArch == "gfx908"){ -#if defined(RP_USE_FLOAT) - //Only FP32 supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, - A, B, D, iend); -#elif defined(RP_USE_DOUBLE) - //FP64 not supported on MI100 - hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, iend); - } -#endif - else - //Otherwise no mfma hardware support - hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, - A, B, D, iend); + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, A, B, D, iend); hipErrchk( hipGetLastError() ); } stopTimer(); @@ -206,8 +220,89 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) << std::endl; } } +std::string getArch() +{ + hipDeviceProp_t devProp; + 
hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + return hipArch; +} +bool builtinSupported() +{ + std::string hipArch = getArch(); +#if defined(RP_USE_DOUBLE) + if (hipArch=="gfx90a") + return true; +#endif +#if defined(RP_USE_FLOAT) + if (hipArch=="gfx90a" || hipArch=="gfx908") + return true; +#endif +return false; +} +void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) +{ + bool builtin_supported = builtinSupported(); + + size_t t = 0; + if ( vid == Base_HIP && builtin_supported) { + + if (tune_idx == t) { + + runHipVariantBuiltin(vid); + + } + + t += 1; + } + if ( (vid == Base_HIP) || (vid == RAJA_HIP) || (vid == Lambda_HIP)){ + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0 || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { -RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Hip) + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + } + else { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MAT_FUSED_MUL_ADD::setHipTuningDefinitions(VariantID vid) +{ + bool builtin_supported = builtinSupported(); + if ( vid == Base_HIP ) { + + if (builtin_supported) { + addVariantTuningName(vid, "builtin"); + } + } + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + + }); + +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index bef4a8660..74beb78d6 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -70,6 +70,7 @@ class MAT_FUSED_MUL_ADD : public KernelBase { void runCudaVariantImpl(VariantID vid); template < size_t block_size > void runHipVariantImpl(VariantID vid); + void runHipVariantBuiltin(VariantID vid); private: static const size_t default_gpu_block_size = 256; From 900671ab1916b319f0fd81fa54a06b38410d25eb Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 08:09:45 -0500 Subject: [PATCH 48/81] fix one more indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 91b041382..b60b6eb95 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,7 +39,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); @@ -51,7 +51,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr int Ne = 16; + constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; From 162a8557f92903c154c9647d10969e3a0b7ac8b0 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 
08:13:42 -0500 Subject: [PATCH 49/81] clean up some comments and unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 45fd0489f..2d7e33f05 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -36,7 +36,6 @@ namespace basic { __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - // This kernel computes a 16x16x16 matrix multiplication using a single wavefront. using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; @@ -134,14 +133,12 @@ template < size_t block_size > void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); -// const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; From fbf2820d7a23dad6cd02de31a8a75a9194bcd4da Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:26:23 -0500 Subject: [PATCH 50/81] hopefully fixing final indexing issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 4 ++-- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index b60b6eb95..6478a1aa4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -41,8 +41,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); - Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; } diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 2d7e33f05..6a3506ed2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -78,8 +78,8 @@ __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D const Index_type N){ constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x + ii*(Ne*Ne); - Index_type row = threadIdx.y + blockIdx.y * blockDim.y + ii*(Ne*Ne); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; } } From e1ebcdd691173a3b5a4ce72a0a577379f223f61d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:29:00 -0500 Subject: [PATCH 51/81] removing some unused vars --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 6a3506ed2..10cca500f 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -97,14 +97,12 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) { const Index_type run_reps = 
getRunReps(); -// const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDimBuiltin(Ne, 4, 1); dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; From f02da326c27bd0e057b76d4d3806fb58b6b241e0 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 10:53:10 -0500 Subject: [PATCH 52/81] cleaning up block/grid defs --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 +++++++----- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 12 ++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 6478a1aa4..11eab1b79 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -67,8 +67,12 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); + MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -104,9 +108,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; } else if (vid == RAJA_CUDA) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); - + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; startTimer(); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 10cca500f..fc71cc68b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -103,7 +103,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) constexpr Index_type NeNe = m_Ne * m_Ne; dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); + dim3 blockDimBuiltin(Ne, 4, 1); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -136,9 +136,11 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); - + constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); + dim3 blockDim(tile_size, tile_size); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -156,8 +158,6 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; } else if (vid == Lambda_HIP) { - dim3 gridDim (1, 1, 1); - dim3 blockDim(Ne, Ne, 1); MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; From 59f6766ce1be979eca22a916c3dfcd5d6b75cb3a Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 11:00:09 -0500 Subject: [PATCH 53/81] cleaning up block/grid defs further --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 5 +++-- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 11eab1b79..ad084045b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -67,8 +67,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - constexpr Index_type tile_size = 
gpu_block_size::sqrt(block_size); - dim3 blockDim(tile_size, tile_size); + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(1)); diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index fc71cc68b..1df7b7ba2 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -136,8 +136,9 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) constexpr Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - constexpr Index_type tile_size = gpu_block_size::sqrt(block_size); - dim3 blockDim(tile_size, tile_size); + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), static_cast(1)); From 89a8c007acdee2081acff3524835961d7b2a60e7 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:35:52 -0400 Subject: [PATCH 54/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index ad084045b..a78bc3487 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -39,11 +39,10 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - MAT_FUSED_MUL_ADD_BODY; } } @@ -51,7 +50,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -61,7 +60,6 @@ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ template < size_t block_size > void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { - const Index_type run_reps = getRunReps(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; From 1156a665c3bb565d2487e92fbed9bc7ae03bb3a7 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:38:06 -0400 Subject: [PATCH 55/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 1df7b7ba2..607972654 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -76,7 +76,7 @@ template < Index_type block_size > __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ 
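  // Each thread computes one (row, col) entry of a 16x16 tile; the surrounding ii
  // loop is serial per thread and walks the batch of Ne x Ne matrices, with
  // MAT_FUSED_MUL_ADD_BODY doing the length-Ne dot product for matrix ii.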
Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -87,7 +87,7 @@ template < Index_type block_size, typename Lambda > __launch_bounds__(block_size) __global__ void mat_fused_lam(const Index_type N, Lambda body) { - constexpr Index_type Ne = 16; +constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; From fd9d80a085aaa13fd30db50e57482f954615e453 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:40:39 -0400 Subject: [PATCH 56/81] spacing --- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 313b8e391..4847a05c5 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -25,21 +25,21 @@ namespace basic { const size_t threads_per_team = 256; #define MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ - const Index_type N = m_N; \ - constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ - allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ - allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + constexpr Index_type NeNe = m_Ne * m_Ne; \ + allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_A, A, N, hid, did); \ - getOpenMPDeviceData(m_B, B, N, hid, did); \ - getOpenMPDeviceData(m_D, D, N, hid, did); \ - deallocOpenMPDeviceData(A, did); \ - deallocOpenMPDeviceData(B, did); \ + getOpenMPDeviceData(m_A, A, N, hid, did); \ + getOpenMPDeviceData(m_B, B, N, hid, did); \ + getOpenMPDeviceData(m_D, D, N, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ deallocOpenMPDeviceData(D, did); From c7b35a90b211be6418cb1842579a5fa0eeb42dbe Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 13:42:24 -0400 Subject: [PATCH 57/81] spacing --- src/basic/MAT_FUSED_MUL_ADD.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 74beb78d6..f5bee524d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -25,23 +25,23 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" -#define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } -#define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - 
Real_type dot = 0; \ - for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ - } \ - D[row*Ne + col + ii*(Ne*Ne)] = dot; \ +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 2d591b283ff7d62362e54db81c8057a06fcfe154 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 12:51:43 -0500 Subject: [PATCH 58/81] moving function to unnamed namespace --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 1df7b7ba2..495b6a6a3 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -216,6 +216,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) << std::endl; } } +namespace{ std::string getArch() { hipDeviceProp_t devProp; @@ -237,6 +238,7 @@ bool builtinSupported() #endif return false; } +} void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) { bool builtin_supported = builtinSupported(); From 18622fb1cc4ec750b2f8d3f08fcd331d483b82ae Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:17:18 -0500 Subject: [PATCH 59/81] fixing undefined var --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 0611f5640..6892b3ddb 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -21,6 +21,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type run_reps = getRunReps(); const Index_type N = m_N; const Index_type Ne = m_Ne; + constexpr Index_type NeNe = m_Ne * m_Ne; MAT_FUSED_MUL_ADD_DATA_SETUP; From 23b9277fab0e3975dd33288a419b1f8b267e37bc Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:40:25 -0500 Subject: [PATCH 60/81] fixing omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 6892b3ddb..9cc62bf30 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -19,6 +19,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); const Index_type N = m_N; const Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; @@ -32,9 +33,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A case Base_OpenMP: { startTimer(); + + Index_type ii_end = (N/(Ne*Ne); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != ii_end; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; From 5f46c0e88ffc25dad64b3207b8dbe62c46699811 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 13:40:52 -0500 Subject: [PATCH 61/81] fixing omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 9cc62bf30..b8610f1f4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -34,7 +34,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); - Index_type ii_end = (N/(Ne*Ne); + Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for for(Index_type ii = 0; ii != ii_end; ++ii){ From 8f0e5124e73d75dc7aede2dd0c6c0aeb8292fa1b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 20 May 2022 14:14:15 -0500 Subject: [PATCH 62/81] fixing one more omp looping issue for gcc --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index b8610f1f4..5b501d33c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -58,9 +58,10 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A }; startTimer(); + Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != ii_end; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ mat_fused_base_lam(ii, row, col); From 8f0799651ee4b29cc609477c842110d247f3c657 Mon Sep 17 00:00:00 2001 From: CRobeck Date: Fri, 20 May 2022 14:34:23 -0500 Subject: [PATCH 63/81] omp requires canonical loop forms --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 5b501d33c..5fe03bd90 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -23,7 +23,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type N = m_N; const Index_type Ne = m_Ne; constexpr Index_type NeNe = m_Ne * m_Ne; - + const Index_type ii_end = (N/(Ne*Ne)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -33,13 +33,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A case Base_OpenMP: { startTimer(); - - Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != ii_end; ++ii){ - for(Index_type row = 0; row != Ne; ++row){ - for(Index_type col = 0; col != Ne; ++col){ + for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; } } @@ -58,12 +56,11 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A }; startTimer(); - Index_type ii_end = (N/(Ne*Ne)); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii != ii_end; ++ii){ - for(Index_type row = 0; row != Ne; ++row){ - for(Index_type col = 0; col != Ne; ++col){ + for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ mat_fused_base_lam(ii, row, col); } } From c94586fd856e77dd87091abc6421b218eb845261 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Fri, 20 May 2022 21:09:19 -0400 Subject: [PATCH 64/81] update default problem size 
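The default problem size is the total number of matrix elements in the batch, not a
matrix dimension, so it is set from m_N_default*m_N_default. Since the batch is built
from independent 16x16 matrices, the size that actually runs is expected to be a
multiple of Ne*Ne = 256. A rough sketch of the resulting batch count (assuming the
suite rounds the requested size to that multiple):

    N      = 1000 * 1000;     // requested total elements
    N_Elem = N / (16 * 16);   // roughly 3906 independent 16x16 multiplies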
--- src/basic/MAT_FUSED_MUL_ADD.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp index 957e09bfb..f7e40d3a8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -20,8 +20,8 @@ namespace basic { MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) { - m_N_default = 1024; - setDefaultProblemSize(m_N_default); + m_N_default = 1000; + setDefaultProblemSize(m_N_default*m_N_default); setDefaultReps(5); //Make sure problem target size is divisible by 16*16 From 43e277b838d4eb8e8f465a67188b617d64601ef1 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 6 Jun 2022 10:01:29 -0400 Subject: [PATCH 65/81] update spacing --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 01d5a428e..46cb6b2e3 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -227,14 +227,14 @@ std::string getArch() } bool builtinSupported() { - std::string hipArch = getArch(); + std::string hipArch = getArch(); #if defined(RP_USE_DOUBLE) - if (hipArch=="gfx90a") - return true; + if (hipArch=="gfx90a") + return true; #endif #if defined(RP_USE_FLOAT) - if (hipArch=="gfx90a" || hipArch=="gfx908") - return true; + if (hipArch=="gfx90a" || hipArch=="gfx908") + return true; #endif return false; } @@ -244,7 +244,7 @@ void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) bool builtin_supported = builtinSupported(); size_t t = 0; - if ( vid == Base_HIP && builtin_supported) { + if ( vid == Base_HIP && builtin_supported) { if (tune_idx == t) { @@ -283,14 +283,14 @@ void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) void MAT_FUSED_MUL_ADD::setHipTuningDefinitions(VariantID vid) { - bool builtin_supported = builtinSupported(); - if ( vid == Base_HIP ) { + bool builtin_supported = builtinSupported(); + if ( vid == Base_HIP ) { - if (builtin_supported) { - addVariantTuningName(vid, "builtin"); - } - } - seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + if (builtin_supported) { + addVariantTuningName(vid, "builtin"); + } + } + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { if (run_params.numValidGPUBlockSize() == 0u || run_params.validGPUBlockSize(block_size)) { From 44c36b4e01d3b60190d2481a6e4c834e74af8d8f Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 10:53:01 -0500 Subject: [PATCH 66/81] fix spacing --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index a78bc3487..c739a0b16 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -52,10 +52,10 @@ __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - body(ii,col,row); - } + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } } template < size_t block_size > void 
MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 46cb6b2e3..9dfb25af9 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -89,9 +89,9 @@ __global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ - Index_type col = threadIdx.x + blockIdx.x * blockDim.x; - Index_type row = threadIdx.y + blockIdx.y * blockDim.y; - body(ii,col,row); + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); } } void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) @@ -172,7 +172,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) hipLaunchKernelGGL((mat_fused_lam), dim3(gridDim), dim3(blockDim), 0, 0, iend, mat_fused_lamda); - hipErrchk( hipGetLastError() ); + hipErrchk( hipGetLastError() ); } stopTimer(); @@ -203,10 +203,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) > > >; - RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), - [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { - MAT_FUSED_MUL_ADD_BODY; - }); + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { + MAT_FUSED_MUL_ADD_BODY; + }); stopTimer(); MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; From 8e3fbf68f64ef782eefcdfcd907f42f27750046f Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 13:43:29 -0500 Subject: [PATCH 67/81] formating clean up --- src/basic/MAT_FUSED_MUL_ADD.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index f5bee524d..5cb193822 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -27,13 +27,13 @@ #define MAT_FUSED_MUL_ADD_DATA_INIT \ for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } #define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; #define MAT_FUSED_MUL_ADD_BODY \ From bd2d44c159d272210e61201d8ab0c80c3f93c76d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Mon, 6 Jun 2022 20:10:32 -0500 Subject: [PATCH 68/81] move getHipArch into common header file as a static free function --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 11 ++--------- src/common/HipDataUtils.hpp | 9 ++++++++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 9dfb25af9..17368da20 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -217,17 +217,10 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) } } namespace{ -std::string getArch() -{ - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); - return hipArch; -} + bool builtinSupported() { - std::string hipArch = getArch(); + std::string hipArch = getHipArch(); #if defined(RP_USE_DOUBLE) if (hipArch=="gfx90a") return true; 
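For context on the substr(0, 6) in getHipArch(): on recent ROCm releases the
gcnArchName device property typically carries target feature flags, e.g.
"gfx90a:sramecc+:xnack-", so only the leading six characters ("gfx90a", "gfx908")
are compared when deciding whether the MFMA builtins are available.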
diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index a3871d31e..e9855ae9c 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -22,7 +22,6 @@ #include "RAJA/policy/hip/raja_hiperrchk.hpp" - namespace rajaperf { @@ -178,6 +177,14 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } +static std::string getHipArch() +{ + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 6); + return hipArch; +} } // closing brace for rajaperf namespace From 49084efa42584a9c037ced8efd74f34d7f83c15d Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:12:39 -0500 Subject: [PATCH 69/81] update out loop for raja hip/cuda variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 6 +++--- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index c739a0b16..85fc8ab37 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -117,9 +117,9 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::loop_exec, - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0> diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 17368da20..a4eece3a8 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -190,9 +190,9 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::HipKernel< - RAJA::statement::For<2, RAJA::loop_exec, - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0> From 5169e6e4db3242fa36bbe7092bb84a7a89446f29 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:25:20 -0500 Subject: [PATCH 70/81] update unroll pragma to RAJAPERF_UNROLL --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index a4eece3a8..dbf926933 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -63,7 +63,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re b_idx += 4*Ne; // move two rows down } - #pragma unroll 4 + RAJAPERF_UNROLL(4) for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); From a400d9303028631b04b28eccab1a430909a0d340 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 09:42:07 -0500 Subject: 
[PATCH 71/81] fix RAJAPERF_UNROLL def --- src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index dbf926933..54a332f3b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -63,7 +63,7 @@ __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Re b_idx += 4*Ne; // move two rows down } - RAJAPERF_UNROLL(4) + RAJA_UNROLL_COUNT(4) for(Index_type i = 0; i < 4; ++i){ const Index_type d_idx = threadIdx.x + Ne * (threadIdx.y + 4 * i); From 5bc407fcd4e22b2a5622545436f04164f7528e91 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Tue, 7 Jun 2022 10:15:14 -0500 Subject: [PATCH 72/81] fix type in cuda variant --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 85fc8ab37..0f51f8cb4 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -117,7 +117,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< - RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::For<2, RAJA::cuda_block_z_loop, RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, From 3186df1388f3cfc13fd9967b530a25ae819e683a Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Mon, 20 Jun 2022 16:07:56 -0400 Subject: [PATCH 73/81] Update MAT_FUSED_MUL_ADD.hpp fix spacing --- src/basic/MAT_FUSED_MUL_ADD.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 5cb193822..8d92a2f13 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -25,23 +25,23 @@ #include "RAJA/RAJA.hpp" #include "common/KernelBase.hpp" -#define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ + for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ + for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ } -#define MAT_FUSED_MUL_ADD_DATA_SETUP \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ Real_ptr D = m_D; -#define MAT_FUSED_MUL_ADD_BODY \ - Real_type dot = 0; \ - for (Index_type k = 0; k < Ne; ++k) { \ - dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ - } \ - D[row*Ne + col + ii*(Ne*Ne)] = dot; \ +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ namespace rajaperf { class RunParams; From 9eb3ce4d13c1a1ed087736d9869dfaa7342c6dd4 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 15:05:42 -0500 Subject: [PATCH 74/81] cleaning up some variable naming and create a N_Elem var --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 10 ++++++---- 
src/basic/MAT_FUSED_MUL_ADD-Hip.cpp | 15 +++++++++------ src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 7 +++---- src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp | 6 +++--- src/basic/MAT_FUSED_MUL_ADD-Seq.cpp | 8 ++++---- src/basic/MAT_FUSED_MUL_ADD.hpp | 12 ++++++------ 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 0f51f8cb4..49b79f38d 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -40,7 +40,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_Type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; @@ -51,7 +52,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_Type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; body(ii,col,row); @@ -62,8 +64,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; + const Index_Type N_Elem = N/(Ne*Ne); constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size); @@ -113,7 +115,7 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) startTimer(); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernel< diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp index 54a332f3b..b8a0d286b 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -35,7 +35,8 @@ namespace basic { __global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + const Index_type N_Elem = N/(Ne*Ne); + for(Index_type ii = 0; ii != N_Elem; ++ii){ using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; real4 result = {0}; @@ -77,7 +78,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; MAT_FUSED_MUL_ADD_BODY; @@ -88,7 +90,8 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(const Index_type N, Lambda body) { constexpr Index_type Ne = 16; -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; body(ii,col,row); @@ 
-100,7 +103,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); dim3 gridDim (1, 1, 1); dim3 blockDimBuiltin(Ne, 4, 1); @@ -134,7 +137,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) const Index_type iend = getActualProblemSize(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size); @@ -183,7 +186,7 @@ void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; startTimer(); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 5fe03bd90..868c5aa99 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -22,8 +22,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A const Index_type iend = getActualProblemSize(); const Index_type N = m_N; const Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; - const Index_type ii_end = (N/(Ne*Ne)); + const Index_type N_Elem = (N/(Ne*Ne)); MAT_FUSED_MUL_ADD_DATA_SETUP; MAT_FUSED_MUL_ADD_DATA_INIT; @@ -35,7 +34,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type ii = 0; ii < N_Elem; ++ii){ for(Index_type row = 0; row < Ne; ++row){ for(Index_type col = 0; col < Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -58,7 +57,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for - for(Index_type ii = 0; ii < ii_end; ++ii){ + for(Index_type ii = 0; ii < N_Elem; ++ii){ for(Index_type row = 0; row < Ne; ++row){ for(Index_type col = 0; col < Ne; ++col){ mat_fused_base_lam(ii, row, col); diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp index 4847a05c5..18e7a140e 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -29,7 +29,7 @@ namespace basic { int did = omp_get_default_device(); \ const Index_type N = m_N; \ constexpr Index_type Ne = m_Ne; \ - constexpr Index_type NeNe = m_Ne * m_Ne; \ + const Index_type N_Elem = (N/(Ne*Ne); \ allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); @@ -61,7 +61,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp target is_device_ptr(A, B, D) device( did ) #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -80,7 +80,7 @@ void 
MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UN RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); using EXEC_POL = diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp index 842966146..14f783f66 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -18,7 +18,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( const Index_type run_reps = getRunReps(); const Index_type N = m_N; constexpr Index_type Ne = m_Ne; - constexpr Index_type NeNe = m_Ne * m_Ne; + const Index_type N_Elem = N/(Ne*Ne); MAT_FUSED_MUL_ADD_DATA_SETUP; @@ -29,7 +29,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ MAT_FUSED_MUL_ADD_BODY; @@ -52,7 +52,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( startTimer(); for (Index_type irep = 0; irep < run_reps; ++irep) { - for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ + for(Index_type ii = 0; ii != N_Elem; ++ii){ for(Index_type row = 0; row != Ne; ++row){ for(Index_type col = 0; col != Ne; ++col){ mat_fused_lam(ii,row,col); @@ -69,7 +69,7 @@ void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( case RAJA_Seq: { RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp index 8d92a2f13..01aae2601 100644 --- a/src/basic/MAT_FUSED_MUL_ADD.hpp +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -7,12 +7,12 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Compute D = A x B + C, where // Inputs: -// A: N/(Ne*Ne) Ne x Ne matrices -// B: N/(Ne*Ne) Ne x Ne matrices +// A: N_Elem x (Ne x Ne) matrices +// B: N_Elem x (Ne x Ne) matrices // Ouput: // D: N/(Ne*Ne) Ne x Ne matrices // All square row-major matrices, C is ignored. 
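// Example (illustrative values only): with Ne = 16 and N = 65536 total elements,
// N_Elem = N/(Ne*Ne) = 256 independent 16x16 multiplies, and matrix ii occupies
// the flat row-major index range [ii*Ne*Ne, (ii+1)*Ne*Ne).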
-//for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ +//for(Index_type ii = 0; ii != N_Elem; ++ii){ // for(Index_type row = 0; row != Ne; ++row){ // for(Index_type col = 0; col != Ne; ++col){ // MAT_FUSED_MUL_ADD_BODY; @@ -26,9 +26,9 @@ #include "common/KernelBase.hpp" #define MAT_FUSED_MUL_ADD_DATA_INIT \ -for(Index_type ii = 0; ii != (N/(Ne*Ne)); ++ii){ \ - for(Index_type i = 0; i != NeNe; ++i){ m_A[i+(ii*NeNe)] = i; } \ - for(Index_type i = 0; i != NeNe; ++i){ m_B[i+(ii*NeNe)] = NeNe - 1 - i; } \ +for(Index_type ii = 0; ii != N_Elem; ++ii){ \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_A[i+(ii*Ne*Ne)] = i; } \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_B[i+(ii*Ne*Ne)] = (Ne*Ne) - 1 - i; } \ } #define MAT_FUSED_MUL_ADD_DATA_SETUP \ From f8c1f2c175abe0a398685396b39a8f40f18be220 Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:15:33 -0400 Subject: [PATCH 75/81] add raja_hiperrchk header Co-authored-by: Jason Burmark --- src/common/HipDataUtils.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index e9855ae9c..709e07525 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -22,6 +22,7 @@ #include "RAJA/policy/hip/raja_hiperrchk.hpp" + namespace rajaperf { From 13474345ef47b6fabf9bd6094a0c368e299255ed Mon Sep 17 00:00:00 2001 From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com> Date: Thu, 14 Jul 2022 16:16:37 -0400 Subject: [PATCH 76/81] update lone N_Elem var --- src/basic/MAT_FUSED_MUL_ADD-OMP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp index 868c5aa99..6b417faae 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -76,7 +76,7 @@ void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_A RAJA::RangeSegment row_range(0, Ne); RAJA::RangeSegment col_range(0, Ne); - RAJA::RangeSegment ii_range(0, (N/(Ne*Ne))); + RAJA::RangeSegment ii_range(0, N_Elem); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { From cde5c950653113a479e3bcbe07f09526eb853d43 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 15:31:07 -0500 Subject: [PATCH 77/81] update lambda forall --- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/common/HipDataUtils.hpp | 4 ++-- src/stream/ADD-Hip.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 782e4099c..3ddd555b8 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -122,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index 25810c19e..e3793b18c 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ 
-82,7 +82,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index a1e7a6465..65f258d8d 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -81,7 +81,7 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index 6ded209a9..cf4b7d8e9 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index af3276a7d..bc9030d13 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -90,7 +90,7 @@ void INIT3::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index 6f9d41924..b9f28795b 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -80,7 +80,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index ae98f56ab..2a0e06681 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -81,7 +81,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index cb9076b38..b60dcf4b1 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -90,7 +90,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 605696676..1ae18f3d9 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -87,7 +87,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) 
}; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index e9855ae9c..47a255c2a 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -29,7 +29,7 @@ namespace rajaperf * \brief Simple forall hip kernel that runs a lambda. */ template < typename Lambda > -__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; if (i < iend) { @@ -39,7 +39,7 @@ __global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda bod /// template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 5e53500c8..838893318 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -82,7 +82,7 @@ void ADD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index fe302a7fc..9dd9fdeed 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -80,7 +80,7 @@ void COPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 3e5e3f9f0..4894a3a83 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -80,7 +80,7 @@ void MUL::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, mul_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index 740727530..c8a89308c 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -82,7 +82,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall), + hipLaunchKernelGGL((lambda_hip_forall_1D), grid_size, block_size, 0, 0, ibegin, iend, triad_lambda); hipErrchk( hipGetLastError() ); From 5b9312325bc2d14672e743ccf481e1c3ed08ba95 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 22:23:30 -0500 Subject: [PATCH 78/81] roll back lambda_hip_forall naming --- src/apps/DEL_DOT_VEC_2D-Hip.cpp | 2 +- src/basic/DAXPY-Hip.cpp | 2 +- src/basic/DAXPY_ATOMIC-Hip.cpp | 2 +- src/basic/IF_QUAD-Hip.cpp | 2 +- src/basic/INIT3-Hip.cpp | 2 +- src/basic/INIT_VIEW1D-Hip.cpp | 2 +- src/basic/INIT_VIEW1D_OFFSET-Hip.cpp | 2 +- 
src/basic/MULADDSUB-Hip.cpp | 2 +- src/basic/PI_ATOMIC-Hip.cpp | 2 +- src/common/HipDataUtils.hpp | 4 ++-- src/stream/ADD-Hip.cpp | 2 +- src/stream/COPY-Hip.cpp | 2 +- src/stream/MUL-Hip.cpp | 2 +- src/stream/TRIAD-Hip.cpp | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/apps/DEL_DOT_VEC_2D-Hip.cpp b/src/apps/DEL_DOT_VEC_2D-Hip.cpp index 3ddd555b8..782e4099c 100644 --- a/src/apps/DEL_DOT_VEC_2D-Hip.cpp +++ b/src/apps/DEL_DOT_VEC_2D-Hip.cpp @@ -122,7 +122,7 @@ void DEL_DOT_VEC_2D::runHipVariantImpl(VariantID vid) const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, 0, iend, deldotvec2d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY-Hip.cpp b/src/basic/DAXPY-Hip.cpp index e3793b18c..25810c19e 100644 --- a/src/basic/DAXPY-Hip.cpp +++ b/src/basic/DAXPY-Hip.cpp @@ -82,7 +82,7 @@ void DAXPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/DAXPY_ATOMIC-Hip.cpp b/src/basic/DAXPY_ATOMIC-Hip.cpp index 65f258d8d..a1e7a6465 100644 --- a/src/basic/DAXPY_ATOMIC-Hip.cpp +++ b/src/basic/DAXPY_ATOMIC-Hip.cpp @@ -81,7 +81,7 @@ void DAXPY_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, daxpy_atomic_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/IF_QUAD-Hip.cpp b/src/basic/IF_QUAD-Hip.cpp index cf4b7d8e9..6ded209a9 100644 --- a/src/basic/IF_QUAD-Hip.cpp +++ b/src/basic/IF_QUAD-Hip.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, ifquad_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT3-Hip.cpp b/src/basic/INIT3-Hip.cpp index bc9030d13..af3276a7d 100644 --- a/src/basic/INIT3-Hip.cpp +++ b/src/basic/INIT3-Hip.cpp @@ -90,7 +90,7 @@ void INIT3::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, init3_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D-Hip.cpp b/src/basic/INIT_VIEW1D-Hip.cpp index b9f28795b..6f9d41924 100644 --- a/src/basic/INIT_VIEW1D-Hip.cpp +++ b/src/basic/INIT_VIEW1D-Hip.cpp @@ -80,7 +80,7 @@ void INIT_VIEW1D::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp index 2a0e06681..ae98f56ab 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-Hip.cpp @@ -81,7 +81,7 @@ void INIT_VIEW1D_OFFSET::runHipVariantImpl(VariantID vid) }; const size_t grid_size = 
RAJA_DIVIDE_CEILING_INT(iend-ibegin, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, initview1d_offset_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/basic/MULADDSUB-Hip.cpp b/src/basic/MULADDSUB-Hip.cpp index b60dcf4b1..cb9076b38 100644 --- a/src/basic/MULADDSUB-Hip.cpp +++ b/src/basic/MULADDSUB-Hip.cpp @@ -90,7 +90,7 @@ void MULADDSUB::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, muladdsub_lambda ); hipErrchk( hipGetLastError() ); diff --git a/src/basic/PI_ATOMIC-Hip.cpp b/src/basic/PI_ATOMIC-Hip.cpp index 1ae18f3d9..605696676 100644 --- a/src/basic/PI_ATOMIC-Hip.cpp +++ b/src/basic/PI_ATOMIC-Hip.cpp @@ -87,7 +87,7 @@ void PI_ATOMIC::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, atomic_pi_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 459b4016e..709e07525 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -30,7 +30,7 @@ namespace rajaperf * \brief Simple forall hip kernel that runs a lambda. */ template < typename Lambda > -__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * blockDim.x + threadIdx.x; if (i < iend) { @@ -40,7 +40,7 @@ __global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda /// template < size_t block_size, typename Lambda > __launch_bounds__(block_size) -__global__ void lambda_hip_forall_1D(Index_type ibegin, Index_type iend, Lambda body) +__global__ void lambda_hip_forall(Index_type ibegin, Index_type iend, Lambda body) { Index_type i = ibegin + blockIdx.x * block_size + threadIdx.x; if (i < iend) { diff --git a/src/stream/ADD-Hip.cpp b/src/stream/ADD-Hip.cpp index 838893318..5e53500c8 100644 --- a/src/stream/ADD-Hip.cpp +++ b/src/stream/ADD-Hip.cpp @@ -82,7 +82,7 @@ void ADD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, add_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/COPY-Hip.cpp b/src/stream/COPY-Hip.cpp index 9dd9fdeed..fe302a7fc 100644 --- a/src/stream/COPY-Hip.cpp +++ b/src/stream/COPY-Hip.cpp @@ -80,7 +80,7 @@ void COPY::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, copy_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/MUL-Hip.cpp b/src/stream/MUL-Hip.cpp index 4894a3a83..3e5e3f9f0 100644 --- a/src/stream/MUL-Hip.cpp +++ b/src/stream/MUL-Hip.cpp @@ -80,7 +80,7 @@ void MUL::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, 
mul_lambda); hipErrchk( hipGetLastError() ); diff --git a/src/stream/TRIAD-Hip.cpp b/src/stream/TRIAD-Hip.cpp index c8a89308c..740727530 100644 --- a/src/stream/TRIAD-Hip.cpp +++ b/src/stream/TRIAD-Hip.cpp @@ -82,7 +82,7 @@ void TRIAD::runHipVariantImpl(VariantID vid) }; const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((lambda_hip_forall_1D), + hipLaunchKernelGGL((lambda_hip_forall), grid_size, block_size, 0, 0, ibegin, iend, triad_lambda); hipErrchk( hipGetLastError() ); From 39fbf158ccbcebe39a0f8d3ea4b74a7a1aea4128 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 22:26:17 -0500 Subject: [PATCH 79/81] get rid of an unused static function warning --- src/common/HipDataUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 709e07525..742f0c54b 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -178,7 +178,7 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } -static std::string getHipArch() +static inline std::string getHipArch() { hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); From 831e61cdf07142f27cbf0030c6fcef316c946457 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Thu, 14 Jul 2022 23:00:43 -0500 Subject: [PATCH 80/81] make hipArch more consistent with device naming --- src/common/HipDataUtils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index 742f0c54b..782fd7a1f 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -183,7 +183,8 @@ static inline std::string getHipArch() hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); std::string gcnArchName(devProp.gcnArchName); - std::string hipArch = gcnArchName.substr(0, 6); + std::string hipArch = gcnArchName.substr(0, 7); + if(hipArch.back() == ':' ) hipArch.pop_back(); return hipArch; } From 4e6a7b33b5834479c06760638edc0ed3a729039b Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Fri, 15 Jul 2022 14:03:01 -0500 Subject: [PATCH 81/81] fixing var naming issue --- src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp index 49b79f38d..87966915c 100644 --- a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -21,10 +21,10 @@ namespace basic { #define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ const Index_type N = m_N; \ - const Index_type Ne = m_Ne; \ allocAndInitCudaDeviceData(A, m_A, N); \ allocAndInitCudaDeviceData(B, m_B, N); \ - allocAndInitCudaDeviceData(D, m_D, N); + allocAndInitCudaDeviceData(D, m_D, N); + #define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ getCudaDeviceData(m_A, A, N); \ @@ -40,7 +40,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, Index_type N){ constexpr Index_type Ne = 16; -const Index_Type N_Elem = N/(Ne*Ne); +const Index_type N_Elem = N/(Ne*Ne); for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x + blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -52,7 +52,7 @@ __launch_bounds__(block_size) __global__ void mat_fused_lam(Index_type N, Lambda body) { constexpr Index_type Ne = 16; -const Index_Type N_Elem = N/(Ne*Ne); +const Index_type N_Elem = N/(Ne*Ne); for(Index_type ii = 0; ii != N_Elem; ++ii){ Index_type col = threadIdx.x +
blockIdx.x * blockDim.x; Index_type row = threadIdx.y + blockIdx.y * blockDim.y; @@ -64,8 +64,8 @@ void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type N = m_N; - const Index_Type N_Elem = N/(Ne*Ne); - constexpr Index_type Ne = m_Ne; + constexpr Index_type Ne = 16; + const Index_type N_Elem = N/(Ne*Ne); constexpr Index_type block_x = gpu_block_size::sqrt(block_size); constexpr Index_type block_y = gpu_block_size::sqrt(block_size);
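
Note on the MAT_FUSED_MUL_ADD loop structure touched by the patches above: every variant (Seq, OpenMP, OpenMP target, HIP, CUDA) walks N_Elem = N/(Ne*Ne) independent Ne x Ne row-major matrices and produces one entry of D per (row, col) pair. The MAT_FUSED_MUL_ADD_BODY macro itself is not shown in these hunks, so the host-side sketch below only assumes a plain row-major product, following the "Compute D = A x B + C ... C is ignored" description in MAT_FUSED_MUL_ADD.hpp; the Index_type/Real_type aliases are local stand-ins for the suite's typedefs and mat_fused_mul_add_ref is a hypothetical helper name, not part of the suite.

#include <cstddef>

// Local stand-ins for RAJAPerf's Index_type / Real_type typedefs (assumption).
using Index_type = std::ptrdiff_t;
using Real_type  = double;

// Host reference for the batched multiply: D = A x B for each Ne x Ne block,
// mirroring the ii/row/col loop nest used by the Seq and OpenMP variants.
void mat_fused_mul_add_ref(const Real_type* A, const Real_type* B, Real_type* D,
                           Index_type N, Index_type Ne)
{
  const Index_type N_Elem = N / (Ne * Ne);        // number of independent Ne x Ne matrices
  for (Index_type ii = 0; ii != N_Elem; ++ii) {
    const Real_type* a = A + ii * Ne * Ne;        // row-major Ne x Ne block of A
    const Real_type* b = B + ii * Ne * Ne;        // row-major Ne x Ne block of B
    Real_type*       d = D + ii * Ne * Ne;
    for (Index_type row = 0; row != Ne; ++row) {
      for (Index_type col = 0; col != Ne; ++col) {
        Real_type dot = 0.0;
        for (Index_type k = 0; k != Ne; ++k) {
          dot += a[row * Ne + k] * b[k * Ne + col];  // accumulate the (row, col) entry
        }
        d[row * Ne + col] = dot;                     // the C term is ignored, per the header comment
      }
    }
  }
}

In the CUDA/HIP variants shown above, the (row, col) pair instead comes from threadIdx/blockIdx, so only the ii loop and the per-entry accumulation remain inside the kernel.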
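
Note on the getHipArch() change in PATCH 80: devProp.gcnArchName on recent ROCm carries the full target string (architecture name plus feature flags), so taking seven characters and stripping a trailing ':' yields the bare gfx name for both 6- and 7-character architectures. The sketch below lifts that trimming logic into a standalone helper so it can be exercised without a HIP device; the input strings are illustrative examples, not output captured from a particular GPU.

#include <iostream>
#include <string>

// Same substr(0, 7) + trailing-colon trim as getHipArch() in HipDataUtils.hpp.
static std::string trimGcnArchName(const std::string& gcnArchName)
{
  std::string hipArch = gcnArchName.substr(0, 7);  // e.g. "gfx90a:" or "gfx908:"
  if (hipArch.back() == ':') hipArch.pop_back();   // drop the feature-flag separator
  return hipArch;
}

int main()
{
  std::cout << trimGcnArchName("gfx90a:sramecc+:xnack-") << "\n";  // prints gfx90a
  std::cout << trimGcnArchName("gfx908:sramecc+:xnack-") << "\n";  // prints gfx908
  return 0;
}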