From 3debb2970d1222f562ddec1fa992bc6ad2a26c92 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Fri, 22 Nov 2024 13:20:16 -0600
Subject: [PATCH] clean up some casting in tests

---
 tests/contract_ft_test.cpp                    |  11 +-
 tests/gauge_alg_test.cpp                      |   2 +-
 tests/host_reference/contract_ft_reference.h  |   8 +-
 .../host_reference/gauge_force_reference.cpp  |  43 +++---
 tests/host_reference/hisq_force_reference.cpp | 139 ++++++++----------
 .../staggered_dslash_reference.cpp            |   8 +-
 tests/laph_test.cpp                           |   4 +-
 7 files changed, 102 insertions(+), 113 deletions(-)
diff --git a/tests/contract_ft_test.cpp b/tests/contract_ft_test.cpp
index cc793ede5b..740adf6cc9 100644
--- a/tests/contract_ft_test.cpp
+++ b/tests/contract_ft_test.cpp
@@ -171,9 +171,9 @@ inline int launch_contract_test(const QudaContractType cType, const std::array<i
 
   fill_buffers<Float, 2>(buffs, X, dof);
 
-  for (int s = 0; s < nprops; ++s, off += spinor_field_floats * sizeof(Float)) {
-    spinorX[s] = (void *)((uintptr_t)buffs[0].data() + off);
-    spinorY[s] = (void *)((uintptr_t)buffs[1].data() + off);
+  for (int s = 0; s < nprops; ++s, off += spinor_field_floats) {
+    spinorX[s] = static_cast<void *>(buffs[0].data() + off);
+    spinorY[s] = static_cast<void *>(buffs[1].data() + off);
   }
   // Perform GPU contraction:
   void *d_result_ = static_cast<void *>(d_result.data());
@@ -181,9 +181,8 @@ inline int launch_contract_test(const QudaContractType cType, const std::array<i
   contractFTQuda(spinorX.data(), spinorY.data(), &d_result_, cType, (void *)(&cs_param), src_colors, X.data(),
                  source_position.data(), n_mom, mom.data(), fft_type.data());
   // Check results:
-  int faults
-    = contractionFT_reference<Float>((Float **)spinorX.data(), (Float **)spinorY.data(), d_result.data(), cType,
-                                     src_colors, X.data(), source_position.data(), n_mom, mom.data(), fft_type.data());
+  int faults = contractionFT_reference<Float>(spinorX.data(), spinorY.data(), d_result.data(), cType, src_colors,
+                                              X.data(), source_position.data(), n_mom, mom.data(), fft_type.data());
 
   return faults;
 }
diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp
index c5197af83c..2994904e29 100644
--- a/tests/gauge_alg_test.cpp
+++ b/tests/gauge_alg_test.cpp
@@ -79,7 +79,7 @@ struct GaugeAlgTest : public ::testing::TestWithParam<test_t> {
 #ifndef QUDA_BUILD_NATIVE_FFT // skip FFT tests if FFT not available
     const ::testing::TestInfo *const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
     const char *name = test_info->name();
-    if (strcmp(name, "Landau_FFT") == 0 || strcmp(name, "Coulomb_FFT") == 0) {
+    if (strncmp(name, "Landau_FFT", 10) == 0 || strncmp(name, "Coulomb_FFT", 11) == 0) {
       execute = false;
       GTEST_SKIP();
     }
diff --git a/tests/host_reference/contract_ft_reference.h b/tests/host_reference/contract_ft_reference.h
index 697ed550e2..e811d4ec99 100644
--- a/tests/host_reference/contract_ft_reference.h
+++ b/tests/host_reference/contract_ft_reference.h
@@ -58,7 +58,7 @@ template <typename Float> inline void FourierPhase(Float z[2], const Float theta
 };
 
 template <typename Float>
-void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2, double *h_result,
+void contractFTHost(void **h_prop_array_flavor_1, void **h_prop_array_flavor_2, double *h_result,
                     const QudaContractType cType, const int src_colors, const int *X, const int *const source_position,
                     const int n_mom, const int *const mom_modes, const QudaFFTSymmType *const fft_type)
 {
@@ -126,8 +126,8 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2
         for (int c1 = 0; c1 < src_colors; c1++) {
           // color contraction
           size_t off = nSpin * 3 * 2 * (Vh * parity + cb_idx);
-          contractColors<Float>(h_prop_array_flavor_1[s1 * src_colors + c1] + off,
-                                h_prop_array_flavor_2[s2 * src_colors + c1] + off, nSpin, M.data());
+          contractColors<Float>(static_cast<Float *>(h_prop_array_flavor_1[s1 * src_colors + c1]) + off,
+                                static_cast<Float *>(h_prop_array_flavor_2[s2 * src_colors + c1]) + off, nSpin, M.data());
 
           // apply gamma matrices here
 
@@ -158,7 +158,7 @@ void contractFTHost(Float **h_prop_array_flavor_1, Float **h_prop_array_flavor_2
 };
 
 template <typename Float>
-int contractionFT_reference(Float **spinorX, Float **spinorY, const double *const d_result, const QudaContractType cType,
+int contractionFT_reference(void **spinorX, void **spinorY, const double *const d_result, const QudaContractType cType,
                             const int src_colors, const int *X, const int *const source_position, const int n_mom,
                             const int *const mom_modes, const QudaFFTSymmType *const fft_type)
 {
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index ce285a6b99..741efdf0b7 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -426,7 +426,7 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_
 /* This function only computes one direction @dir
  *
  */
-void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex,
+void gauge_force_reference_dir(void *refMom, int dir, double eb3, quda::GaugeField &u, quda::GaugeField &u_ex,
                                QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths,
                                const lattice_t &lat, bool compute_force)
 {
@@ -437,26 +437,30 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s
   for (int i = 0; i < num_paths; i++) {
     if (prec == QUDA_DOUBLE_PRECISION) {
       double *my_loop_coeff = (double *)loop_coeff;
-      compute_path_product((dsu3_matrix *)staple, (dsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i],
-                           dir, lat);
+      compute_path_product((dsu3_matrix *)staple, u_ex.data_array<dsu3_matrix *>().data, path_dir[i], length[i],
+                           my_loop_coeff[i], dir, lat);
     } else {
       float *my_loop_coeff = (float *)loop_coeff;
-      compute_path_product((fsu3_matrix *)staple, (fsu3_matrix **)sitelink_ex, path_dir[i], length[i], my_loop_coeff[i],
-                           dir, lat);
+      compute_path_product((fsu3_matrix *)staple, u_ex.data_array<fsu3_matrix *>().data, path_dir[i], length[i],
+                           my_loop_coeff[i], dir, lat);
     }
   }
 
   if (compute_force) {
     if (prec == QUDA_DOUBLE_PRECISION) {
-      update_mom((danti_hermitmat *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat);
+      update_mom((danti_hermitmat *)refMom, dir, u.data_array<dsu3_matrix *>().data, (dsu3_matrix *)staple, (double)eb3,
+                 lat);
     } else {
-      update_mom((fanti_hermitmat *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat);
+      update_mom((fanti_hermitmat *)refMom, dir, u.data_array<fsu3_matrix *>().data, (fsu3_matrix *)staple, (float)eb3,
+                 lat);
     }
   } else {
     if (prec == QUDA_DOUBLE_PRECISION) {
-      update_gauge((dsu3_matrix *)refMom, dir, (dsu3_matrix **)sitelink, (dsu3_matrix *)staple, (double)eb3, lat);
+      update_gauge((dsu3_matrix *)refMom, dir, u.data_array<dsu3_matrix *>().data, (dsu3_matrix *)staple, (double)eb3,
+                   lat);
     } else {
-      update_gauge((fsu3_matrix *)refMom, dir, (fsu3_matrix **)sitelink, (fsu3_matrix *)staple, (float)eb3, lat);
+      update_gauge((fsu3_matrix *)refMom, dir, u.data_array<fsu3_matrix *>().data, (fsu3_matrix *)staple, (float)eb3,
+                   lat);
     }
   }
   host_free(staple);
@@ -465,8 +469,6 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s
 void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length,
                            void *loop_coeff, int num_paths, bool compute_force)
 {
-  void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
-
   // created extended field
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
@@ -475,13 +477,12 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
   param.gauge_order = QUDA_QDP_GAUGE_ORDER;
   param.t_boundary = QUDA_PERIODIC_T;
 
-  auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
+  auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R);
   lattice_t lat(*qdp_ex);
 
-  void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
   for (int dir = 0; dir < 4; dir++) {
-    gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length, loop_coeff,
-                              num_paths, lat, compute_force);
+    gauge_force_reference_dir(refMom, dir, eb3, u, *qdp_ex, u.Precision(), path_dir[dir], length, loop_coeff, num_paths,
+                              lat, compute_force);
   }
 
   delete qdp_ex;
@@ -490,8 +491,6 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
 void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces, double factor,
                                 int **input_path, int *length, double *path_coeff, int num_paths)
 {
-  void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
-
   // create extended field
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
@@ -499,20 +498,20 @@ void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex>
   setGaugeParam(param);
   param.gauge_order = QUDA_QDP_GAUGE_ORDER;
   param.t_boundary = QUDA_PERIODIC_T;
-
-  auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
+  auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R);
   lattice_t lat(*qdp_ex);
-  void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
 
   std::vector<double> loop_tr_dbl(2 * num_paths);
 
   for (int i = 0; i < num_paths; i++) {
     if (u.Precision() == QUDA_DOUBLE_PRECISION) {
-      dcomplex tr = compute_loop_trace((dsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat);
+      dcomplex tr
+        = compute_loop_trace(qdp_ex->data_array<dsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
       loop_tr_dbl[2 * i] = factor * tr.real;
       loop_tr_dbl[2 * i + 1] = factor * tr.imag;
     } else {
-      dcomplex tr = compute_loop_trace((fsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat);
+      dcomplex tr
+        = compute_loop_trace(qdp_ex->data_array<fsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
       loop_tr_dbl[2 * i] = factor * tr.real;
       loop_tr_dbl[2 * i + 1] = factor * tr.imag;
     }
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 1bef7b74bb..33d773ed95 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -117,9 +117,9 @@ void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, siz
 void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops)
 {
   if (precision == QUDA_SINGLE_PRECISION) {
-    computeLinkOrderedOuterProduct<fsu3_matrix>((fsu3_vector *)src, dst, nhops);
+    computeLinkOrderedOuterProduct<fsu3_matrix>(static_cast<fsu3_vector *>(src), dst, nhops);
   } else {
-    computeLinkOrderedOuterProduct<dsu3_matrix>((dsu3_vector *)src, dst, nhops);
+    computeLinkOrderedOuterProduct<dsu3_matrix>(static_cast<dsu3_vector *>(src), dst, nhops);
   }
 }
 
@@ -342,7 +342,7 @@ template <class Real> class LoadStore
   void loadMatrixFromField(const Real *const field, int oddBit, int half_lattice_index,
                            Matrix<3, std::complex<Real>> *const mat) const;
 
-  void loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index,
+  void loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index,
                            Matrix<3, std::complex<Real>> *const mat) const;
 
   void storeMatrixToField(const Matrix<3, std::complex<Real>> &mat, int oddBit, int half_lattice_index,
@@ -352,12 +352,12 @@ template <class Real> class LoadStore
                         Real *const) const;
 
   void addMatrixToField(const Matrix<3, std::complex<Real>> &mat, int oddBit, int dir, int half_lattice_index,
-                        Real coeff, Real *const) const;
+                        Real coeff, Real *const *const) const;
 
   void storeMatrixToMomentumField(const Matrix<3, std::complex<Real>> &mat, int oddBit, int dir, int half_lattice_index,
                                   Real coeff, Real *const) const;
-  Real getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const;
-  void addData(Real *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
+  Real getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const;
+  void addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real, int hfv) const;
   int half_idx_conversion_ex2normal(int half_lattice_index, const int *dim, int oddBit) const;
   int half_idx_conversion_normal2ex(int half_lattice_index, const int *dim, int oddBit) const;
 };
@@ -423,16 +423,16 @@ int LoadStore<Real>::half_idx_conversion_normal2ex(int half_lattice_index, const
 }
 
 template <class Real>
-Real LoadStore<Real>::getData(const Real *const field, int idx, int dir, int oddBit, int offset, int hfv) const
+Real LoadStore<Real>::getData(const Real *const *const field, int idx, int dir, int oddBit, int offset, int hfv) const
 {
   // QDP format
-  return ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset];
+  return field[dir][(hfv * oddBit + idx) * 18 + offset];
 }
 template <class Real>
-void LoadStore<Real>::addData(Real *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const
+void LoadStore<Real>::addData(Real *const *const field, int idx, int dir, int oddBit, int offset, Real v, int hfv) const
 {
   // QDP format
-  ((Real **)field)[dir][(hfv * oddBit + idx) * 18 + offset] += v;
+  field[dir][(hfv * oddBit + idx) * 18 + offset] += v;
 }
 
 template <class Real>
@@ -455,7 +455,7 @@ void LoadStore<Real>::loadMatrixFromField(const Real *const field, int oddBit, i
 }
 
 template <class Real>
-void LoadStore<Real>::loadMatrixFromField(const Real *const field, int oddBit, int dir, int half_lattice_index,
+void LoadStore<Real>::loadMatrixFromField(const Real *const *const field, int oddBit, int dir, int half_lattice_index,
                                           Matrix<3, std::complex<Real>> *const mat) const
 {
 #ifdef MULTI_GPU
@@ -464,11 +464,10 @@ void LoadStore<Real>::loadMatrixFromField(const Real *const field, int oddBit, i
   int hfv = Vh;
 #endif
 
-  // const Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
   int offset = 0;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 3; ++j) {
-      (*mat)(i, j) = (getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
+      (*mat)(i, j) = getData(field, half_lattice_index, dir, oddBit, offset++, hfv);
       (*mat)(i, j) += std::complex<Real>(0, getData(field, half_lattice_index, dir, oddBit, offset++, hfv));
     }
   }
@@ -515,23 +514,18 @@ void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real>> &mat,
 
 template <class Real>
 void LoadStore<Real>::addMatrixToField(const Matrix<3, std::complex<Real>> &mat, int oddBit, int dir,
-                                       int half_lattice_index, Real coeff, Real *const field) const
+                                       int half_lattice_index, Real coeff, Real *const *const field) const
 {
-
 #ifdef MULTI_GPU
   int hfv = Vh_ex;
 #else
   int hfv = Vh;
 #endif
 
-  // Real* const local_field = field + ((oddBit*half_volume + half_lattice_index)*4 + dir)*18;
   int offset = 0;
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 3; ++j) {
-      // local_field[offset++] += coeff*mat(i,j).real();
       addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).real(), hfv);
-
-      // local_field[offset++] += coeff*mat(i,j).imag();
       addData(field, half_lattice_index, dir, oddBit, offset++, coeff * mat(i, j).imag(), hfv);
     }
   }
@@ -762,7 +756,8 @@ void computeOneLinkSite(
 #else
   const int[],
 #endif
-  int half_lattice_index, const Real *const oprod, int sig, Real coeff, const LoadStore<Real> &ls, Real *const output)
+  int half_lattice_index, const Real *const *const oprod, int sig, Real coeff, const LoadStore<Real> &ls,
+  Real *const *const output)
 {
   if (GOES_FORWARDS(sig)) {
     typename ColorMatrix<Real>::Type colorMatW;
@@ -777,7 +772,7 @@ void computeOneLinkSite(
 }
 
 template <class Real>
-void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Real coeff, Real *const output)
+void computeOneLinkField(const int dim[4], const Real *const *const oprod, int sig, Real coeff, Real *const *const output)
 {
   int volume = 1;
   for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
@@ -795,10 +790,10 @@ void computeOneLinkField(const int dim[4], const Real *const oprod, int sig, Rea
 // middleLinkKernel compiles for now, but lots of debugging to be done
 template <class Real, int oddBit>
 void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
-                           const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link,
+                           const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link,
                            int sig, int mu, Real coeff,
                            const LoadStore<Real> &ls, // pass a function object to read from and write to matrix fields
-                           Real *const Pmu, Real *const P3, Real *const Qmu, Real *const newOprod)
+                           Real *const Pmu, Real *const P3, Real *const Qmu, Real *const *const newOprod)
 {
   const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
   const bool sig_positive = (GOES_FORWARDS(sig)) ? true : false;
@@ -842,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett
 
   if (Qprev == NULL) {
     if (sig_positive) {
-      ls.loadMatrixFromField(oprod, 1 - oddBit, sig, point_d, &colorMatY);
+      ls.loadMatrixFromField(static_cast<const Real *const *const>(oprod), 1 - oddBit, sig, point_d, &colorMatY);
     } else {
-      ls.loadMatrixFromField(oprod, oddBit, OPP_DIR(sig), point_c, &colorMatY);
+      ls.loadMatrixFromField(static_cast<const Real *const *const>(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY);
       colorMatY = conj(colorMatY);
     }
   } else { // Qprev != NULL
-    ls.loadMatrixFromField(oprod, oddBit, point_c, &colorMatY);
+    ls.loadMatrixFromField(static_cast<const Real *const>(oprod), oddBit, point_c, &colorMatY);
   }
 
   colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY;
@@ -880,9 +875,9 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett
 } // computeMiddleLinkSite
 
 template <class Real>
-void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link,
+void computeMiddleLinkField(const int dim[4], void *const oprod, const Real *const Qprev, const Real *const *const link,
                             int sig, int mu, Real coeff, Real *const Pmu, Real *const P3, Real *const Qmu,
-                            Real *const newOprod)
+                            Real *const *const newOprod)
 {
 
   int volume = 1;
@@ -911,9 +906,9 @@ template <class Real, int oddBit>
 void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
                          const int dim[4], const Real *const P3,
                          const Real *const Qprod, // why?
-                         const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff,
+                         const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
                          const LoadStore<Real> &ls, // pass a function object to read from and write to matrix fields
-                         Real *const shortP, Real *const newOprod)
+                         Real *const shortP, Real *const *const newOprod)
 {
 
   const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
@@ -979,8 +974,8 @@ void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better
 template <class Real>
 void computeSideLinkField(const int dim[4], const Real *const P3,
                           const Real *const Qprod, // why?
-                          const Real *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP,
-                          Real *const newOprod)
+                          const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
+                          Real *const shortP, Real *const *const newOprod)
 {
   // Need some way of setting half_volume
   int volume = 1;
@@ -1005,10 +1000,10 @@ void computeSideLinkField(const int dim[4], const Real *const P3,
 
 template <class Real, int oddBit>
 void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
-                        const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link,
-                        int sig, int mu, Real coeff, Real accumu_coeff,
+                        const int dim[4], const Real *const oprod, const Real *const Qprev,
+                        const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
                         const LoadStore<Real> &ls, // pass a function object to read from and write to matrix fields
-                        Real *const shortP, Real *const newOprod)
+                        Real *const shortP, Real *const *const newOprod)
 {
 
   const bool mu_positive = (GOES_FORWARDS(mu)) ? true : false;
@@ -1092,8 +1087,9 @@ void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better
 } // allLinkKernel
 
 template <class Real>
-void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const link,
-                         int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const newOprod)
+void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev,
+                         const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
+                         Real *const shortP, Real *const *const newOprod)
 {
   int volume = 1;
   for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
@@ -1132,8 +1128,8 @@ template <class Real> struct PathCoefficients {
 };
 
 template <class Real>
-void doHisqStaplesForceCPU(const int dim[4], PathCoefficients<double> staple_coeff, Real *oprod, Real *link,
-                           Real **tempmat, Real *newOprod)
+void doHisqStaplesForceCPU(const int dim[4], PathCoefficients<double> staple_coeff, Real **oprod, Real **link,
+                           Real **tempmat, Real **newOprod)
 {
   Real OneLink, ThreeSt, FiveSt, SevenSt, Lepage, coeff;
 
@@ -1217,9 +1213,6 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
   uint64_t len = 1;
   for (int dir = 0; dir < 4; ++dir) len *= X_[dir];
 #endif
-  // allocate memory for temporary fields
-  void *tempmat[6];
-  for (int i = 0; i < 6; i++) { tempmat[i] = safe_malloc(len * 18 * precision); }
 
   PathCoefficients<double> act_path_coeff;
   act_path_coeff.one = path_coeff[0];
@@ -1229,27 +1222,29 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
   act_path_coeff.seven = path_coeff[4];
   act_path_coeff.lepage = path_coeff[5];
 
-  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
-  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
-  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
-  if (precision == QUDA_DOUBLE_PRECISION) {
-    doHisqStaplesForceCPU<double>(X_, act_path_coeff, reinterpret_cast<double *>(oprod_array),
-                                  reinterpret_cast<double *>(link_array), (double **)tempmat,
-                                  reinterpret_cast<double *>(noprod_array));
-  } else if (precision == QUDA_SINGLE_PRECISION) {
-    doHisqStaplesForceCPU<float>(X_, act_path_coeff, reinterpret_cast<float *>(oprod_array),
-                                 reinterpret_cast<float *>(link_array), (float **)tempmat,
-                                 reinterpret_cast<float *>(noprod_array));
+  if (precision == QUDA_SINGLE_PRECISION) {
+    // allocate memory for temporary fields
+    float *tempmat[6];
+    for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<float *>(safe_malloc(len * 18 * precision)); }
+    doHisqStaplesForceCPU<float>(X_, act_path_coeff, oprod.data_array<float *>().data, link.data_array<float *>().data,
+                                 tempmat, newOprod->data_array<float *>().data);
+    for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); }
+  } else if (precision == QUDA_DOUBLE_PRECISION) {
+    // allocate memory for temporary fields
+    double *tempmat[6];
+    for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<double *>(safe_malloc(len * 18 * precision)); }
+    doHisqStaplesForceCPU<double>(X_, act_path_coeff, oprod.data_array<double *>().data,
+                                  link.data_array<double *>().data, tempmat, newOprod->data_array<double *>().data);
+    for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); }
   } else {
     errorQuda("Unsupported precision");
   }
-
-  for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); }
 }
 
 template <class Real, int oddBit>
-void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const oprod, const Real *const link,
-                         int sig, Real coeff, const LoadStore<Real> &ls, Real *const output)
+void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const *const oprod,
+                         const Real *const *const link, int sig, Real coeff, const LoadStore<Real> &ls,
+                         Real *const *const output)
 {
   if (GOES_FORWARDS(sig)) {
 
@@ -1296,8 +1291,8 @@ void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *c
 }
 
 template <class Real>
-void computeLongLinkField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real coeff,
-                          Real *const output)
+void computeLongLinkField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig,
+                          Real coeff, Real *const *const output)
 {
   int volume = 1;
   for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
@@ -1321,16 +1316,13 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
-  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
-  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
-  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      computeLongLinkField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array),
-                                  sig, coeff, reinterpret_cast<float *>(noprod_array));
+      computeLongLinkField<float>(X_, oprod.data_array<float *>().data, link.data_array<float *>().data, sig, coeff,
+                                  newOprod->data_array<float *>().data);
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      computeLongLinkField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
-                                   sig, coeff, reinterpret_cast<double *>(noprod_array));
+      computeLongLinkField<double>(X_, oprod.data_array<double *>().data, link.data_array<double *>().data, sig, coeff,
+                                   newOprod->data_array<double *>().data);
     } else {
       errorQuda("Unrecognised precision");
     }
@@ -1344,8 +1336,8 @@ void completeForceSite(int half_lattice_index,
 #else
                        const int[],
 #endif
-                       const Real *const oprod, const Real *const link, int sig, const LoadStore<Real> &ls,
-                       Real *const mom)
+                       const Real *const *const oprod, const Real *const *const link, int sig,
+                       const LoadStore<Real> &ls, Real *const mom)
 {
 
   typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;
@@ -1366,7 +1358,8 @@ void completeForceSite(int half_lattice_index,
 }
 
 template <class Real>
-void completeForceField(const int dim[4], const Real *const oprod, const Real *const link, int sig, Real *const mom)
+void completeForceField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig,
+                        Real *const mom)
 {
   int volume = dim[0] * dim[1] * dim[2] * dim[3];
   const int half_volume = volume / 2;
@@ -1384,15 +1377,13 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda:
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
-  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
-  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      completeForceField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array), sig,
+      completeForceField<float>(X_, oprod.data_array<float *>().data, link.data_array<float *>().data, sig,
                                 mom->data<float *>());
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      completeForceField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
-                                 sig, mom->data<double *>());
+      completeForceField<double>(X_, oprod.data_array<double *>().data, link.data_array<double *>().data, sig,
+                                 mom->data<double *>());
     } else {
       errorQuda("Unrecognised precision");
     }
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 212549dc38..a04d28dedb 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -139,15 +139,15 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
                             long_link.Ghost()[3].data()};
 
   if (in.Precision() == QUDA_DOUBLE_PRECISION) {
-    staggeredDslashReference(static_cast<double *>(out.data()), reinterpret_cast<double **>(qdp_fatlink),
+    staggeredDslashReference(out.data<double *>(), reinterpret_cast<double **>(qdp_fatlink),
                              reinterpret_cast<double **>(qdp_longlink), reinterpret_cast<double **>(ghost_fatlink),
-                             reinterpret_cast<double **>(ghost_longlink), static_cast<double *>(in.data()),
+                             reinterpret_cast<double **>(ghost_longlink), in.data<double *>(),
                              reinterpret_cast<double **>(in.fwdGhostFaceBuffer),
                              reinterpret_cast<double **>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
-    staggeredDslashReference(static_cast<float *>(out.data()), reinterpret_cast<float **>(qdp_fatlink),
+    staggeredDslashReference(out.data<float *>(), reinterpret_cast<float **>(qdp_fatlink),
                              reinterpret_cast<float **>(qdp_longlink), reinterpret_cast<float **>(ghost_fatlink),
-                             reinterpret_cast<float **>(ghost_longlink), static_cast<float *>(in.data()),
+                             reinterpret_cast<float **>(ghost_longlink), in.data<float *>(),
                              reinterpret_cast<float **>(in.fwdGhostFaceBuffer),
                              reinterpret_cast<float **>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type);
   }
diff --git a/tests/laph_test.cpp b/tests/laph_test.cpp
index ac20bce407..75f244d406 100644
--- a/tests/laph_test.cpp
+++ b/tests/laph_test.cpp
@@ -114,8 +114,8 @@ auto laph_test(test_t param)
   std::vector<Complex> qudaRes(nSink * nEv * Lt * nSpin, 0.);
 
   int X[4] = {xdim, ydim, zdim, tdim};
-  laphSinkProject((__complex__ double *)qudaRes.data(), (void **)snkPtr.data(), nSink, tileSink,
-                  (void **)evPtr.data(), nEv, tileEv, &invParam, X);
+  laphSinkProject((__complex__ double *)qudaRes.data(), snkPtr.data(), nSink, tileSink, evPtr.data(), nEv, tileEv,
+                  &invParam, X);
   printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs);
 
   auto tol = getTolerance(cuda_prec);