diff --git a/README.md b/README.md
index 8b1afc8b43..274f708b3e 100644
--- a/README.md
+++ b/README.md
@@ -272,6 +272,7 @@ Advanced Scientific Computing (PASC21) [arXiv:2104.05615[hep-lat]].
 *  Hwancheol Jeong (Indiana University)
 *  Xiangyu Jiang (ITP, Chinese Academy of Sciences)
 *  Balint Joo (OLCF, Oak Ridge National Laboratory, formerly Jefferson Lab)
+*  Rohith Karur (UC Berkeley, Lawrence Berkeley National Laboratory)
 *  Hyung-Jin Kim (Samsung Advanced Institute of Technology)
 *  Bartosz Kostrzewa (HPC/A-Lab, University of Bonn)
 *  Damon McDougall (AMD)
diff --git a/include/quda.h b/include/quda.h
index 7501b4c9dd..5230d18eda 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -873,6 +873,8 @@ extern "C" {
     double alpha3;                 /**< The coefficient used in HYP smearing step 1*/
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
+    unsigned int adj_n_save; /**< How many intermediate gauge fields to save at each large nblock to perform adj flow*/
+    unsigned int hier_threshold;   /**< Minimum *hierarchical* threshold for adj gradient flow*/
     QudaBoolean restart;           /**< Used to restart the smearing from existing gaugeSmeared */
     double t0;                     /**< Starting flow time for Wilson flow */
     int dir_ignore;                /**< The direction to be ignored by the smearing algorithm
@@ -1707,6 +1709,26 @@ extern "C" {
   void performGFlowQuda(void **h_out, void **h_in, QudaInvertParam *inv_param, QudaGaugeSmearParam *smear_param,
                         QudaGaugeObservableParam *obs_param, const size_t nSpinors);
 
+  /**
+   * Performs Adjoint Gradient Flow (gauge + fermion) the "safe" way on gaugePrecise and stores it in gaugeSmeared
+   * @param[out] h_out Output fermion field
+   * @param[in] h_in Input fermion field
+   * @param[in] smear_param Parameter struct that defines the computation parameters
+   * @param[in,out] obs_param Parameter struct that defines which
+   * observables we are making and the resulting observables.
+   */
+  void performAdjGFlowSafe(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaGaugeSmearParam *smear_param);
+
+  /**
+   * Performs Adjoint Gradient Flow (gauge + fermion) the Hierarchical way on gaugePrecise and stores it in gaugeSmeared
+   * @param[out] h_out Output fermion field
+   * @param[in] h_in Input fermion field
+   * @param[in] smear_param Parameter struct that defines the computation parameters
+   * @param[in,out] obs_param Parameter struct that defines which
+   * observables we are making and the resulting observables.
+   */
+  void performAdjGFlowHier(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaGaugeSmearParam *smear_param);
+
   /**
    * @brief Calculates a variety of gauge-field observables.  If a
    * smeared gauge field is presently loaded (in gaugeSmeared) the
diff --git a/lib/check_params.h b/lib/check_params.h
index ee5063f4c2..a0554356ba 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -1161,6 +1161,7 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
 #if defined CHECK_PARAM
   if (param->struct_size != (size_t)INVALID_INT && param->struct_size != sizeof(*param))
     errorQuda("Unexpected QudaGaugeSmearParam struct size %lu, expected %lu", param->struct_size, sizeof(*param));
+
 #else
   P(struct_size, (size_t)INVALID_INT);
 #endif
@@ -1176,6 +1177,8 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(smear_anisotropy, 1.0);
   P(rk_order, 3);
   P(restart, QUDA_BOOLEAN_FALSE);
+  P(adj_n_save, 5);
+  P(hier_threshold, 6);
   P(t0, 0.0);
   P(alpha1, 0.0);
   P(alpha2, 0.0);
@@ -1190,6 +1193,8 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(smear_anisotropy, INVALID_DOUBLE);
   P(rk_order, (unsigned int)INVALID_INT);
   P(restart, QUDA_BOOLEAN_INVALID);
+  P(adj_n_save, (unsigned int)INVALID_INT);
+  P(hier_threshold, (unsigned int)INVALID_INT);
   P(t0, INVALID_DOUBLE);
   P(alpha1, INVALID_DOUBLE);
   P(alpha2, INVALID_DOUBLE);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 6b010fe066..3266167d5d 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3,6 +3,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
+#include <functional>
 #include <sys/time.h>
 
 #include <quda.h>
@@ -196,6 +197,11 @@ static TimeProfile profileWFlow("wFlowQuda");
 //!< Profiler for gFlowQuda
 static TimeProfile profileGFlow("gFlowQuda");
 
+//!< Profiler for gFlowQuda
+static TimeProfile profileAdjGFlowSafe("AdjgFlowSafeQuda");
+
+static TimeProfile profileAdjGFlowHier("AdjgFlowHierQuda");
+
 //!< Profiler for projectSU3Quda
 static TimeProfile profileProject("projectSU3Quda");
 
@@ -5468,6 +5474,451 @@ void performGFlowQuda(void **h_out, void **h_in, QudaInvertParam *inv_param, Qud
 
 } /* end of performGFlowQuda */
 
+// perform adjoint (backwards) gradient flow on gauge and spinor field following the algorithm in arXiv:1302.5246
+// (Appendix E) the gauge flow steps are identical to Wilson Flow algorithm in arXiv:1006.4518 (Vt <-> W3)
+void performAdjGFlowSafe(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaGaugeSmearParam *smear_param)
+{
+
+  auto profile = pushProfile(profileAdjGFlowSafe);
+  pushOutputPrefix("performAdjGFlowQudaSafe: ");
+  checkGaugeSmearParam(smear_param);
+
+  pushVerbosity(inv_param->verbosity);
+  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
+
+  if (smear_param->restart) {
+    if (gaugeSmeared == nullptr) errorQuda("gaugeSmeared must be loaded");
+  } else {
+    if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
+    freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
+    gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAdjGFlowSafe);
+  }
+
+  GaugeFieldParam gParamDummy(*gaugeSmeared);
+  GaugeField gaugeW0(gParamDummy);
+  GaugeField gaugeW1(gParamDummy);
+  GaugeField gaugeW2(gParamDummy);
+  GaugeField gaugeVT(gParamDummy);
+
+  GaugeFieldParam gParam(*gaugePrecise);
+  gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
+  GaugeField gaugeTemp(gParam);
+
+  const GaugeField gin = *gaugeSmeared;
+  GaugeField &g_W0 = *gaugeSmeared;
+  GaugeField &g_W1 = gaugeW1;
+  GaugeField &g_W2 = gaugeW2;
+  GaugeField &g_VT = gaugeVT;
+
+  // helper gauge field for Laplace operator
+  GaugeField precise;
+  GaugeFieldParam gParam_helper(*gaugePrecise);
+  gParam_helper.create = QUDA_NULL_FIELD_CREATE;
+  precise = GaugeField(gParam_helper);
+
+  // spinor fields
+  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), false, inv_param->input_location);
+  ColorSpinorField fin_h(cpuParam);
+
+  ColorSpinorParam deviceParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION);
+  ColorSpinorField fin(deviceParam);
+  fin = fin_h;
+
+  deviceParam.create = QUDA_NULL_FIELD_CREATE;
+  ColorSpinorField fout(deviceParam);
+
+  int parity = 0;
+
+  // initialize a and b for Laplace operator
+  double a = 1.;
+  double b = -8.;
+
+  int comm_dim[4] = {};
+  // Will add fermion measruement utilities later
+  // int measurement_n = 0; // The nth measurement to take
+  // only switch on comms needed for directions with a derivative
+  for (int i = 0; i < 4; i++) { comm_dim[i] = comm_dim_partitioned(i); }
+
+  // auxilliary fermion fields [0], [1], [2] and [3]
+  ColorSpinorField f_temp0(deviceParam);
+  ColorSpinorField f_temp1(deviceParam);
+  ColorSpinorField f_temp2(deviceParam);
+  ColorSpinorField f_temp3(deviceParam);
+  ColorSpinorField f_temp4(deviceParam);
+
+  // set [3] = input spinor
+  f_temp3 = fin;
+
+  for (unsigned int j = 0; j < smear_param->n_steps; j++) {
+    for (unsigned int i = 0; i < smear_param->n_steps - j; i++) {
+
+      if (i == 0)
+        g_W0 = gin;
+      else
+        std::swap(g_W0, g_VT);
+
+      GFlowStep(g_W1, gaugeTemp, g_W0, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_W1);
+      GFlowStep(g_W2, gaugeTemp, g_W1, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_W2);
+      GFlowStep(g_VT, gaugeTemp, g_W2, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_VT);
+    }
+
+    // init auxilliary fields [0], [1] and [2] as [3]
+    f_temp0 = f_temp3;
+    f_temp1 = f_temp3;
+    f_temp2 = f_temp3;
+
+    copyExtendedGauge(precise, g_W2, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profileAdjGFlowSafe);
+
+    blas::ax(smear_param->epsilon * 3. / 4., f_temp4);
+
+    f_temp2 = f_temp4;
+
+    copyExtendedGauge(precise, g_W1, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profileAdjGFlowSafe);
+
+    blas::axpy(smear_param->epsilon * 8. / 9., f_temp4, f_temp3);
+
+    f_temp1 = f_temp3;
+    f_temp4 = f_temp1;
+
+    blas::axpy(-8. / 9., f_temp2, f_temp4);
+
+    copyExtendedGauge(precise, g_W0, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp0, f_temp4, precise, 4, a, b, f_temp4, parity, comm_dim, profileAdjGFlowSafe);
+
+    blas::ax(smear_param->epsilon * 1. / 4., f_temp0);
+    blas::axpy(1., f_temp2, f_temp0);
+    blas::axpy(1., f_temp1, f_temp0);
+
+    fout = f_temp0;
+    // redefining f_temp0 to restart loop
+    f_temp3 = f_temp0;
+  }
+  cpuParam.v = h_out;
+  cpuParam.location = inv_param->output_location;
+  ColorSpinorField fout_h(cpuParam);
+  fout_h = fout;
+
+  popOutputPrefix();
+}
+
+void adjSafeEvolve(std::vector<std::reference_wrapper<ColorSpinorField>> sf_list,
+                   std::vector<std::reference_wrapper<GaugeField>> gf_list, QudaGaugeSmearParam *smear_param,
+                   unsigned int ns_safe, TimeProfile &profile, std::vector<std::reference_wrapper<int>> meas_cinf)
+{
+  const GaugeField gin = gf_list[0].get();
+  GaugeField &g_W0 = gf_list[0].get();
+  GaugeField &g_W1 = gf_list[1].get();
+  GaugeField &g_W2 = gf_list[2].get();
+  GaugeField &g_VT = gf_list[3].get();
+  GaugeField &gaugeTemp = gf_list[4].get();
+  GaugeField &precise = gf_list[5].get();
+
+  ColorSpinorField &f_temp0 = sf_list[0].get();
+  ColorSpinorField &f_temp1 = sf_list[1].get();
+  ColorSpinorField &f_temp2 = sf_list[2].get();
+  ColorSpinorField &f_temp3 = sf_list[3].get();
+  ColorSpinorField &f_temp4 = sf_list[4].get();
+
+  int &i_glob = meas_cinf[0].get();
+  int &measurement_n = meas_cinf[1].get();
+  measurement_n = 0;
+
+  int parity = 0;
+
+  // initialize a and b for Laplace operator
+  double a = 1.;
+  double b = -8.;
+
+  int comm_dim[4] = {};
+  // only switch on comms needed for directions with a derivative
+  for (int i = 0; i < 4; i++) { comm_dim[i] = comm_dim_partitioned(i); }
+
+  for (unsigned int j = 0; j < ns_safe; j++) {
+    for (unsigned int i = 0; i < ns_safe - j; i++) {
+
+      if (i == 0)
+        g_W0 = gin;
+      else
+        std::swap(g_W0, g_VT);
+
+      GFlowStep(g_W1, gaugeTemp, g_W0, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_W1);
+      GFlowStep(g_W2, gaugeTemp, g_W1, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_W2);
+      GFlowStep(g_VT, gaugeTemp, g_W2, smear_param->epsilon, smear_param->smear_type, WFLOW_STEP_VT);
+    }
+    // init auxilliary fields [0], [1] and [2] as [3]
+    f_temp0 = f_temp3;
+    f_temp1 = f_temp3;
+    f_temp2 = f_temp3;
+
+    // [4] = Lap2 [0]
+    copyExtendedGauge(precise, g_W2, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profile);
+
+    // [4] -> 3/4 eps [4]
+    blas::ax(smear_param->epsilon * 3. / 4., f_temp4);
+
+    // [2] = [4]
+    f_temp2 = f_temp4;
+
+    // [4] = Lap1 [2]
+    copyExtendedGauge(precise, g_W1, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profile);
+
+    // [3] -> [3] + 8/9 eps [4]
+    blas::axpy(smear_param->epsilon * 8. / 9., f_temp4, f_temp3);
+
+    // [1], [4] <- [3]
+    f_temp1 = f_temp3;
+    f_temp4 = f_temp1;
+
+    // [4] <- [4] - 8/9 [2]
+    blas::axpy(-8. / 9., f_temp2, f_temp4);
+
+    // [0] <- Lap0 [4]
+    copyExtendedGauge(precise, g_W0, QUDA_CUDA_FIELD_LOCATION);
+    precise.exchangeGhost();
+    ApplyLaplace(f_temp0, f_temp4, precise, 4, a, b, f_temp4, parity, comm_dim, profile);
+
+    // [0] <- 1/4 eps [0]; [0] <- [2] + [0]; [0] <- [1] + [0]
+    blas::ax(smear_param->epsilon * 1. / 4., f_temp0);
+    blas::axpy(1., f_temp2, f_temp0);
+    blas::axpy(1., f_temp1, f_temp0);
+
+    // redefining f_temp0 to restart loop
+    f_temp3 = f_temp0;
+
+    i_glob++;
+  }
+}
+
+/* total_dist == n_steps, n_b is dividing factor of each block, n_Save is the size of the list, "front" denotes whether
+ * split hierarchy goes to existing or new subhierarchy */
+std::vector<int> get_hier_list(int total_dist, int n_b, int n_save, bool front = true)
+{
+
+  std::vector<int> hier_list;
+  int counter = 0;
+
+  int val = total_dist;
+  for (int i_s = 0; i_s < n_save; i_s++) {
+    val = (val <= 1) ? 1 : val / n_b;
+    hier_list.push_back(val);
+    counter += val;
+  }
+
+  if (front)
+    hier_list.at(0) += total_dist - counter;
+  else
+    hier_list.back() += total_dist - counter;
+
+  return hier_list;
+}
+
+int modify_hier_list(std::vector<int> &hier_list, int n_b, int n_save, int threshold)
+{
+
+  int result = -1;
+  int current_size = hier_list.size();
+  std::vector<int> temp_list;
+  if (current_size > n_save) errorQuda("something isnt right\n");
+
+  int diff = n_save - current_size;
+
+  for (int i = current_size - 1; i >= 0; --i) {
+
+    if (hier_list[i] > threshold) {
+
+      temp_list = get_hier_list(hier_list[i], n_b, diff + 1, false);
+      hier_list.erase(hier_list.begin() + i);
+      hier_list.insert(hier_list.begin() + i, temp_list.begin(), temp_list.end());
+      result = i;
+      break;
+    }
+  }
+
+  return result;
+}
+
+void performAdjGFlowHier(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaGaugeSmearParam *smear_param)
+{
+
+  auto profile = pushProfile(profileAdjGFlowHier);
+  pushOutputPrefix("performAdjGFlowQudaHier: ");
+  checkGaugeSmearParam(smear_param);
+
+  if (smear_param->n_steps <= smear_param->adj_n_save) {
+
+    errorQuda("Not good practice to have adj_n_save (%d) >= n_steps (%d); adj_n_save should be manually altered to "
+              "min(nsteps, %d): \n",
+              smear_param->n_steps, smear_param->adj_n_save, smear_param->n_steps - 1);
+  }
+
+  pushVerbosity(inv_param->verbosity);
+  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
+
+  if (smear_param->restart) {
+    if (gaugeSmeared == nullptr) errorQuda("gaugeSmeared must be loaded");
+  } else {
+    if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
+    freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
+    gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileAdjGFlowHier);
+  }
+
+  GaugeFieldParam gParamDummy(*gaugeSmeared);
+  GaugeField gaugeW0(gParamDummy);
+  GaugeField gaugeW1(gParamDummy);
+  GaugeField gaugeW2(gParamDummy);
+  GaugeField gaugeVT(gParamDummy);
+  GaugeField gauge_out(gParamDummy);
+
+  GaugeFieldParam gParam(*gaugePrecise);
+  gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
+  GaugeField gaugeTemp(gParam);
+
+  auto n = smear_param->adj_n_save;
+
+  std::vector<GaugeField> gauge_stages(n, gParamDummy);
+  gauge_stages[0] = *gaugeSmeared;
+  // Can also do below
+  // creates copies std::vector<GaugeField> gauge_stages(n,*gaugeSmeared);
+
+  GaugeField &gin = *gaugeSmeared;
+  GaugeField &gout = gauge_out;
+
+  // helper gauge field for Laplace operator
+  GaugeField precise;
+  GaugeFieldParam gParam_helper(*gaugePrecise);
+  gParam_helper.create = QUDA_NULL_FIELD_CREATE;
+  precise = GaugeField(gParam_helper);
+
+  // spinor fields
+  ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), false, inv_param->input_location);
+  ColorSpinorField fin_h(cpuParam);
+
+  ColorSpinorParam deviceParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION);
+  ColorSpinorField fin(deviceParam);
+  fin = fin_h;
+
+  deviceParam.create = QUDA_NULL_FIELD_CREATE;
+  ColorSpinorField fout(deviceParam);
+
+  ColorSpinorField f_temp0(deviceParam);
+  ColorSpinorField f_temp1(deviceParam);
+  ColorSpinorField f_temp2(deviceParam);
+  ColorSpinorField f_temp3(deviceParam);
+  ColorSpinorField f_temp4(deviceParam);
+
+  // IMPORTANT initializing step: set [3] = input spinor
+  f_temp3 = fin;
+
+  int n_b = ceil(pow(1. * smear_param->n_steps, 1. / (smear_param->adj_n_save + 1)));
+  logQuda(QUDA_SUMMARIZE, "Hierarchical block n_b: %d\n\n", n_b);
+  int ret_idx = 0;
+  int threshold = smear_param->hier_threshold;
+  std::vector<int> hier_list;
+  // The first stage is saved at the very beginning, so its presence is implicit
+  hier_list = get_hier_list(smear_param->n_steps, n_b, smear_param->adj_n_save);
+  logQuda(QUDA_SUMMARIZE, "hier list size (number of gauge fields to save) is %d\n", (int)hier_list.size());
+  if (threshold < hier_list.back()) {
+    threshold = hier_list.back();
+    logQuda(QUDA_SUMMARIZE, "threshold changed to %d", threshold);
+  } else
+    logQuda(QUDA_SUMMARIZE, "threshold is %d\n", threshold);
+
+  if (hier_list.empty()) errorQuda("hier_list is not populated\n");
+  if (hier_list.size() != gauge_stages.size()) errorQuda("hier_list is not same size as gauge_stages\n");
+
+  for (unsigned int i = 0; i < hier_list.size() - 1; i++) {
+
+    if (i == 0) {
+      logQuda(QUDA_VERBOSE, "we first set gin to the first index of the gauge_steps vector\n");
+      gauge_stages[0] = gin;
+    }
+    if (i > 0) std::swap(gout, gin);
+
+    for (unsigned int j = 0; j < (unsigned int)hier_list[i]; j++) {
+      if (j > 0) std::swap(gout, gin);
+
+      WFlowStep(gout, gaugeTemp, gin, smear_param->epsilon, smear_param->smear_type, smear_param->smear_anisotropy,
+                smear_param->rk_order);
+    }
+    gauge_stages[i + 1] = gout;
+  }
+
+  std::vector<std::reference_wrapper<ColorSpinorField>> sf_list;
+  sf_list = {f_temp0, f_temp1, f_temp2, f_temp3, f_temp4};
+  std::vector<std::reference_wrapper<GaugeField>> gf_list;
+  gf_list = {gauge_stages.back(), gaugeW1, gaugeW2, gaugeVT, gaugeTemp, precise};
+
+  // first one is global counter, second is meas counter
+  int i_glob = 0, measurement_n = 0;
+  std::vector<std::reference_wrapper<int>> meas_cinf {i_glob, measurement_n};
+
+  int hier_loop_counter = 0;
+  while (ret_idx != -1) {
+    logQuda(QUDA_DEBUG_VERBOSE, "Hier loop count %d has begun \n", hier_loop_counter);
+    logQuda(QUDA_DEBUG_VERBOSE, "Starting a hierarchical loop log: \n");
+
+    adjSafeEvolve(sf_list, gf_list, smear_param, hier_list.back(), profileAdjGFlowHier, meas_cinf);
+
+    logQuda(QUDA_DEBUG_VERBOSE, "Previous hier list elements: \n");
+    for (int j = 0; j < (int)hier_list.size(); j++) { logQuda(QUDA_DEBUG_VERBOSE, "%d \n", (int)hier_list[j]); }
+    logQuda(QUDA_DEBUG_VERBOSE, "\n");
+
+    hier_list.pop_back();
+    gauge_stages.pop_back();
+    ret_idx = modify_hier_list(hier_list, n_b, smear_param->adj_n_save, threshold);
+    if (ret_idx == -1) {
+      logQuda(QUDA_VERBOSE, " now in final serial stage of hierarchial evolution \n");
+      for (int i = gauge_stages.size() - 1; i >= 0; --i) {
+        // first load correct gauge field (for beginning of the loop, it is the final gauge list element)
+
+        gf_list.at(0) = std::ref(gauge_stages[i]);
+
+        adjSafeEvolve(sf_list, gf_list, smear_param, hier_list[i], profileAdjGFlowHier, meas_cinf);
+
+        logQuda(QUDA_DEBUG_VERBOSE, " block number %d successfully deployed \n", i);
+      }
+      logQuda(QUDA_VERBOSE, "Hierarchial evolution completed \n");
+      break;
+    }
+
+    GaugeField g_2(gParamDummy);
+    GaugeField g_1 = gauge_stages[ret_idx];
+
+    logQuda(QUDA_DEBUG_VERBOSE, "Modified hier list elements: \n");
+    for (int j = 0; j < (int)hier_list.size(); j++) { logQuda(QUDA_DEBUG_VERBOSE, "%d \n", (int)hier_list[j]); }
+    logQuda(QUDA_DEBUG_VERBOSE, "\n");
+
+    for (unsigned int j = 0; j < (unsigned int)hier_list[ret_idx]; j++) {
+      if (j > 0) std::swap(g_2, g_1);
+      WFlowStep(g_2, gaugeTemp, g_1, smear_param->epsilon, smear_param->smear_type, smear_param->smear_anisotropy,
+                smear_param->rk_order);
+    }
+
+    gauge_stages.insert(gauge_stages.begin() + ret_idx + 1, g_2);
+    logQuda(QUDA_DEBUG_VERBOSE, "recycled gauge field placed *before* index %d\n\n", ret_idx + 1);
+    gf_list.at(0) = std::ref(gauge_stages.back());
+    hier_loop_counter += 1;
+  }
+
+  cpuParam.v = h_out;
+  cpuParam.location = inv_param->output_location;
+  ColorSpinorField fout_h(cpuParam);
+  fout_h = sf_list[0].get();
+  logQuda(QUDA_DEBUG_VERBOSE, "Spinor written to cpu \n");
+  popOutputPrefix();
+}
+
+/* save list of gauge vectors */
+
 int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                               const unsigned int verbose_interval, const double relax_boost, const double tolerance,
                               const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a0ab337328..45e6b26cb2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -199,6 +199,13 @@ target_link_libraries(su3_test ${TEST_LIBS})
 quda_checkbuildtest(su3_test QUDA_BUILD_ALL_TESTS)
 install(TARGETS su3_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+if(QUDA_DIRAC_LAPLACE)
+  add_executable(su3_fermion_test su3_fermion_test.cpp)
+  target_link_libraries(su3_fermion_test ${TEST_LIBS})
+  quda_checkbuildtest(su3_fermion_test QUDA_BUILD_ALL_TESTS)
+  install(TARGETS su3_fermion_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
+
 add_executable(pack_test pack_test.cpp)
 target_link_libraries(pack_test ${TEST_LIBS})
 quda_checkbuildtest(pack_test QUDA_BUILD_ALL_TESTS)
@@ -1450,6 +1457,12 @@ if (QUDA_DIRAC_LAPLACE AND QUDA_DIRAC_WILSON)
            --dim 2 4 6 8
            --enable-testing true
            --gtest_output=xml:laph_test.xml)
+
+  add_test(NAME su3_fermion_test
+           COMMAND  ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:su3_fermion_test> ${MPIEXEC_POSTFLAGS}
+           --dim 2 4 6 8
+           --enable-testing true
+           --gtest_output=xml:su3_fermion_test.xml)
 endif()
 
 
diff --git a/tests/su3_fermion_test.cpp b/tests/su3_fermion_test.cpp
new file mode 100644
index 0000000000..672735bd74
--- /dev/null
+++ b/tests/su3_fermion_test.cpp
@@ -0,0 +1,313 @@
+// In a typical application, quda.h is the only QUDA header required.
+#include <quda.h>
+#include <util_quda.h>
+#include <instantiate.h>
+
+#include "host_utils.h"
+#include "gauge_utils.h"
+#include "command_line_params.h"
+#include "dslash_reference.h"
+#include "misc.h"
+#include "test.h"
+
+using test_t = ::testing::tuple<QudaPrecision, QudaGaugeSmearType>;
+
+class SU3FermionTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  QudaPrecision precision;
+  QudaGaugeSmearType smear_type;
+
+public:
+  SU3FermionTest() : precision(::testing::get<0>(GetParam())), smear_type(::testing::get<1>(GetParam())) { }
+};
+
+int argc_copy;
+char **argv_copy;
+
+void run(test_t param)
+{
+  prec = ::testing::get<0>(param);
+  gauge_smear_type = ::testing::get<1>(param);
+
+  using namespace quda;
+
+  if (!is_enabled_spin(4)) errorQuda("Test requires Wilson-type fermion enablement");
+
+  QudaGaugeParam gauge_param = newQudaGaugeParam();
+  if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
+  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
+
+  setWilsonGaugeParam(gauge_param);
+  gauge_param.t_boundary = QUDA_PERIODIC_T;
+
+  void *gauge[4], *new_gauge[4];
+  for (int dir = 0; dir < 4; dir++) {
+    gauge[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    new_gauge[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+  }
+
+  constructHostGaugeField(gauge, gauge_param, argc_copy, argv_copy);
+  // Load the gauge field to the device
+  loadGaugeQuda((void *)gauge, &gauge_param);
+  saveGaugeQuda(new_gauge, &gauge_param);
+  // start the timer
+  quda::host_timer_t host_timer, host_safe_timer, host_hier_timer, host_fwd_timer;
+
+  // The commented out section is all geared towards gauge observables, so unlikely to be needed for now
+  // // Prepare various perf info
+  // long long flops_plaquette = 6ll * 597 * V;
+  // long long flops_ploop = 198ll * V + 6 * V / gauge_param.X[3];
+
+  // // Prepare a gauge observable struct
+  // QudaGaugeObservableParam param = newQudaGaugeObservableParam();
+
+  // The user may specify which measurements they wish to perform/omit
+  // using the QudaGaugeObservableParam struct, and whether or not to
+  // perform suN projection at each measurement step. We recommend that
+  // users perform suN projection.
+  // A unique observable param struct is constructed for each measurement.
+
+  // Gauge Smearing Routines
+  //---------------------------------------------------------------------------
+  // Stout smearing should be equivalent to APE smearing
+  // on D dimensional lattices for rho = alpha/2*(D-1).
+  // Typical values for
+  // APE: alpha=0.6
+  // Stout: rho=0.1
+  // Over Improved Stout: rho=0.08, epsilon=-0.25
+  //
+  // Typically, the user will use smearing for Q charge data only, so
+  // we hardcode to compute Q only and not the plaquette. Users may
+  // of course set these as they wish.  SU(N) projection su_project=true is recommended.
+  QudaGaugeObservableParam *obs_param = new QudaGaugeObservableParam[gauge_smear_steps / measurement_interval + 1];
+  for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) {
+    obs_param[i] = newQudaGaugeObservableParam();
+    obs_param[i].compute_plaquette = QUDA_BOOLEAN_FALSE;
+    obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE;
+    obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
+  }
+
+  // We here set all the problem parameters for all possible smearing types.
+  QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam();
+  smear_param.smear_type = gauge_smear_type;
+  smear_param.n_steps = gauge_smear_steps;
+  smear_param.adj_n_save = gauge_n_save;
+  smear_param.hier_threshold = hier_threshold;
+  smear_param.meas_interval = measurement_interval;
+  smear_param.alpha = gauge_smear_alpha;
+  smear_param.rho = gauge_smear_rho;
+  smear_param.epsilon = gauge_smear_epsilon;
+  smear_param.alpha1 = gauge_smear_alpha1;
+  smear_param.alpha2 = gauge_smear_alpha2;
+  smear_param.alpha3 = gauge_smear_alpha3;
+  smear_param.dir_ignore = gauge_smear_dir_ignore;
+
+  quda::ColorSpinorField check, check_safe, check_hier, check_fwd;
+  QudaInvertParam invParam = newQudaInvertParam();
+  invParam.cpu_prec = QUDA_DOUBLE_PRECISION;
+  invParam.cuda_prec = cuda_prec;
+  invParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+  invParam.dirac_order = QUDA_DIRAC_ORDER;
+  invParam.verbosity = verbosity;
+
+  quda::ColorSpinorParam cs_param;
+
+  constructWilsonTestSpinorParam(&cs_param, &invParam, &gauge_param);
+  check = quda::ColorSpinorField(cs_param);
+  // Add noise to spinor
+  spinorNoise(check, 1234, QUDA_NOISE_GAUSS);
+
+  check_safe = quda::ColorSpinorField(cs_param);
+  check_hier = quda::ColorSpinorField(cs_param);
+  check_fwd = quda::ColorSpinorField(cs_param);
+
+  void *check_arr[] = {check.data()};
+  void *check_fwdarr[] = {check_fwd.data()};
+
+  printf("Inspecting the very first element of the random fermion we will use:\n");
+  check.PrintVector(0, 0, 0);
+  printf("Inspecting the very first element of the 3 un-evolved fermions (should be zero):\n");
+  printf("Hierarchical method:\n");
+  check_hier.PrintVector(0, 0, 0);
+  printf("Safe method:\n");
+  check_safe.PrintVector(0, 0, 0);
+  printf("Forward method:\n");
+  check_fwd.PrintVector(0, 0, 0);
+
+  host_timer.start(); // start the timer
+  switch (smear_param.smear_type) {
+  case QUDA_GAUGE_SMEAR_APE:
+  case QUDA_GAUGE_SMEAR_STOUT:
+  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
+  case QUDA_GAUGE_SMEAR_HYP: {
+    performGaugeSmearQuda(&smear_param, obs_param);
+    break;
+  }
+
+    // Here we use a typical use case which is different from simple smearing in that
+    // the user will want to compute the plaquette values to compute the gauge energy.
+  case QUDA_GAUGE_SMEAR_WILSON_FLOW:
+  case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: {
+    for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) {
+      obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE;
+    }
+
+    // Perform two adjoint flow algorithms, these methods dont alter the final value for the gauge so we excecute them first
+    host_hier_timer.start();
+    performAdjGFlowHier(check_hier.data(), check.data(), &invParam, &smear_param);
+    host_hier_timer.stop();
+    host_safe_timer.start();
+    performAdjGFlowSafe(check_safe.data(), check.data(), &invParam, &smear_param);
+    host_safe_timer.stop();
+    // Perform forward flow algorithm
+    host_fwd_timer.start();
+    performGFlowQuda(check_fwdarr, check_arr, &invParam, &smear_param, obs_param, 1);
+    host_fwd_timer.stop();
+
+    printfQuda("Time elapsed for adjoint hierarchical fermion/gauge smearing = %g secs\n", host_hier_timer.last());
+    printfQuda("Time elapsed for adjoint safe fermion/gauge smearing = %g secs\n", host_safe_timer.last());
+    printfQuda("Time elapsed for forward fermion/gauge smearing = %g secs\n", host_fwd_timer.last());
+
+    break;
+  }
+  default: errorQuda("Undefined gauge smear type %d given", smear_param.smear_type);
+  }
+
+  host_timer.stop(); // stop the timer
+
+  printfQuda("Total time for collective fermion/gauge smearing = %g secs\n", host_timer.last());
+  printf("Now, inspecting the very first element of the 3 evolved fermions:\n");
+  printf("Hierarchical method:\n");
+  check_hier.PrintVector(0, 0, 0);
+  printf("Safe method:\n");
+  check_safe.PrintVector(0, 0, 0);
+  printf("Forward method:\n");
+  check_fwd.PrintVector(0, 0, 0);
+
+  double method_adj_diff = 0.;
+  /* To access the ith complex entry in a raw vector, do, for example: check.data<std::complex<double>*>()[i]*/
+  for (int i = 0; i < V * 24; i++) {
+    method_adj_diff += pow(fabs(check_safe.data<double *>()[i] - check_hier.data<double *>()[i]), 2);
+  }
+  double method_adj_check = sqrt(method_adj_diff) / (V * 24.);
+  printf(
+    "Mean of mag errors between Safe and Hierarchical Adj methods (should be zero up to machine precision) = %1.5e\n",
+    method_adj_check);
+
+  std::complex<double> trace_fwd, trace_adj;
+  trace_fwd = twoColorSpinorContract(check.data<std::complex<double> *>(), check_fwd.data<std::complex<double> *>());
+  trace_adj = twoColorSpinorContract(check.data<std::complex<double> *>(), check_safe.data<std::complex<double> *>());
+
+  auto trace_diff_err = 2. * std::fabs(trace_fwd - std::conj(trace_adj)) / std::fabs(trace_fwd + std::conj(trace_adj));
+
+  printf("The two numbers below should be complex conjugates of one another\n");
+  printf("<check,adj_check> is %1.5e, %1.5e \n", trace_adj.real(), trace_adj.imag());
+  printf("<check,fwd_check> is %1.5e, %1.5e \n", trace_fwd.real(), trace_fwd.imag());
+  printf("Fractional error of (<check,adj_check> - <check,fwd_check>.conj()) = %1.5e \n", trace_diff_err);
+
+  auto eps = getTolerance(prec);
+
+  printfQuda("Checking adjoint safe/hier match\n");
+  EXPECT_LE(method_adj_check, gauge_smear_steps * gauge_smear_steps * eps);
+  if (method_adj_check > gauge_smear_steps * gauge_smear_steps * eps)
+    warningQuda("Adjoint safe/hier difference %e greater than tolerance %e\n", method_adj_check,
+                gauge_smear_steps * gauge_smear_steps * eps);
+
+  printfQuda("Checking fractional error\n");
+  EXPECT_LE(trace_diff_err, gauge_smear_steps * gauge_smear_steps * eps);
+  if (trace_diff_err > gauge_smear_steps * gauge_smear_steps * eps)
+    warningQuda("Fractional error %e greater than tolerance %e\n", trace_diff_err,
+                gauge_smear_steps * gauge_smear_steps * eps);
+
+  if (verify_results) check_gauge(gauge, new_gauge, 1e-3, gauge_param.cpu_prec);
+
+  for (int dir = 0; dir < 4; dir++) {
+    host_free(gauge[dir]);
+    host_free(new_gauge[dir]);
+  }
+
+  check = {};
+  check_hier = {};
+  check_safe = {};
+  check_fwd = {};
+
+  freeGaugeQuda();
+}
+
+TEST_P(SU3FermionTest, verify)
+{
+  if (!quda::is_enabled(precision)) GTEST_SKIP();
+  run(GetParam());
+}
+
+struct su3_fermion_test : quda_test {
+
+  void display_info() const override
+  {
+    quda_test::display_info();
+    printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon S_dimension T_dimension\n");
+    printfQuda("%s   %s             %s            %s            %d/%d/%d          %d\n", get_prec_str(prec),
+               get_prec_str(prec_sloppy), get_recon_str(link_recon), get_recon_str(link_recon_sloppy), xdim, ydim, zdim,
+               tdim);
+
+    // Specific test
+    printfQuda("\n%s smearing\n", get_gauge_smear_str(gauge_smear_type));
+    switch (gauge_smear_type) {
+    case QUDA_GAUGE_SMEAR_APE: printfQuda(" - alpha %f\n", gauge_smear_alpha); break;
+    case QUDA_GAUGE_SMEAR_STOUT: printfQuda(" - rho %f\n", gauge_smear_rho); break;
+    case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
+      printfQuda(" - rho %f\n", gauge_smear_rho);
+      printfQuda(" - epsilon %f\n", gauge_smear_epsilon);
+      break;
+    case QUDA_GAUGE_SMEAR_HYP:
+      printfQuda(" - alpha1 %f\n", gauge_smear_alpha1);
+      printfQuda(" - alpha2 %f\n", gauge_smear_alpha2);
+      printfQuda(" - alpha3 %f\n", gauge_smear_alpha3);
+      break;
+    case QUDA_GAUGE_SMEAR_WILSON_FLOW:
+    case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break;
+    default: errorQuda("Undefined test type %d given", test_type);
+    }
+    printfQuda(" - smearing steps %d\n", gauge_smear_steps);
+    printfQuda(" - smearing ignore direction %d\n", gauge_smear_dir_ignore);
+    printfQuda(" - Measurement interval %d\n", measurement_interval);
+  }
+
+  void add_command_line_group(std::shared_ptr<QUDAApp> app) const override
+  {
+    quda_test::add_command_line_group(app);
+    add_su3_option_group(app);
+  }
+
+  su3_fermion_test(int argc, char **argv) : quda_test("SU3 Fermion Test", argc, argv) { }
+};
+
+auto test_str = [](testing::TestParamInfo<test_t> param) {
+  return std::string(get_prec_str(::testing::get<0>(param.param))) + "_"
+    + get_gauge_smear_str(::testing::get<1>(param.param));
+};
+
+INSTANTIATE_TEST_SUITE_P(Flow, SU3FermionTest,
+                         ::testing::Combine(::testing::Values(QUDA_DOUBLE_PRECISION, QUDA_SINGLE_PRECISION),
+                                            ::testing::Values(QUDA_GAUGE_SMEAR_WILSON_FLOW,
+                                                              QUDA_GAUGE_SMEAR_SYMANZIK_FLOW)),
+                         test_str);
+
+int main(int argc, char **argv)
+{
+  argc_copy = argc;
+  argv_copy = argv;
+
+  su3_fermion_test test(argc, argv);
+  test.init();
+  if (enable_testing) {
+    return test.execute();
+  } else {
+    if (gauge_smear_type != QUDA_GAUGE_SMEAR_WILSON_FLOW && gauge_smear_type != QUDA_GAUGE_SMEAR_SYMANZIK_FLOW) {
+      warningQuda("Smear type %s not supported, setting to Wilson Flow", get_gauge_smear_str(gauge_smear_type));
+      gauge_smear_type = QUDA_GAUGE_SMEAR_WILSON_FLOW;
+    }
+    run(test_t {prec, gauge_smear_type});
+  };
+}
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 7e0146d8fc..ff2de0184e 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -313,6 +313,8 @@ double gauge_smear_alpha1 = 0.75;
 double gauge_smear_alpha2 = 0.6;
 double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 5;
+int gauge_n_save = 3;
+int hier_threshold = 6;
 int gauge_smear_dir_ignore = -1;
 int measurement_interval = 5;
 bool su_project = true;
@@ -1158,6 +1160,11 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
 
   opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)");
 
+  opgroup->add_option("--su3-adj-gauge-nsave", gauge_n_save,
+                      "The number of gauge steps to save for hierarchical adj grad flow");
+
+  opgroup->add_option("--su3-hier-threshold", hier_threshold, "Minimum threshold for hierarchical adj grad flow");
+
   opgroup->add_option("--su3-measurement-interval", measurement_interval,
                       "Measure the field energy and/or topological charge every Nth step (default 5) ");
 
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index f0739a2ea6..8641ee0a0d 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -561,6 +561,8 @@ extern double gauge_smear_alpha1;
 extern double gauge_smear_alpha2;
 extern double gauge_smear_alpha3;
 extern int gauge_smear_steps;
+extern int gauge_n_save;
+extern int hier_threshold;
 extern int gauge_smear_dir_ignore;
 extern int measurement_interval;
 extern QudaGaugeSmearType gauge_smear_type;
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index bd953f9ddb..2a31748a7a 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -964,6 +964,46 @@ void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precisi
     checkGauge((float **)oldG, (float **)newG, epsilon);
 }
 
+std::complex<double> twoColorSpinorContract(std::complex<double> *spinor1, std::complex<double> *spinor2)
+{
+  int col_inc = 3;
+
+  std::vector<int> col_st {0, 1, 2};
+  std::vector<int> row_st {0, 3, 6};
+
+  std::vector<complex<double>> test_contract(9 * V);
+  complex<double> trace = {0., 0.};
+  double trace_re, trace_im;
+  for (int i = 0; i < V; i++) {
+
+    for (int ii = 0; ii < 9; ii++) {
+      int which_col_idx = (ii % 3), which_row_idx = (ii - (ii % 3)) / 3;
+
+      std::complex<double> dot = {0., 0.};
+
+      for (int i_s = 0; i_s < 4; i_s++) {
+
+        int s_row_idx = i * 12 + col_st[which_row_idx] + i_s * col_inc;
+        int s_col_idx = i * 12 + col_st[which_col_idx] + i_s * col_inc;
+
+        auto m1 = std::conj(spinor1[s_row_idx]);
+        auto m2 = spinor2[s_col_idx];
+
+        dot += m1 * m2;
+      }
+      test_contract[i * 9 + ii] = dot;
+    }
+    trace += (test_contract[i * 9] + test_contract[i * 9 + 4] + test_contract[i * 9 + 8]);
+  }
+  trace_re = trace.real();
+  trace_im = trace.imag();
+  quda::comm_allreduce_sum(trace_re);
+  quda::comm_allreduce_sum(trace_im);
+
+  std::complex<double> trace_fin = {trace_re, trace_im};
+  return trace_fin;
+}
+
 void createSiteLinkCPU(void *const *gauge, QudaPrecision precision, SiteLinkType phase)
 {
   if (phase == SiteLinkType::SITELINK_PHASE_NO) {
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 53f261a612..cc1e71751e 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -137,6 +137,7 @@ void constructPointSpinorSource(void *v, QudaPrecision precision, const int *con
 void constructWallSpinorSource(void *v, int nSpin, int nColor, QudaPrecision precision, const int dil);
 void constructRandomSpinorSource(void *v, int nSpin, int nColor, QudaPrecision precision, QudaSolutionType sol_type,
                                  const int *const x, int nDim, quda::RNG &rng);
+std::complex<double> twoColorSpinorContract(std::complex<double> *spinor1, std::complex<double> *spinor2);
 //------------------------------------------------------
 
 // Helper functions
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index b3041e85f5..3bcc32e22a 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -240,10 +240,10 @@ const char *get_gauge_smear_str(QudaGaugeSmearType type)
   switch (type) {
   case QUDA_GAUGE_SMEAR_APE: ret = "APE"; break;
   case QUDA_GAUGE_SMEAR_STOUT: ret = "Stout"; break;
-  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over Improved"; break;
+  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over_Improved"; break;
   case QUDA_GAUGE_SMEAR_HYP: ret = "HYP"; break;
-  case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson Flow"; break;
-  case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik Flow"; break;
+  case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson_Flow"; break;
+  case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik_Flow"; break;
   default: ret = "unknown"; break;
   }