From a5d3463c215abb0fe2167364490b6aceda4d69b4 Mon Sep 17 00:00:00 2001
From: James Osborn <osborn@alcf.anl.gov>
Date: Wed, 4 Oct 2023 15:49:06 -0500
Subject: [PATCH] make special ops initialization mandatory at construction
 time

---
 include/dslash_helper.cuh                     |  13 +-
 include/kernels/block_orthogonalize.cuh       |   4 +-
 include/kernels/block_transpose.cuh           |   4 +-
 include/kernels/clover_deriv.cuh              |   3 +-
 include/kernels/coarse_op_kernel.cuh          |   8 +-
 include/kernels/color_spinor_pack.cuh         |  10 +-
 include/kernels/covDev.cuh                    |   3 +-
 include/kernels/dslash_clover_helper.cuh      |  12 +-
 include/kernels/dslash_coarse.cuh             |   4 +-
 include/kernels/dslash_domain_wall_4d.cuh     |   3 +-
 .../dslash_domain_wall_4d_fused_m5.cuh        |   4 +-
 include/kernels/dslash_domain_wall_5d.cuh     |   3 +-
 include/kernels/dslash_domain_wall_m5.cuh     |   8 +-
 include/kernels/dslash_mobius_eofa.cuh        |  18 ++-
 .../kernels/dslash_ndeg_twisted_clover.cuh    |   8 +-
 ...ash_ndeg_twisted_clover_preconditioned.cuh |   4 +-
 include/kernels/dslash_ndeg_twisted_mass.cuh  |   3 +-
 ...slash_ndeg_twisted_mass_preconditioned.cuh |   4 +-
 include/kernels/dslash_staggered.cuh          |   3 +-
 .../dslash_twisted_clover_preconditioned.cuh  |   3 +-
 include/kernels/dslash_twisted_mass.cuh       |   3 +-
 .../dslash_twisted_mass_preconditioned.cuh    |   3 +-
 include/kernels/dslash_wilson.cuh             |   3 +-
 include/kernels/dslash_wilson_clover.cuh      |   3 +-
 .../dslash_wilson_clover_hasenbusch_twist.cuh |   3 +-
 ...clover_hasenbusch_twist_preconditioned.cuh |   3 +-
 .../dslash_wilson_clover_preconditioned.cuh   |   3 +-
 include/kernels/field_strength_tensor.cuh     |   3 +-
 include/kernels/gauge_ape.cuh                 |   6 +-
 include/kernels/gauge_fix_ovr.cuh             |   7 +-
 include/kernels/gauge_force.cuh               |   6 +-
 include/kernels/gauge_loop_trace.cuh          |   7 +-
 include/kernels/gauge_stout.cuh               |   7 +-
 include/kernels/gauge_wilson_flow.cuh         |   4 +-
 include/kernels/hisq_paths_force.cuh          |   8 +-
 include/kernels/laplace.cuh                   |   3 +-
 include/kernels/madwf_transfer.cuh            |   4 +-
 include/kernels/restrictor.cuh                |   4 +-
 include/targets/generic/helpers.h             |   6 +
 include/targets/generic/special_ops.h         |  33 ++---
 include/targets/sycl/block_reduce_helper.h    |  29 +++--
 include/targets/sycl/block_reduction_kernel.h |  10 ++
 include/targets/sycl/kernel.h                 |  20 +++
 include/targets/sycl/reduce_helper.h          |   8 +-
 include/targets/sycl/reduction_kernel.h       |  14 ++
 include/targets/sycl/shared_memory_helper.h   |   2 +
 include/targets/sycl/special_ops_target.h     | 120 +++++++++++-------
 include/targets/sycl/tunable_kernel.h         |  10 ++
 48 files changed, 321 insertions(+), 133 deletions(-)
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 8a6261699d..465d687979 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -662,15 +662,18 @@ namespace quda
     static constexpr bool dagger = Arg::dagger;
     static constexpr KernelType kernel_type = Arg::kernel_type;
     static constexpr const char *filename() { return Arg::D::filename(); }
-    constexpr dslash_functor(const Arg &arg) : arg(arg.arg) { }
+    using typename getSpecialOps<typename Arg::D>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr dslash_functor(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg.arg) { }
 
     template <bool allthreads = false>
     __forceinline__ __device__ void operator()(int, int s, int parity, bool active = true)
     {
-      typename Arg::D dslash(arg);
-      if constexpr (hasSpecialOps<typename Arg::D>) {
-	dslash.setSpecialOps(*this);
-      }
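+      // Build the dslash object directly from this functor: its constructor
+      // reads the kernel argument from our arg member and, when it has special
+      // ops, initializes its ops base from us as well.  This replaces the old
+      // two-step sequence of constructing from arg and then calling setSpecialOps(*this).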
+      typename Arg::D dslash(*this);
       // for full fields set parity from z thread index else use arg setting
       if (nParity == 1) parity = arg.parity;
 
diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh
index 69842d3084..4218bf0950 100644
--- a/include/kernels/block_orthogonalize.cuh
+++ b/include/kernels/block_orthogonalize.cuh
@@ -114,7 +114,9 @@ namespace quda {
     using dot_t = typename BlockOrtho_Params<Arg>::dot_t;
     using real = typename Arg::real;
 
-    constexpr BlockOrtho_(const Arg &arg) : arg(arg) {}
+    using typename BlockOrtho_Params<Arg>::Ops::KernelOpsT;
+    template <typename ...Ops>
+    constexpr BlockOrtho_(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void load(ColorSpinor<real, nColor, spinBlock> &v, int parity, int x_cb, int chirality, int i)
diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh
index 42fe9f7080..e227e1f1b1 100644
--- a/include/kernels/block_transpose.cuh
+++ b/include/kernels/block_transpose.cuh
@@ -57,7 +57,9 @@ namespace quda
 
   template <typename Arg> struct BlockTransposeKernel : BlockTransposeKernelOps<Arg>::Ops {
     const Arg &arg;
-    constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { }
+    using typename BlockTransposeKernelOps<Arg>::Ops::KernelOpsT;
+    template <typename ...OpsArgs>
+    constexpr BlockTransposeKernel(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     /**
diff --git a/include/kernels/clover_deriv.cuh b/include/kernels/clover_deriv.cuh
index 3623a9a40f..38ab548766 100644
--- a/include/kernels/clover_deriv.cuh
+++ b/include/kernels/clover_deriv.cuh
@@ -205,7 +205,8 @@ namespace quda
   template <typename Arg> struct CloverDerivative : computeForceOps
   {
     const Arg &arg;
-    constexpr CloverDerivative(const Arg &arg) : arg(arg) {}
+    template <typename ...Ops>
+    constexpr CloverDerivative(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __host__ __device__ void operator()(int x_cb, int parity, int mu)
diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index 07e0dd77e7..a329fbb3dd 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1702,7 +1702,9 @@ namespace quda {
     static constexpr int nFace = 1;
     const Arg &arg;
     static constexpr const char *filename() { return KERNEL_FILE; }
-    constexpr compute_vuv(const Arg &arg) : arg(arg) { }
+    using typename storeCoarseSharedAtomic_impl<true>::Ops<Arg>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr compute_vuv(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) { }
 
     /**
        3-d parallelism
@@ -1735,7 +1737,9 @@ namespace quda {
     static constexpr int nFace = 3;
     const Arg &arg;
     static constexpr const char *filename() { return KERNEL_FILE; }
-    constexpr compute_vlv(const Arg &arg) : arg(arg) { }
+    using typename storeCoarseSharedAtomic_impl<true>::Ops<Arg_>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr compute_vlv(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) { }
 
     /**
        3-d parallelism
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index ad5de2426d..d675408c88 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -292,11 +292,15 @@ namespace quda {
     }
   }
 
-  template <typename Arg_> struct GhostPacker :
-    std::conditional_t<Arg_::block_float, site_max<true>::Ops<Arg_>, NoSpecialOps> {
+  template <typename Arg_> using GhostPackerOps =
+    std::conditional_t<Arg_::block_float, site_max<true>::Ops<Arg_>, NoSpecialOps>;
+
+  template <typename Arg_> struct GhostPacker : GhostPackerOps<Arg_> {
     using Arg = Arg_;
     const Arg &arg;
-    constexpr GhostPacker(const Arg &arg) : arg(arg) {}
+    using typename GhostPackerOps<Arg>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr GhostPacker(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     template <bool allthreads = false>
diff --git a/include/kernels/covDev.cuh b/include/kernels/covDev.cuh
index 0d3a4d328a..46405548e6 100644
--- a/include/kernels/covDev.cuh
+++ b/include/kernels/covDev.cuh
@@ -124,7 +124,8 @@ namespace quda
     dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr covDev(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr covDev(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type>
diff --git a/include/kernels/dslash_clover_helper.cuh b/include/kernels/dslash_clover_helper.cuh
index 779a1833e4..a21beead5e 100644
--- a/include/kernels/dslash_clover_helper.cuh
+++ b/include/kernels/dslash_clover_helper.cuh
@@ -172,19 +172,23 @@ namespace quda {
       arg.out(x_cb, spinor_parity) = out;
     }
   };
-  
+
+  template <typename Arg> using NdegTwistCloverApplyOps =
+    SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin / 2>>>;
+
   // if (!inverse) apply (Clover + i*a*gamma_5*tau_3 + b*epsilon*tau_1) to the input spinor
   // else apply (Clover + i*a*gamma_5*tau_3 + b*epsilon*tau_1)/(Clover^2 + a^2 - b^2) to the input spinor
   // noting that appropriate signs are carried by a and b depending on inverse
-  template <typename Arg> struct NdegTwistCloverApply :
-    SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin / 2>>> {
+  template <typename Arg> struct NdegTwistCloverApply : NdegTwistCloverApplyOps<Arg> {
     static constexpr int N = Arg::nColor * Arg::nSpin / 2;
     using real = typename Arg::real;
     using fermion = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin>;
     using half_fermion = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin / 2>;
     using Mat = HMatrix<typename Arg::real, N>;
     const Arg &arg;
-    constexpr NdegTwistCloverApply(const Arg &arg) : arg(arg) {}
+    using typename NdegTwistCloverApplyOps<Arg>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr NdegTwistCloverApply(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char* filename() { return KERNEL_FILE; }
 
     template <bool allthreads = false>
diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 715020dc32..2ab59f09b4 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -338,7 +338,9 @@ namespace quda {
   template <typename Arg_> struct CoarseDslash : CoarseDslashParams<Arg_>::Ops {
     using Arg = Arg_;
     const Arg &arg;
-    constexpr CoarseDslash(const Arg &arg) : arg(arg) {}
+    using typename CoarseDslashParams<Arg>::Ops::KernelOpsT;
+    template <typename ...Ops>
+    constexpr CoarseDslash(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     template <bool allthreads = false>
diff --git a/include/kernels/dslash_domain_wall_4d.cuh b/include/kernels/dslash_domain_wall_4d.cuh
index 9a122d5fd8..dda8088430 100644
--- a/include/kernels/dslash_domain_wall_4d.cuh
+++ b/include/kernels/dslash_domain_wall_4d.cuh
@@ -28,7 +28,8 @@ namespace quda
   struct domainWall4D : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr domainWall4D(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr domainWall4D(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type>
diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
index cffb1f8031..5280b7c98e 100644
--- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
+++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
@@ -61,7 +61,9 @@ namespace quda
     static constexpr Dslash5Type dslash5_type = Arg::type;
 
     const Arg &arg;
-    constexpr domainWall4DFusedM5(const Arg &arg) : arg(arg) { }
+    using typename d5Params<Arg_>::Ops::KernelOpsT;
+    // Takes a functor: KernelOpsT is initialized from the functor itself and arg is bound to ftor.arg.
+    template <typename Ftor> constexpr domainWall4DFusedM5(const Ftor &ftor) : KernelOpsT(ftor), arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type, bool allthreads = false>
diff --git a/include/kernels/dslash_domain_wall_5d.cuh b/include/kernels/dslash_domain_wall_5d.cuh
index 80038ede52..da75217c1a 100644
--- a/include/kernels/dslash_domain_wall_5d.cuh
+++ b/include/kernels/dslash_domain_wall_5d.cuh
@@ -26,7 +26,8 @@ namespace quda
   struct domainWall5D : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr domainWall5D(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr domainWall5D(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
     static constexpr QudaPCType pc_type() { return QUDA_5D_PC; }
 
diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh
index 3a4f536291..f84408f8ca 100644
--- a/include/kernels/dslash_domain_wall_m5.cuh
+++ b/include/kernels/dslash_domain_wall_m5.cuh
@@ -333,7 +333,9 @@ namespace quda
   template <typename Arg_> struct dslash5 : d5Params<Arg_>::Ops {
     using Arg = Arg_;
     const Arg &arg;
-    constexpr dslash5(const Arg &arg) : arg(arg) { }
+    using typename d5Params<Arg_>::Ops::KernelOpsT;
+    template <typename ...OpsArgs>
+    constexpr dslash5(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     /**
@@ -589,7 +591,9 @@ namespace quda
   template <typename Arg_> struct dslash5inv : dslash5invParams<Arg_>::Ops {
     using Arg = Arg_;
     const Arg &arg;
-    constexpr dslash5inv(const Arg &arg) : arg(arg) {}
+    using typename dslash5invParams<Arg>::Ops::KernelOpsT;
+    template <typename ...OpsArgs>
+    constexpr dslash5inv(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     /**
diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh
index 98dc2abff1..195a36989d 100644
--- a/include/kernels/dslash_mobius_eofa.cuh
+++ b/include/kernels/dslash_mobius_eofa.cuh
@@ -90,6 +90,8 @@ namespace quda
       }
     };
 
+    template <typename Arg> using eofa_dslash5Ops =
+      SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, 4>>>;
     /**
       @brief Apply the D5 operator at given site
       @param[in] arg    Argument struct containing any meta data and accessors
@@ -97,10 +99,11 @@ namespace quda
       @param[in] x_cb   Checkerboarded 4-d space-time index
       @param[in] s      Ls dimension coordinate
      */
-    template <typename Arg> struct eofa_dslash5 :
-      SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, 4>>> {
+    template <typename Arg> struct eofa_dslash5 : eofa_dslash5Ops<Arg> {
       const Arg &arg;
-      constexpr eofa_dslash5(const Arg &arg) : arg(arg) {}
+      using typename eofa_dslash5Ops<Arg>::KernelOpsT;
+      template <typename ...Ops>
+      constexpr eofa_dslash5(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
       static constexpr const char *filename() { return KERNEL_FILE; }
 
       template <bool allthreads = false>
@@ -170,6 +173,8 @@ namespace quda
       }
     };
 
+    template <typename Arg> using eofa_dslash5invOps =
+      SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, 4>>>;
     /**
       @brief Apply the M5 inverse operator at a given site on the
       lattice.  This is the original algorithm as described in Kim and
@@ -182,10 +187,11 @@ namespace quda
       @param[in] x_cb   Checkerboarded 4-d space-time index
       @param[in] s      Ls dimension coordinate
      */
-    template <typename Arg> struct eofa_dslash5inv :
-      SpecialOps<SharedMemoryCache<ColorSpinor<typename Arg::real, Arg::nColor, 4>>> {
+    template <typename Arg> struct eofa_dslash5inv : eofa_dslash5invOps<Arg> {
       const Arg &arg;
-      constexpr eofa_dslash5inv(const Arg &arg) : arg(arg) {}
+      using typename eofa_dslash5invOps<Arg>::KernelOpsT;
+      template <typename ...Ops>
+      constexpr eofa_dslash5inv(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
       static constexpr const char *filename() { return KERNEL_FILE; }
 
       template <bool allthreads = false>
diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh
index 8dba5e0a53..d2a2e47c4e 100644
--- a/include/kernels/dslash_ndeg_twisted_clover.cuh
+++ b/include/kernels/dslash_ndeg_twisted_clover.cuh
@@ -40,16 +40,18 @@ namespace quda
     using real = typename mapper<typename Arg::Float>::type;
     using Vec = ColorSpinor<real, Arg::nColor, 4>;
     using Cache = SharedMemoryCache<Vec>;
-    using Ops = SpecialOps<Cache>;
+    // only the interior kernel gets the shared-memory cache ops; all other kernel types use NoSpecialOps
     //template <KernelType kernel_type>
-    //using Ops = conditional_t<kernel_type == INTERIOR_KERNEL,SpecialOps<Cache>,NoSpecialOps>;
+    using Ops = std::conditional_t<kernel_type == INTERIOR_KERNEL,SpecialOps<Cache>,NoSpecialOps>;
   };
 
   template <int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
   struct nDegTwistedClover : dslash_default, nDegTwistedCloverParams<kernel_type,Arg>::Ops {
 
     const Arg &arg;
-    constexpr nDegTwistedClover(const Arg &arg) : arg(arg) {}
+    using typename nDegTwistedCloverParams<kernel_type,Arg>::Ops::KernelOpsT;
+    // Takes a functor: KernelOpsT is initialized from the functor itself and arg is bound to ftor.arg.
+    template <typename Ftor> constexpr nDegTwistedClover(const Ftor &ftor) : KernelOpsT(ftor), arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
index b87014b6bb..66b8382b1b 100644
--- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
+++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
@@ -52,7 +52,9 @@ namespace quda
   struct nDegTwistedCloverPreconditioned : dslash_default, nDegTwistedCloverPreconditionedParams<Arg>::Ops {
 
     const Arg &arg;
-    constexpr nDegTwistedCloverPreconditioned(const Arg &arg) : arg(arg) {}
+    using typename nDegTwistedCloverPreconditionedParams<Arg>::Ops::KernelOpsT;
+    // Takes a functor: KernelOpsT is initialized from the functor itself and arg is bound to ftor.arg.
+    template <typename Ftor> constexpr nDegTwistedCloverPreconditioned(const Ftor &ftor) : KernelOpsT(ftor), arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_ndeg_twisted_mass.cuh b/include/kernels/dslash_ndeg_twisted_mass.cuh
index 6ed31be3ac..032bc40e8e 100644
--- a/include/kernels/dslash_ndeg_twisted_mass.cuh
+++ b/include/kernels/dslash_ndeg_twisted_mass.cuh
@@ -26,7 +26,8 @@ namespace quda
   struct nDegTwistedMass : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr nDegTwistedMass(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr nDegTwistedMass(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
index 4907b83d14..dae62d171f 100644
--- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
+++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
@@ -48,7 +48,9 @@ namespace quda
   struct nDegTwistedMassPreconditioned : dslash_default, nDegTwistedMassPreconditionedParams<dagger, Arg>::Ops {
 
     const Arg &arg;
-    constexpr nDegTwistedMassPreconditioned(const Arg &arg) : arg(arg) {}
+    using typename nDegTwistedMassPreconditionedParams<dagger,Arg>::Ops::KernelOpsT;
+    // Takes a functor: KernelOpsT is initialized from the functor itself and arg is bound to ftor.arg.
+    template <typename Ftor> constexpr nDegTwistedMassPreconditioned(const Ftor &ftor) : KernelOpsT(ftor), arg(ftor.arg) {}
     constexpr int twist_pack() const { return (!Arg::asymmetric && dagger) ? 2 : 0; }
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 20842735c3..327f672890 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -173,7 +173,8 @@ namespace quda
   struct staggered : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr staggered(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr staggered(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type>
diff --git a/include/kernels/dslash_twisted_clover_preconditioned.cuh b/include/kernels/dslash_twisted_clover_preconditioned.cuh
index c1dfbc144f..c9a7985192 100644
--- a/include/kernels/dslash_twisted_clover_preconditioned.cuh
+++ b/include/kernels/dslash_twisted_clover_preconditioned.cuh
@@ -40,7 +40,8 @@ namespace quda
   struct twistedCloverPreconditioned : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr twistedCloverPreconditioned(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr twistedCloverPreconditioned(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_twisted_mass.cuh b/include/kernels/dslash_twisted_mass.cuh
index c8276d01b8..da6d9f442b 100644
--- a/include/kernels/dslash_twisted_mass.cuh
+++ b/include/kernels/dslash_twisted_mass.cuh
@@ -24,7 +24,8 @@ namespace quda
   struct twistedMass : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr twistedMass(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr twistedMass(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_twisted_mass_preconditioned.cuh b/include/kernels/dslash_twisted_mass_preconditioned.cuh
index d6e66a7635..56557c0a13 100644
--- a/include/kernels/dslash_twisted_mass_preconditioned.cuh
+++ b/include/kernels/dslash_twisted_mass_preconditioned.cuh
@@ -135,7 +135,8 @@ namespace quda
   struct twistedMassPreconditioned : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr twistedMassPreconditioned(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr twistedMassPreconditioned(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
     constexpr int twist_pack() const { return (!Arg::asymmetric && dagger) ? 1 : 0; }
 
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 64c55bc587..8bc8062e6d 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -137,7 +137,8 @@ namespace quda
     dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr wilson(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr wilson(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     // out(x) = M*in = (-D + m) * in(x-mu)
diff --git a/include/kernels/dslash_wilson_clover.cuh b/include/kernels/dslash_wilson_clover.cuh
index 22d541dad1..46d09b881d 100644
--- a/include/kernels/dslash_wilson_clover.cuh
+++ b/include/kernels/dslash_wilson_clover.cuh
@@ -36,7 +36,8 @@ namespace quda
   struct wilsonClover : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr wilsonClover(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr wilsonClover(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh b/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
index 7b92016567..3381e92196 100644
--- a/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
+++ b/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
@@ -36,7 +36,8 @@ namespace quda
   struct cloverHasenbusch : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr cloverHasenbusch(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr cloverHasenbusch(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
     
     /**
diff --git a/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh b/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
index 261edfc0d1..61bb116aae 100644
--- a/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
+++ b/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
@@ -38,7 +38,8 @@ namespace quda
   struct cloverHasenbuschPreconditioned : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr cloverHasenbuschPreconditioned(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr cloverHasenbuschPreconditioned(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/dslash_wilson_clover_preconditioned.cuh b/include/kernels/dslash_wilson_clover_preconditioned.cuh
index 029af93027..dfe6b39dc6 100644
--- a/include/kernels/dslash_wilson_clover_preconditioned.cuh
+++ b/include/kernels/dslash_wilson_clover_preconditioned.cuh
@@ -34,7 +34,8 @@ namespace quda
   struct wilsonCloverPreconditioned : dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr wilsonCloverPreconditioned(const Arg &arg) : arg(arg) {}
+    // Takes a functor and binds its arg member; the NoSpecialOps base is default-initialized.
+    template <typename Ftor> constexpr wilsonCloverPreconditioned(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     /**
diff --git a/include/kernels/field_strength_tensor.cuh b/include/kernels/field_strength_tensor.cuh
index 0caf5c3a7e..b1ebb453c8 100644
--- a/include/kernels/field_strength_tensor.cuh
+++ b/include/kernels/field_strength_tensor.cuh
@@ -179,7 +179,8 @@ namespace quda
   template <typename Arg_> struct ComputeFmunu : computeFmunuCoreOps {
     using Arg = Arg_;
     const Arg &arg;
-    constexpr ComputeFmunu(const Arg &arg) : arg(arg) {}
+    template <typename ...Ops>
+    constexpr ComputeFmunu(const Arg &arg, const Ops &...ops) : computeFmunuCoreOps(ops...), arg(arg) {}
     static constexpr const char* filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int mu_nu)
diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index e0a7180ba1..693733cec4 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -41,10 +41,12 @@ namespace quda
       }
     }
   };
-  
+
   template <typename Arg> struct APE : computeStapleOps {
     const Arg &arg;
-    constexpr APE(const Arg &arg) : arg(arg) {}
+    // Takes the kernel argument plus any ops objects; the ops are forwarded to KernelOpsT.
+    template <typename ...Ops>
+    constexpr APE(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char* filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
diff --git a/include/kernels/gauge_fix_ovr.cuh b/include/kernels/gauge_fix_ovr.cuh
index b8a40b95a8..832b0fda28 100644
--- a/include/kernels/gauge_fix_ovr.cuh
+++ b/include/kernels/gauge_fix_ovr.cuh
@@ -143,7 +143,9 @@ namespace quda {
   //template <typename Arg> struct computeFix : SpecialOps<SharedMemoryCache<typename Arg::real>> {
   template <typename Arg> struct computeFix : computeFixOps<Arg> {
     const Arg &arg;
-    constexpr computeFix(const Arg &arg) : arg(arg) {}
+    using typename computeFixOps<Arg>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr computeFix(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     template <bool allthreads = false>
@@ -162,8 +164,7 @@ namespace quda {
         for (int dr = 0; dr < 4; dr++) p += arg.border[dr];
         getCoords(x, idx, arg.X, p + parity);
       } else {
-	if (!allthreads || active)
-	  idx = arg.borderpoints[parity][idx];  // load the lattice site assigment
+	if (!allthreads || active) idx = arg.borderpoints[parity][idx];  // load the lattice site assignment
         x[3] = idx / (X[0] * X[1]  * X[2]);
         x[2] = (idx / (X[0] * X[1])) % X[2];
         x[1] = (idx / X[0]) % X[1];
diff --git a/include/kernels/gauge_force.cuh b/include/kernels/gauge_force.cuh
index 2bf8e3d0ca..996c3bdc5b 100644
--- a/include/kernels/gauge_force.cuh
+++ b/include/kernels/gauge_force.cuh
@@ -46,9 +46,11 @@ namespace quda {
 
   template <typename Arg> struct GaugeForce : SpecialOps<thread_array<int,4>>
   {
+    using KOps = SpecialOps<thread_array<int,4>>;
     const Arg &arg;
-    constexpr GaugeForce(const Arg &arg) : arg(arg) {}
-    static constexpr const char *filename() { return KERNEL_FILE; }    
+    template <typename ...Ops>
+    constexpr GaugeForce(const Arg &arg, const Ops &...ops) : KOps(ops...), arg(arg) {}
+    static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ void operator()(int x_cb, int parity, int dir)
     {
diff --git a/include/kernels/gauge_loop_trace.cuh b/include/kernels/gauge_loop_trace.cuh
index 5ef20f8090..15648b3fd2 100644
--- a/include/kernels/gauge_loop_trace.cuh
+++ b/include/kernels/gauge_loop_trace.cuh
@@ -52,13 +52,16 @@ namespace quda {
     }
   };
 
-  template <typename Arg> struct GaugeLoop : plus<typename Arg::reduce_t>, SpecialOps<thread_array<int,4>>
+  template <typename Arg>
+  struct GaugeLoop : plus<typename Arg::reduce_t>, KernelOps<thread_array<int,4>>
   {
     using reduce_t = typename Arg::reduce_t;
     using plus<reduce_t>::operator();
     static constexpr int reduce_block_dim = 2; // x_cb and parity are mapped to x
     const Arg &arg;
-    constexpr GaugeLoop(const Arg &arg) : arg(arg) {}
+    //constexpr GaugeLoop(const Arg &arg) : arg(arg) {}
+    template <typename ...Ops>
+    constexpr GaugeLoop(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline reduce_t operator()(reduce_t &value, int x_cb, int parity, int path_id)
diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 8164edf19f..650576987d 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -53,7 +53,8 @@ namespace quda
     using Link = Matrix<complex<real>, Arg::nColor>;
 
     const Arg &arg;
-    constexpr STOUT(const Arg &arg) : arg(arg) {}
+    template <typename ...OpsArgs>
+    constexpr STOUT(const Arg &arg, const OpsArgs &...ops) : computeStapleOps(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
@@ -127,9 +128,11 @@ namespace quda
     using real = typename Arg::Float;
     using Complex = complex<real>;
     using Link = Matrix<complex<real>, Arg::nColor>;
+    using typename OvrImpSTOUTOps<Arg_>::Ops::KernelOpsT;
 
     const Arg &arg;
-    constexpr OvrImpSTOUT(const Arg &arg) : arg(arg) {}
+    template <typename ...OpsArgs>
+    constexpr OvrImpSTOUT(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh
index 82dd6ca3b0..43e25f117f 100644
--- a/include/kernels/gauge_wilson_flow.cuh
+++ b/include/kernels/gauge_wilson_flow.cuh
@@ -162,9 +162,11 @@ namespace quda
   //template <typename Arg_> struct WFlow
   template <typename Arg_> struct WFlow : computeStapleOpsWF<Arg_>::Ops
   {
+    using typename computeStapleOpsWF<Arg_>::Ops::KernelOpsT;
     using Arg = Arg_;
     const Arg &arg;
-    constexpr WFlow(const Arg &arg) : arg(arg) {}
+    template <typename ...OpsArgs>
+    constexpr WFlow(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh
index 46a7988180..b114eee9ec 100644
--- a/include/kernels/hisq_paths_force.cuh
+++ b/include/kernels/hisq_paths_force.cuh
@@ -323,7 +323,9 @@ namespace quda {
       static_assert(Param::nu_next_positive == -1, "nu_next_positive should be set to -1 for AllThreeAllLepageLink");
       static constexpr int compute_lepage = Param::compute_lepage;
 
-      constexpr AllThreeAllLepageLink(const Param &param) : arg(param.arg) {}
+      using typename AllThreeAllLepageLinkOps<Param>::Ops::KernelOpsT;
+      template <typename ...OpsArgs>
+      constexpr AllThreeAllLepageLink(const Param &param, const OpsArgs &...ops) : KernelOpsT(ops...), arg(param.arg) {}
       constexpr static const char *filename() { return KERNEL_FILE; }
 
       /**
@@ -691,7 +693,9 @@ namespace quda {
       static constexpr int nu_next_positive = Param::nu_next_positive; // if nu_next_positive == -1, skip
       static_assert(Param::compute_lepage == -1, "compute_lepage should be set to -1 for AllFiveAllSevenLink");
 
-      constexpr AllFiveAllSevenLink(const Param &param) : arg(param.arg) {}
+      using typename AllFiveAllSevenLinkOps<Param>::Ops::KernelOpsT;
+      template <typename ...OpsArgs>
+      constexpr AllFiveAllSevenLink(const Param &param, const OpsArgs &...ops) : KernelOpsT(ops...), arg(param.arg) {}
       constexpr static const char *filename() { return KERNEL_FILE; }
 
       /**
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index f1421c451c..b563854424 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -139,7 +139,8 @@ namespace quda
     dslash_default, NoSpecialOps {
 
     const Arg &arg;
-    constexpr laplace(const Arg &arg) : arg(arg) {}
+    //constexpr laplace(const Arg &arg) : arg(arg) {}
+    template <typename Ftor> constexpr laplace(const Ftor &ftor) : arg(ftor.arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type>
diff --git a/include/kernels/madwf_transfer.cuh b/include/kernels/madwf_transfer.cuh
index 65427f1088..cfb8563d90 100644
--- a/include/kernels/madwf_transfer.cuh
+++ b/include/kernels/madwf_transfer.cuh
@@ -102,7 +102,9 @@ namespace quda
     template <class Arg> struct Transfer5D : Transfer5DParams<Arg>::Ops {
 
       const Arg &arg;
-      constexpr Transfer5D(const Arg &arg) : arg(arg) { }
+      using typename Transfer5DParams<Arg>::Ops::KernelOpsT;
+      template <typename ...OpsArgs>
+      constexpr Transfer5D(const Arg &arg, const OpsArgs &...ops) : KernelOpsT(ops...), arg(arg) { }
       static constexpr const char *filename() { return KERNEL_FILE; }
 
       /**
diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index 59810f174c..115ea5ed99 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -115,7 +115,9 @@ namespace quda {
     using vector = typename RestrictorParams<Arg>::vector;
     using BlockReduce_t = typename RestrictorParams<Arg>::BlockReduce_t;
     const Arg &arg;
-    constexpr Restrictor(const Arg &arg) : arg(arg) {}
+    using typename SpecialOps<BlockReduce_t>::KernelOpsT;
+    template <typename ...Ops>
+    constexpr Restrictor(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) {}
     static constexpr const char *filename() { return KERNEL_FILE; }
 
     template <bool allthreads = false>
diff --git a/include/targets/generic/helpers.h b/include/targets/generic/helpers.h
index eeb6a65533..ce13676e6d 100644
--- a/include/targets/generic/helpers.h
+++ b/include/targets/generic/helpers.h
@@ -24,6 +24,12 @@ namespace quda
     }
   };
 
+  struct SizeZ {
+    static constexpr unsigned int size(dim3 block) {
+      return block.z;
+    }
+  };
+
   template <typename D, int N = 1> struct SizeDims {
     static constexpr unsigned int size(dim3 block) {
       dim3 dims = D::dims(block);
diff --git a/include/targets/generic/special_ops.h b/include/targets/generic/special_ops.h
index ee1906bc74..1584a7dd0c 100644
--- a/include/targets/generic/special_ops.h
+++ b/include/targets/generic/special_ops.h
@@ -37,16 +37,29 @@ namespace quda {
   // alternative to SpecialOps
   struct NoSpecialOps {
     using SpecialOpsT = NoSpecialOps;
+    using KernelOpsT = NoSpecialOps;
   };
   // SpecialOps forward declaration and base type
   template <typename ...T> struct SpecialOps;
+  template <typename ...T> using KernelOps = SpecialOps<T...>;
   template <typename ...T> struct SpecialOps_Base {
     using SpecialOpsT = SpecialOps<T...>;
+    using KernelOpsT = SpecialOps<T...>;
   };
   //template <typename ...T> struct SpecialOps : SpecialOpsTarget<T...> {
   //  using SpecialOpsT = SpecialOps<T...>;
   //};
 
+  // getSpecialOps
+  template <typename T, typename U = void> struct getSpecialOpsS { using type = NoSpecialOps; };
+  template <typename T> struct getSpecialOpsS<T,std::conditional_t<true,void,typename T::SpecialOpsT>> {
+    using type = typename T::SpecialOpsT;
+  };
+  template <typename ...T> struct getSpecialOpsS<SpecialOps<T...>,void> {
+    using type = SpecialOps<T...>;
+  };
+  template <typename T> using getSpecialOps = typename getSpecialOpsS<T>::type;
+
   // hasSpecialOp: checks if first type matches any of the op
   // <op, SpecialOps<ops...>>
   template <typename T, typename U> static constexpr bool hasSpecialOp = false;
@@ -54,14 +67,14 @@ namespace quda {
   static constexpr bool hasSpecialOp<T,SpecialOps<U...>> = ( std::is_same_v<T,U> || ... );
 
   //template <typename T, typename Ops> void checkSpecialOps() { static_assert(hasSpecialOp<T,Ops>); }
-  template <typename T, typename Ops> void checkSpecialOps(const Ops &) {
-    static_assert(hasSpecialOp<T,typename Ops::SpecialOpsT>);
+  //template <typename T, typename Ops> void checkSpecialOps(const Ops &) {
+  //static_assert(hasSpecialOp<T,typename Ops::SpecialOpsT>);
+  //}
+  template <typename ...T, typename Ops> void checkSpecialOps(const Ops &) {
+    static_assert((hasSpecialOp<T,typename Ops::SpecialOpsT> || ...));
   }
 
 
-
-
-
   // OLD
 
   template <typename ...T> struct op_Concurrent {};  // set of op types used concurrently (needs separate resources)
@@ -94,16 +107,6 @@ namespace quda {
   template <typename T, unsigned int S> using only_SharedMemStatic = only_SharedMemory<T,opSizeStatic<S>>;
   template <typename ...T> using only_Concurrent = SpecialOps<op_Concurrent<T...>>;
 
-  // getSpecialOps
-  template <typename T, typename U = void> struct getSpecialOpsS { using type = NoSpecialOps; };
-  template <typename T> struct getSpecialOpsS<T,std::conditional_t<true,void,typename T::SpecialOpsT>> {
-    using type = typename T::SpecialOpsT;
-  };
-  template <typename ...T> struct getSpecialOpsS<SpecialOps<T...>,void> {
-    using type = SpecialOps<T...>;
-  };
-  template <typename T> using getSpecialOps = typename getSpecialOpsS<T>::type;
-
   // explicitSpecialOps
   template <typename T, typename U = void> struct explicitSpecialOpsS : std::false_type {};
   template <typename T>
diff --git a/include/targets/sycl/block_reduce_helper.h b/include/targets/sycl/block_reduce_helper.h
index fbed3045ae..a91d8ea9b4 100644
--- a/include/targets/sycl/block_reduce_helper.h
+++ b/include/targets/sycl/block_reduce_helper.h
@@ -4,6 +4,7 @@
 #include <reducer.h>
 #include <group_reduce.h>
 #include <special_ops_target.h>
+#include <shared_memory_helper.h>
 
 /**
    @file block_reduce_helper.h
@@ -132,21 +133,26 @@ namespace quda
   */
 #define DYNAMIC_SLM
   template <typename T, int block_dim, int batch_size>
-  struct block_reduceW {
+  //struct block_reduceW {
+  struct block_reduceW : SharedMemory<T,SizeBlockDivWarp> {
+    using Smem = SharedMemory<T,SizeBlockDivWarp>;
+    //using Smem::shared_mem_size;
 #ifdef DYNAMIC_SLM
     using opSmem = op_SharedMemory<T,opSizeBlockDivWarp>;
+    //using opSmem = SharedMemory<T,opSizeBlockDivWarp>;
     using dependencies = op_Sequential<op_blockSync,opSmem>;
     using dependentOps = SpecialOps<op_blockSync,opSmem>;
-    template <typename ...Arg>
-    static constexpr size_t shared_mem_size(dim3 block, Arg &...arg) {
-      return opSizeBlockDivWarp::size<T>(block, arg...);
-    }
+    //template <typename ...Arg>
+    //static constexpr size_t shared_mem_size(dim3 block, Arg &...arg) {
+    //return opSizeBlockDivWarp::size<T>(block, arg...);
+    //}
 #else
 #endif
     using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
-    dependentOps ops;
+    //dependentOps ops;
     template <typename S>
-    inline block_reduceW(S &ops) : ops(getDependentOps<BlockReduce_t>(ops)) {};
+    //inline block_reduceW(S &ops) : ops(getDependentOps<BlockReduce_t>(ops)) {};
+    inline block_reduceW(S &ops) : Smem(ops) {};
 
     template <int width_> struct warp_reduce_param {
       static constexpr int width = width_;
@@ -183,7 +189,8 @@ namespace quda
 
       //__shared__ T storage[max_items];
 #ifdef DYNAMIC_SLM
-      auto storage = getSharedMemPtr<opSmem>(ops);
+      //auto storage = getSharedMemPtr<opSmem>(ops);
+      auto storage = Smem::sharedMem();
 #else
       static_assert(sizeof(T[max_items])<=device::shared_memory_size(), "Block reduce shared mem size too large");
       auto mem = sycl::ext::oneapi::group_local_memory_for_overwrite<T[max_items]>(getGroup());
@@ -192,7 +199,8 @@ namespace quda
 
       // if first thread in warp, write result to shared memory
       if (thread_idx % device::warp_size() == 0) storage[batch * warp_items + warp_idx] = value;
-      blockSync(ops);
+      //blockSync(ops);
+      __syncthreads();
 
       // whether to use the first warp or first thread for the final reduction
       constexpr bool final_warp_reduction = true;
@@ -216,7 +224,8 @@ namespace quda
 
       if (all) {
         if (thread_idx == 0) storage[batch * warp_items + 0] = value;
-	blockSync(ops);
+	//blockSync(ops);
+	__syncthreads();
         value = storage[batch * warp_items + 0];
       }
 
diff --git a/include/targets/sycl/block_reduction_kernel.h b/include/targets/sycl/block_reduction_kernel.h
index bd4c5cd0dd..bec9c85905 100644
--- a/include/targets/sycl/block_reduction_kernel.h
+++ b/include/targets/sycl/block_reduction_kernel.h
@@ -95,6 +95,7 @@ namespace quda
     const unsigned int k = globalIdZ;
     if (k >= arg.threads.z) return;
 
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -102,6 +103,10 @@ namespace quda
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
+
     f(block_idx, thread_idx);
   }
   template <template <typename> class Functor, typename Arg, typename ...S>
@@ -116,6 +121,7 @@ namespace quda
     const unsigned int k = globalIdZ;
     if (k >= arg.threads.z) active = false;
 
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -123,6 +129,10 @@ namespace quda
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
+
     f.template operator()<true>(block_idx, thread_idx, active);
   }
   template <template <typename> class Functor, typename Arg>
diff --git a/include/targets/sycl/kernel.h b/include/targets/sycl/kernel.h
index 5fed1f3cd9..ae37a93bd4 100644
--- a/include/targets/sycl/kernel.h
+++ b/include/targets/sycl/kernel.h
@@ -110,6 +110,7 @@ namespace quda {
   std::enable_if_t<!needsFullBlock<Functor<Arg>>, void>
   Kernel2DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem)
   {
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -117,6 +118,10 @@ namespace quda {
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    //Functor<Arg> f(arg, smem...);
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
 
     auto j = globalIdY;
     if (j >= arg.threads.y) return;
@@ -130,6 +135,7 @@ namespace quda {
   std::enable_if_t<needsFullBlock<Functor<Arg>>, void>
   Kernel2DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem)
   {
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -137,6 +143,10 @@ namespace quda {
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    //Functor<Arg> f(arg, smem...);
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
 
     bool active = true;
     auto j = globalIdY;
@@ -246,6 +256,7 @@ namespace quda {
   std::enable_if_t<!needsFullBlock<Functor<Arg>>, void>
   Kernel3DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem)
   {
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -253,6 +264,10 @@ namespace quda {
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    //Functor<Arg> f(arg, smem...);
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
 
     auto j = globalIdY;
     if (j >= arg.threads.y) return;
@@ -268,6 +283,7 @@ namespace quda {
   std::enable_if_t<needsFullBlock<Functor<Arg>>, void>
   Kernel3DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem)
   {
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -275,6 +291,10 @@ namespace quda {
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem...);
     }
+#else
+    //Functor<Arg> f(arg, smem...);
+    Ftor<Functor<Arg>> f(arg, ndi, smem...);
+#endif
 
     bool active = true;
     auto j = globalIdY;
diff --git a/include/targets/sycl/reduce_helper.h b/include/targets/sycl/reduce_helper.h
index 5b2a1c8330..ab298fcb92 100644
--- a/include/targets/sycl/reduce_helper.h
+++ b/include/targets/sycl/reduce_helper.h
@@ -119,6 +119,7 @@ namespace quda
     //using Ops = SpecialOps<BlockReduce_t,reduceConcurrentOps>;
     using opBlockSync = op_blockSync;
     using opSharedMem = op_SharedMemory<bool>;
+    using Smem = SharedMemory<bool, SizeZ>;
     using Ops = SpecialOps<BlockReduce_t,opBlockSync,opSharedMem>;
   };
 
@@ -158,9 +159,12 @@ namespace quda
       auto glmem = sycl::ext::oneapi::group_local_memory_for_overwrite<bool[n_batch_block]>(getGroup());
       auto isLastBlockDone = *glmem.get();
 #else
-      using opSharedMem = typename reduceParams<Arg, Reducer, T>::opSharedMem;
+      //using opSharedMem = typename reduceParams<Arg, Reducer, T>::opSharedMem;
       //auto isLastBlockDone = getSharedMemPtr(opSharedMem()(ops));
-      auto isLastBlockDone = getSharedMemPtr<opSharedMem>(ops);
+      //auto isLastBlockDone = getSharedMemPtr<opSharedMem>(ops);
+      using Smem = typename reduceParams<Arg, Reducer, T>::Smem;
+      Smem smem(ops);
+      auto isLastBlockDone = smem.sharedMem();
 #endif
 
     if (target::thread_idx().x == 0 && target::thread_idx().y == 0 && idx < arg.threads.z) {
diff --git a/include/targets/sycl/reduction_kernel.h b/include/targets/sycl/reduction_kernel.h
index dc46b98102..95b8240cd7 100644
--- a/include/targets/sycl/reduction_kernel.h
+++ b/include/targets/sycl/reduction_kernel.h
@@ -13,9 +13,13 @@ namespace quda {
   void Reduction2DImpl(const Arg &arg, const sycl::nd_item<3> &ndi, char *smem)
   {
     Functor<Arg> f(arg);
+#if 0
     typename reduceParams<Arg,Functor<Arg>,typename Functor<Arg>::reduce_t>::Ops rso;
     rso.setNdItem(ndi);
     rso.setSharedMem(smem);
+#else
+    typename reduceParams<Arg,Functor<Arg>,typename Functor<Arg>::reduce_t>::Ops rso{smem};
+#endif
     auto idx = globalIdX;
     auto j = localIdY;
     auto value = f.init();
@@ -153,6 +157,7 @@ namespace quda {
   {
     static_assert(!needsFullBlock<Functor<Arg>>);
     using reduce_t = typename Functor<Arg>::reduce_t;
+#if 0
     Functor<Arg> f(arg);
     if constexpr (hasSpecialOps<Functor<Arg>>) {
       f.setNdItem(ndi);
@@ -160,9 +165,18 @@ namespace quda {
     if constexpr (needsSharedMem<Functor<Arg>>) {
       f.setSharedMem(smem);
     }
+#else
+    //Functor<Arg> f(arg, smem...);
+    Ftor<Functor<Arg>> f(arg, ndi, smem);
+#endif
+
+#if 0
     typename reduceParams<Arg,Functor<Arg>,typename Functor<Arg>::reduce_t>::Ops rso;
     rso.setNdItem(ndi);
     rso.setSharedMem(smem);
+#else
+    typename reduceParams<Arg,Functor<Arg>,typename Functor<Arg>::reduce_t>::Ops rso{smem};
+#endif
 
     auto idx = globalIdX;
     auto k = localIdY;
diff --git a/include/targets/sycl/shared_memory_helper.h b/include/targets/sycl/shared_memory_helper.h
index c185813c8c..2344d3b833 100644
--- a/include/targets/sycl/shared_memory_helper.h
+++ b/include/targets/sycl/shared_memory_helper.h
@@ -42,6 +42,7 @@ namespace quda
     /**
        @brief Constructor for SharedMemory object.
     */
+#if 0
     SharedMemory() : size(S::size(target::block_dim()))
     {
       auto grp = getGroup();
@@ -50,6 +51,7 @@ namespace quda
       auto offset = get_offset(target::block_dim());
       data = *mem0.get() + offset;
     }
+#endif
 
     template <typename ...U>
     SharedMemory(const SpecialOps<U...> &ops) : size(S::size(target::block_dim()))
diff --git a/include/targets/sycl/special_ops_target.h b/include/targets/sycl/special_ops_target.h
index ffc226591e..f98450d527 100644
--- a/include/targets/sycl/special_ops_target.h
+++ b/include/targets/sycl/special_ops_target.h
@@ -4,6 +4,40 @@
 
 namespace quda {
 
+  // needsSharedMem
+#if 0
+  template <typename T> static constexpr bool needsSharedMem = needsSharedMem<getSpecialOps<T>>;
+  template <typename ...T> static constexpr bool needsSharedMemImpl = (needsSharedMemImpl<T> || ...);
+  template <> static constexpr bool needsSharedMemImpl<depNone> = false;
+  template <> static constexpr bool needsSharedMemImpl<depFullBlock> = false;
+  template <typename T, typename S> static constexpr bool needsSharedMemImpl<depSharedMem<T,S>> = true;
+  template <typename ...T> static constexpr bool needsSharedMemImpl<op_Concurrent<T...>> = needsSharedMemImpl<T...>;
+  template <typename ...T> static constexpr bool needsSharedMemImpl<op_Sequential<T...>> = needsSharedMemImpl<T...>;
+  template <typename T> static constexpr bool needsSharedMemF() {
+    if constexpr (std::is_base_of<op_Base,T>::value) {
+    //if constexpr (is_instance<T,op_Base>) {
+      return needsSharedMemImpl<typename T::dependencies>;
+    } else {
+      //if constexpr (hasSpecialOps<T>) {
+      //return needsSharedMem<getSpecialOps<T>>;
+      //} else {
+      //return false;
+      return needsSharedMem<typename T::dependentOps>;
+      //}
+    }
+  }
+  template <typename T> static constexpr bool needsSharedMemImpl<T> = needsSharedMemF<T>();
+  template <> static constexpr bool needsSharedMem<NoSpecialOps> = false;
+  template <typename ...T> static constexpr bool needsSharedMem<SpecialOps<T...>> = needsSharedMemImpl<T...>;
+#else
+  //template <typename ...T> static constexpr bool needsSharedMemImpl = (needsSharedMemImpl<T> || ...);
+  template <typename T> static constexpr bool needsSharedMemImpl = (T::shared_mem_size(dim3{8,8,8}) > 0);
+  template <typename... T> static constexpr bool needsSharedMemImpl<SpecialOps<T...>> = (needsSharedMemImpl<T> || ...);
+  template <typename T> static constexpr bool needsSharedMem = needsSharedMem<getSpecialOps<T>>;
+  template <typename... T> static constexpr bool needsSharedMem<SpecialOps<T...>> = (needsSharedMemImpl<T> || ...);
+  template <> static constexpr bool needsSharedMem<NoSpecialOps> = false;
+#endif
+
   // SpecialOps
   template <typename ...T>
   struct SpecialOps : SpecialOps_Base<T...> {
@@ -11,14 +45,32 @@ namespace quda {
     //using SpecialOpsT = op_Sequential<T...>;
     //using SpecialOpsT = SpecialOps<T...>;
     //using SpecialOpsElemType = typename SpecialOpsElemTypeS<T...>::type;
-    const sycl::nd_item<3> *ndi = nullptr;
+    //const sycl::nd_item<3> *ndi = nullptr;
     //char *smem;
     sycl::local_ptr<char> smem = nullptr;
-    inline void setNdItem(const sycl::nd_item<3> &i) { ndi = &i; }
+
+    //SpecialOps() = delete;
+    inline SpecialOps() {
+      static_assert(!needsSharedMem<SpecialOps<T...>>);
+    }
+    inline SpecialOps(char *s) {
+      static_assert(needsSharedMem<SpecialOps<T...>>);
+      smem = s;
+    }
+    template <typename ...U>
+    inline SpecialOps(const SpecialOps<U...> &ops) {
+      checkSpecialOps<T...>(ops);
+      if constexpr (needsSharedMem<SpecialOps<T...>>) {
+	smem = ops.smem;
+      }
+    }
+
+    //inline void setNdItem(const sycl::nd_item<3> &i) { ndi = &i; }
+    inline void setNdItem(const sycl::nd_item<3> &i) {}
     inline void setSharedMem(char *s) { smem = s; }
     template <typename ...U> inline void setSpecialOps(const SpecialOps<U...> &ops) {
       static_assert(std::is_same_v<SpecialOps<T...>,SpecialOps<U...>>);
-      ndi = ops.ndi;
+      //ndi = ops.ndi;
       smem = ops.smem;
     }
 #if 0
@@ -37,7 +89,8 @@ namespace quda {
     //  errorQuda("SpecialOps not set");
     //}
 #ifdef __SYCL_DEVICE_ONLY__
-    sycl::group_barrier(ops->ndi->get_group());
+    //sycl::group_barrier(ops->ndi->get_group());
+    sycl::group_barrier(getGroup());
 #endif
   }
   template <typename ...T> inline void blockSync(SpecialOps<T...> ops) { blockSync(&ops); }
@@ -50,6 +103,7 @@ namespace quda {
   template <typename T, typename U, typename ...V> static constexpr int getOpIndex<T, U, V...> =
     std::is_same_v<T,U> ? 0 : (1 + getOpIndex<T,V...>);
 
+#if 1
   // getSpecialOp
   template <typename U, int n = 0, typename ...T>
   inline SpecialOpsType<U,n> getSpecialOp(const SpecialOps<T...> &ops) {
@@ -62,7 +116,7 @@ namespace quda {
       //	errorQuda("SpecialOps not set");
       //}
       SpecialOpsType<U,n> s;
-      s.ndi = ops.ndi;
+      //s.ndi = ops.ndi;
       //s.smem = ops->smem + sharedMemOffset<U,n>()(ops->ndi->get_local_range());  // FIXME: need to pass arg
       s.smem = ops.smem + sharedMemOffset<U,n>()(getBlockDim());  // FIXME: need to pass arg
       return s;
@@ -73,7 +127,9 @@ namespace quda {
   template <typename U, int n = 0> struct getSpecialOpF {
     template <typename T> inline SpecialOpsType<U,n> operator()(const T &ops) { return getSpecialOp<U,n>(ops); }
   };
+#endif
 
+#if 0
   // getDependentOps
   template <typename U, int n = 0, typename ...T>
   inline SpecialOpDependencies<SpecialOpsType<U,n>> getDependentOps(const SpecialOps<T...> &ops) {
@@ -81,12 +137,22 @@ namespace quda {
     //if (ops->ndi == nullptr || ops->smem == nullptr) {
     //errorQuda("SpecialOps not set");
     //}
-    SpecialOpDependencies<SpecialOpsType<U,n>> s;
-    s.ndi = ops.ndi;
+    //SpecialOpDependencies<SpecialOpsType<U,n>> s;
+    //s.ndi = ops.ndi;
     //s.smem = ops->smem + sharedMemOffset<U,n>()(ops->ndi->get_local_range());  // FIXME: need to pass arg
-    s.smem = ops.smem + sharedMemOffset<U,n>()(getBlockDim());  // FIXME: need to pass arg
-    return s;
+    //s.smem = ops.smem + sharedMemOffset<U,n>()(getBlockDim());  // FIXME: need to pass arg
+    //return s;
+    using R = SpecialOpDependencies<SpecialOpsType<U,n>>;
+    if constexpr (needsSharedMem<R>) {
+      auto m = ops.smem + SpecialOps<U>::
+      R s{};
+      return s;
+    } else {
+      R s{};
+      return s;
+    }
   }
+#endif
 
   // getSharedMemPtr
 #if 0
@@ -130,12 +196,14 @@ namespace quda {
     return getSharedMemPtr(op);
   }
 
+#if 0
   template <typename T, typename O>
   inline auto getSharedMemory(O *ops)
   {
     auto s = getSpecialOp<T>(ops);
     return getSharedMemPtr(s);
   }
+#endif
 
   // base operation dependencies
   struct depNone {};
@@ -233,40 +301,6 @@ namespace quda {
 #endif
 
 
-  // needsSharedMem
-#if 0
-  template <typename T> static constexpr bool needsSharedMem = needsSharedMem<getSpecialOps<T>>;
-  template <typename ...T> static constexpr bool needsSharedMemImpl = (needsSharedMemImpl<T> || ...);
-  template <> static constexpr bool needsSharedMemImpl<depNone> = false;
-  template <> static constexpr bool needsSharedMemImpl<depFullBlock> = false;
-  template <typename T, typename S> static constexpr bool needsSharedMemImpl<depSharedMem<T,S>> = true;
-  template <typename ...T> static constexpr bool needsSharedMemImpl<op_Concurrent<T...>> = needsSharedMemImpl<T...>;
-  template <typename ...T> static constexpr bool needsSharedMemImpl<op_Sequential<T...>> = needsSharedMemImpl<T...>;
-  template <typename T> static constexpr bool needsSharedMemF() {
-    if constexpr (std::is_base_of<op_Base,T>::value) {
-    //if constexpr (is_instance<T,op_Base>) {
-      return needsSharedMemImpl<typename T::dependencies>;
-    } else {
-      //if constexpr (hasSpecialOps<T>) {
-      //return needsSharedMem<getSpecialOps<T>>;
-      //} else {
-      //return false;
-      return needsSharedMem<typename T::dependentOps>;
-      //}
-    }
-  }
-  template <typename T> static constexpr bool needsSharedMemImpl<T> = needsSharedMemF<T>();
-  template <> static constexpr bool needsSharedMem<NoSpecialOps> = false;
-  template <typename ...T> static constexpr bool needsSharedMem<SpecialOps<T...>> = needsSharedMemImpl<T...>;
-#else
-  //template <typename ...T> static constexpr bool needsSharedMemImpl = (needsSharedMemImpl<T> || ...);
-  template <typename T> static constexpr bool needsSharedMemImpl = (T::shared_mem_size(dim3{8,8,8}) > 0);
-  template <typename... T> static constexpr bool needsSharedMemImpl<SpecialOps<T...>> = (needsSharedMemImpl<T> || ...);
-  template <typename T> static constexpr bool needsSharedMem = needsSharedMem<getSpecialOps<T>>;
-  template <typename... T> static constexpr bool needsSharedMem<SpecialOps<T...>> = (needsSharedMemImpl<T> || ...);
-  template <> static constexpr bool needsSharedMem<NoSpecialOps> = false;
-#endif
-
   // tests
 #if 0
   static const int opTestArg = 10;
diff --git a/include/targets/sycl/tunable_kernel.h b/include/targets/sycl/tunable_kernel.h
index 44a40e3f47..7489a32152 100644
--- a/include/targets/sycl/tunable_kernel.h
+++ b/include/targets/sycl/tunable_kernel.h
@@ -385,4 +385,14 @@ namespace quda {
     return err;
   }
 
+  template <typename F, bool = hasSpecialOps<F>, bool = needsSharedMem<F>>
+  struct Ftor : F {
+    template <typename Arg, typename S>
+    Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S smem) : F{arg,smem} {}
+  };
+  template <typename F, bool ns> struct Ftor<F,ns,false> : F {
+    template <typename Arg, typename ...S>
+    Ftor(const Arg &arg, const sycl::nd_item<3> &ndi, S ...smem) : F{arg} {}
+  };
+
 }