diff --git a/.gitignore b/.gitignore index 9dbc07673..fd03d2e32 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ *.orig __pycache__/ view +*.cache* diff --git a/CMakeLists.txt b/CMakeLists.txt index 6519366b4..546d74076 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,16 +24,17 @@ set(ENABLE_MPI ON CACHE BOOL "") if (ENABLE_CUDA) - add_compile_definitions(USE_CUDA) - set(SERAC_USE_CUDA ON CACHE BOOL "") + #add_compile_definitions(USE_CUDA) + set(ENABLE_CLANG_CUDA ON CACHE BOOL "") set(SERAC_USE_CUDA ON CACHE BOOL "") set(RAJA_ENABLE_CUDA ON CACHE BOOL "") set(AXOM_USE_CUDA ON CACHE BOOL "") + set(MFEM_USE_CUDA ON CACHE BOOL "") endif() if (NOT MPI_C_COMPILER OR NOT MPI_CXX_COMPILER) - message(FATAL_ERROR + message(FATAL_ERROR "Serac requires MPI. It is required to provide the MPI C/C++ " "compiler wrappers via the CMake variables, " "MPI_C_COMPILER and MPI_CXX_COMPILER.") @@ -141,6 +142,10 @@ include(${PROJECT_SOURCE_DIR}/cmake/thirdparty/SetupSeracThirdParty.cmake) set(CMAKE_C_FLAGS ${SERAC_CMAKE_C_FLAGS}) set(CMAKE_CXX_FLAGS ${SERAC_CMAKE_CXX_FLAGS}) +string(REPLACE " -Werror" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -gdwarf-4 -fgpu-rdc" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_DEBUG "-fopenmp -gdwarf-4 -fgpu-rdc" CACHE STRING "") +#set(CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS ON CACHE BOOL "") include(${PROJECT_SOURCE_DIR}/cmake/SeracConfigHeader.cmake) @@ -206,7 +211,7 @@ if (SERAC_ENABLE_CODEVELOP) ) endif() -install(EXPORT serac-targets +install(EXPORT serac-targets NAMESPACE serac:: DESTINATION lib/cmake ) diff --git a/cmake/thirdparty/SetupSeracThirdParty.cmake b/cmake/thirdparty/SetupSeracThirdParty.cmake index 29aaa3a6a..59c9fca9f 100644 --- a/cmake/thirdparty/SetupSeracThirdParty.cmake +++ b/cmake/thirdparty/SetupSeracThirdParty.cmake @@ -15,7 +15,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Manually set includes as system includes foreach(_target cuda_runtime cuda) get_target_property(_dirs ${_target} INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET ${_target} + set_property(TARGET ${_target} APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") endforeach() @@ -59,7 +59,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #------------------------------------------------------------------------------ if(UMPIRE_DIR) serac_assert_is_directory(VARIABLE_NAME UMPIRE_DIR) - find_package(umpire REQUIRED NO_DEFAULT_PATH + find_package(umpire REQUIRED NO_DEFAULT_PATH PATHS ${UMPIRE_DIR}) message(STATUS "Umpire support is ON") set(UMPIRE_FOUND TRUE) @@ -73,7 +73,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #------------------------------------------------------------------------------ if(RAJA_DIR) serac_assert_is_directory(VARIABLE_NAME RAJA_DIR) - find_package(RAJA REQUIRED NO_DEFAULT_PATH + find_package(RAJA REQUIRED NO_DEFAULT_PATH PATHS ${RAJA_DIR}) message(STATUS "RAJA support is ON") set(RAJA_FOUND TRUE) @@ -100,7 +100,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Manually set includes as system includes get_target_property(_dirs conduit::conduit INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET conduit::conduit + set_property(TARGET conduit::conduit APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") @@ -154,10 +154,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) message(FATAL_ERROR "Serac+Caliper+CUDA requires CMake > 3.17.") else() find_package(CUDAToolkit REQUIRED) - endif() + endif() endif() - find_package(caliper REQUIRED NO_DEFAULT_PATH + find_package(caliper REQUIRED 
NO_DEFAULT_PATH PATHS ${CALIPER_DIR}) message(STATUS "Caliper support is ON") set(CALIPER_FOUND TRUE) @@ -177,7 +177,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) #### Store Data that MFEM clears set(tpls_to_save AMGX AXOM CALIPER CAMP CONDUIT HDF5 - HYPRE LUA METIS MFEM NETCDF PARMETIS PETSC RAJA + HYPRE LUA METIS MFEM NETCDF PARMETIS PETSC RAJA SUPERLU_DIST STRUMPACK SUNDIALS TRIBOL UMPIRE) foreach(_tpl ${tpls_to_save}) set(${_tpl}_DIR_SAVE "${${_tpl}_DIR}") @@ -271,7 +271,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) else() add_subdirectory(${PROJECT_SOURCE_DIR}/mfem ${CMAKE_BINARY_DIR}/mfem) endif() - + set(MFEM_FOUND TRUE CACHE BOOL "" FORCE) # Temporary hack to inject the hdf5_hl after netcdf and before hdf5 @@ -304,7 +304,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) serac_assert_is_directory(VARIABLE_NAME AXOM_DIR) find_package(axom REQUIRED - NO_DEFAULT_PATH + NO_DEFAULT_PATH PATHS ${AXOM_DIR}/lib/cmake) message(STATUS "Axom support is ON") @@ -420,10 +420,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Mark the axom includes as "system" and filter unallowed directories get_target_property(_dirs core INTERFACE_INCLUDE_DIRECTORIES) - set_property(TARGET core + set_property(TARGET core PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${_dirs}") - set_property(TARGET core + set_property(TARGET core APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") else() @@ -432,10 +432,10 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # Mark the axom includes as "system" and filter unallowed directories get_target_property(_dirs axom INTERFACE_INCLUDE_DIRECTORIES) list(REMOVE_ITEM _dirs ${PROJECT_SOURCE_DIR}) - set_property(TARGET axom + set_property(TARGET axom PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${_dirs}") - set_property(TARGET axom + set_property(TARGET axom APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_dirs}") endif() @@ -449,7 +449,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) serac_assert_is_directory(VARIABLE_NAME TRIBOL_DIR) find_package(tribol REQUIRED - NO_DEFAULT_PATH + NO_DEFAULT_PATH PATHS ${TRIBOL_DIR}/lib/cmake) if(TARGET tribol) @@ -468,7 +468,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) else() set(TRIBOL_FOUND OFF) endif() - + message(STATUS "Tribol support is " ${TRIBOL_FOUND}) #------------------------------------------------------------------------------ @@ -498,7 +498,7 @@ if (NOT SERAC_THIRD_PARTY_LIBRARIES_FOUND) # This flag is empty due to us not enabling fortran but we need to strip it # so it doesn't propagate in our project if("${OpenMP_Fortran_FLAGS}" STREQUAL "") - set(OpenMP_Fortran_FLAGS "$<$>:-fopenmp=libomp>;$<$:-fopenmp>") + set(OpenMP_Fortran_FLAGS "-fopenmp")#"$<$>:-fopenmp=libomp>;$<$:-fopenmp>") endif() foreach(_target axom) diff --git a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake index 83876a455..23b37f2cf 100644 --- a/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake +++ b/host-configs/lassen-blueos_3_ppc64le_ib_p9-clang@10.0.1.cmake @@ -24,13 +24,15 @@ if(DEFINED ENV{SPACK_CC}) else() set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-ibm-10.0.1-gcc-8.3.1/bin/clang" CACHE PATH "") + #set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-13.0.1-gcc-8.3.1/bin/clang" CACHE PATH "") set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-ibm-10.0.1-gcc-8.3.1/bin/clang++" CACHE PATH "") + #set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-13.0.1-gcc-8.3.1/bin/clang++" CACHE PATH "") set(CMAKE_Fortran_COMPILER 
"/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran" CACHE PATH "") endif() - +#-fopenmp -gdwarf-4 -fgpu-rdc set(CMAKE_C_STANDARD_LIBRARIES "-lgfortran" CACHE STRING "") set(CMAKE_CXX_STANDARD_LIBRARIES "-lgfortran" CACHE STRING "") @@ -65,13 +67,15 @@ set(BLT_MPI_COMMAND_APPEND "mpibind" CACHE STRING "") # Cuda #------------------------------------------------ -set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-11.2.0" CACHE PATH "") +set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-12.0.0" CACHE PATH "") +#set(CUDAToolkit_ROOT "/usr/tce/packages/cuda/cuda-10.1.105" CACHE PATH "") set(CMAKE_CUDA_COMPILER "${CUDAToolkit_ROOT}/bin/nvcc" CACHE PATH "") set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "") -set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-11.2.0" CACHE PATH "") +set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-12.0.0" CACHE PATH "") +#set(CUDA_TOOLKIT_ROOT_DIR "/usr/tce/packages/cuda/cuda-10.1.105" CACHE PATH "") set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "") @@ -79,15 +83,18 @@ set(ENABLE_OPENMP ON CACHE BOOL "") set(ENABLE_CUDA ON CACHE BOOL "") -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "") +set(ENABLE_CLANG_CUDA OFF CACHE BOOL "") set(CMAKE_CUDA_FLAGS " --expt-extended-lambda --expt-relaxed-constexpr " CACHE STRING "") -set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "") +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "") + +#set(CMAKE_CUDA_FLAGS " --expt-extended-lambda --expt-relaxed-constexpr " CACHE STRING "") +#set(CMAKE_CUDA_FLAGS "-fopenmp" CACHE STRING "") # nvcc does not like gtest's 'pthreads' flag -set(gtest_disable_pthreads ON CACHE BOOL "") +set(gtest_disable_pthreads OFF CACHE BOOL "") set(BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3" CACHE STRING "") diff --git a/src/serac/numerics/functional/boundary_integral_kernels.hpp b/src/serac/numerics/functional/boundary_integral_kernels.hpp index f2da655f2..6f5373561 100644 --- a/src/serac/numerics/functional/boundary_integral_kernels.hpp +++ b/src/serac/numerics/functional/boundary_integral_kernels.hpp @@ -182,41 +182,41 @@ void evaluation_kernel_impl(trial_element_type trial_elements, test_element, con #if defined(USE_CUDA) std::cout << "USING CUDA\n"; - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif // for each element in the domain - RAJA::forall( - RAJA::TypedRangeSegment(0, num_elements), - [J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE(uint32_t e) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) - (void)qf_derivatives; - (void)qpts_per_elem; - - static constexpr trial_element_type empty_trial_element{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(empty_trial_element).interpolate(get(u)[e], rule))...}; - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get(qf_inputs)...); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't 
need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[e]); - }); + // RAJA::forall( + // RAJA::TypedRangeSegment(0, num_elements), + // [J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE(uint32_t e) { + // // load the jacobians and positions for each quadrature point in this element + // auto J_e = J[e]; + // auto x_e = x[e]; + // // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) + // (void)qf_derivatives; + // (void)qpts_per_elem; + // + // static constexpr trial_element_type empty_trial_element{}; + // // batch-calculate values / derivatives of each trial space, at each quadrature point + // [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( + // get(empty_trial_element).interpolate(get(u)[e], rule))...}; + // + // // (batch) evalute the q-function at each quadrature point + // auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get(qf_inputs)...); + // + // // write out the q-function derivatives after applying the + // // physical_to_parent transformation, so that those transformations + // // won't need to be applied in the action_of_gradient and element_gradient kernels + // if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + // for (int q = 0; q < leading_dimension(qf_outputs); q++) { + // qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); + // } + // } + // + // // (batch) integrate the material response against the test-space basis functions + // test_element::integrate(get_value(qf_outputs), rule, &r[e]); + // }); } //clang-format off @@ -278,7 +278,7 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q constexpr TensorProductQuadratureRule rule{}; #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif @@ -334,7 +334,7 @@ void element_gradient_kernel(ExecArrayView dK, #if defined(USE_CUDA) std::cout << "USING CUDA :)\n"; - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif diff --git a/src/serac/numerics/functional/detail/hexahedron_H1.inl b/src/serac/numerics/functional/detail/hexahedron_H1.inl index ace6dc2ef..fbc8920f8 100644 --- a/src/serac/numerics/functional/detail/hexahedron_H1.inl +++ b/src/serac/numerics/functional/detail/hexahedron_H1.inl @@ -16,8 +16,9 @@ // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond +#include template -struct finite_element > { +struct finite_element> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -31,9 +32,9 @@ struct finite_element > { using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; SERAC_HOST_DEVICE static constexpr 
tensor shape_functions(tensor xi) @@ -72,7 +73,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -105,6 +106,28 @@ struct finite_element > { } return B; } +#ifdef USE_CUDA + // Compute B on device, and return a pointer to device memory. + template + static serac::tensor* calculate_B_device() + { + constexpr auto points1D = GaussLegendreNodes(); + [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights(); + tensor* B_ptr = nullptr; + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + B_ptr = static_cast*>(dest_allocator.allocate(sizeof(tensor))); + using policy = RAJA::cuda_exec<256>; + RAJA::forall(RAJA::TypedRangeSegment(0, q), [points1D, weights1D, B_ptr] SERAC_HOST_DEVICE(int i) { + (*B_ptr)[i] = GaussLobattoInterpolation(points1D[i]); + if constexpr (apply_weights) { + (*B_ptr)[i] = (*B_ptr)[i] * weights1D[i]; + } + }); + + return B_ptr; + } +#endif /** * @brief G(i,j) is the derivative of the @@ -129,6 +152,27 @@ struct finite_element > { return G; } +#ifdef USE_CUDA + template + static serac::tensor* calculate_G_device() + { + constexpr auto points1D = GaussLegendreNodes(); + [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights(); + tensor* G_ptr = nullptr; + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + G_ptr = static_cast*>(dest_allocator.allocate(sizeof(tensor))); + using policy = RAJA::cuda_exec<256>; + RAJA::forall(RAJA::TypedRangeSegment(0, q), [points1D, weights1D, G_ptr] SERAC_HOST_DEVICE(int i) { + (*G_ptr)[i] = GaussLobattoInterpolationDerivative(points1D[i]); + if constexpr (apply_weights) { + (*G_ptr)[i] = (*G_ptr)[i] * weights1D[i]; + } + }); + return G_ptr; + } +#endif + template static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) @@ -168,7 +212,9 @@ struct finite_element > { } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* foo = nullptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) { // we want to compute the following: // @@ -186,11 +232,20 @@ struct finite_element > { // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx) // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx) static constexpr bool apply_weights = false; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + + RAJA::RangeSegment x_range(0, BLOCK_SZ); + +#if not defined(USE_CUDA) tensor value{}; tensor gradient{}; + static constexpr auto B = calculate_B(); + static constexpr auto G = calculate_G(); for (int i = 0; i < c; i++) { auto A10 = contract<2, 1>(X[i], B); @@ -205,33 +260,112 @@ struct finite_element > { gradient(i, 1) = contract<0, 1>(A22, B); gradient(i, 2) = contract<0, 1>(A20, G); } +#else + + RAJA_TEAM_SHARED tensor value; + RAJA_TEAM_SHARED tensor gradient; + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + for (int i = 0; i < c; i++) { + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], G)) 
A11; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, B)) A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A11, B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, G)) A22; + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + + // Perform actual contractions + contract<2, 1>(X[i], B, &A10, qx, qy, qz); + + ctx.teamSync(); + contract<2, 1>(X[i], G, &A11, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A10, B, &A20, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A11, B, &A21, qx, qy, qz); + + ctx.teamSync(); + contract<1, 1>(A10, G, &A22, qx, qy, qz); + + ctx.teamSync(); + + contract<0, 1>(A20, B, &value(i), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A21, B, &gradient(i, 0), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A22, B, &gradient(i, 1), qx, qy, qz); + + ctx.teamSync(); + contract<0, 1>(A20, G, &gradient(i, 2), qx, qy, qz); + + ctx.teamSync(); + }); + } +#endif // transpose the quadrature data into a flat tensor of tuples - union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + + RAJA_TEAM_SHARED union { + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - for (int i = 0; i < c; i++) { - get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); - for (int j = 0; j < dim; j++) { - get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + if (qx >= q || qy >= q || qz >= q) { + return; + } + for (int i = 0; i < c; i++) { + get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); + for (int j = 0; j < dim; j++) { + get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); } } + }); + if (foo) { + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + (*foo)[tid] = output.one_dimensional[tid]; + } + }); } - return output.one_dimensional; } + template + static void copy_tensor_to_host(serac::tensor* src, DestinationTensor* dst) + { + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + + umpire::register_external_allocation( + dst, + umpire::util::AllocationRecord(dst, sizeof(DestinationTensor), rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + + rm.copy(dst, src); + } + + template + static void copy_tensor_to_host(serac::zero* src, DestinationTensor* dst) + { + *dst = serac::zero{}; + } + template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext ctx = RAJA::LaunchContext{}, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -239,39 +373,88 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; - static constexpr bool apply_weights = true; - static constexpr auto B = calculate_B(); - static constexpr auto G 
= calculate_G(); + /*static*/ constexpr bool apply_weights = true; + + RAJA::RangeSegment x_range(0, BLOCK_SZ); + +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + + using threads_x = RAJA::LoopPolicy; +#endif for (int j = 0; j < ntrial; j++) { for (int i = 0; i < c; i++) { s_buffer_type source; f_buffer_type flux; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - int Q = (qz * q + qy) * q + qx; - source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; - for (int k = 0; k < dim; k++) { - flux(k, qz, qy, qx) = - reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + if (qx >= q || qy >= q || qz >= q) { + return; } - } - - auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); - auto A21 = contract<2, 0>(flux(1), B); - auto A22 = contract<2, 0>(flux(2), B); + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + int Q = (qz * q + qy) * q + qx; + source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; + for (int k = 0; k < dim; k++) { + flux(k, qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; + } + }); +#if not defined USE_CUDA + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); + auto A21 = contract<2, 0>(flux(1), B); + auto A22 = contract<2, 0>(flux(2), B); auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G); auto A11 = contract<1, 0>(A22, B); element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G); +#else + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = tid / BLOCK_X; + int qz = tid / (BLOCK_X * BLOCK_Y); + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(source, B)) A20, A20_tmp; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(1), B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(2), B)) A22; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A20, B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A22, B)) A11; + ctx.teamSync(); + + contract<2, 0>(source, B, &A20, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(0), G, &A20, qx, qy, qz, true); + ctx.teamSync(); + + contract<2, 0>(flux(1), B, &A21, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(2), B, &A22, qx, qy, qz); + ctx.teamSync(); + + contract<1, 0>(A21, G, &A10, qx, qy, qz); + ctx.teamSync(); + contract<1, 0>(A20, B, &A10, qx, qy, qz, true); + ctx.teamSync(); + contract<1, 0>(A22, B, &A11, qx, qy, qz); + ctx.teamSync(); + + contract<0, 0>(A10, B, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + contract<0, 0>(A11, G, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + }); +#endif } } } diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp index becebf9d3..6ac6c5750 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.hpp +++ b/src/serac/numerics/functional/domain_integral_kernels.hpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) #pragma once +#include #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/quadrature_data.hpp" 
#include "serac/numerics/functional/function_signature.hpp" @@ -12,45 +13,31 @@ #include #include +#include +#include #include #include #include #ifdef USE_CUDA #include namespace { - void printCUDAMemUsage() { - int deviceCount = 0; - cudaGetDeviceCount(&deviceCount); - for (int i = 0; i < deviceCount; ++i) { - cudaSetDevice(i); - - size_t freeBytes, totalBytes; - cudaMemGetInfo(&freeBytes, &totalBytes); - size_t usedBytes = totalBytes - freeBytes; - - std::cout << "Device Number: " << i << std::endl; - std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; - } - } - -template -DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +void printCUDAMemUsage() { - auto& rm = umpire::ResourceManager::getInstance(); - auto dest_allocator = rm.getAllocator(destination); - - DataType* dest_data = static_cast(dest_allocator.allocate(size * sizeof(DataType))); - - // _sphinx_tag_tut_copy_start - rm.copy(dest_data, source_data); - // _sphinx_tag_tut_copy_end - - return dest_data; + int deviceCount = 0; + cudaGetDeviceCount(&deviceCount); + int i = 0; + cudaSetDevice(i); + + size_t freeBytes, totalBytes; + cudaMemGetInfo(&freeBytes, &totalBytes); + size_t usedBytes = totalBytes - freeBytes; + + std::cout << "Device Number: " << i << std::endl; + std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; } - template void deallocate(DataType* data, const std::string& destination) { @@ -59,7 +46,7 @@ void deallocate(DataType* data, const std::string& destination) dest_allocator.deallocate(data); } -} +} // namespace #endif namespace serac { @@ -77,37 +64,37 @@ SERAC_HOST_DEVICE struct QFunctionArgument; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple, tensor >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > { - using type = tuple, tensor >; ///< what will be passed to the q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<2> > { +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<2>> { using type = tuple, double>; ///< what will be passed to the q-function }; /// @overload template -SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<3> > { - using type = tuple, tensor >; ///< what will be passed to the 
q-function +SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<3>> { + using type = tuple, tensor>; ///< what will be passed to the q-function }; /// @brief layer of indirection needed to unpack the entries of the argument tuple @@ -143,32 +130,45 @@ SERAC_HOST_DEVICE auto apply_qf(lambda&& qf, coords_type&& x_q, qpt_data_type&& template auto get_derivative_type(lambda qf, qpt_data_type&& qpt_data) { - using qf_arguments = serac::tuple >::type...>; + using qf_arguments = serac::tuple>::type...>; return get_gradient(apply_qf(qf, tensor{}, qpt_data, make_dual_wrt(qf_arguments{}))); }; template -SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(lambda qf, const tensor x, const T&... inputs) +SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(lambda qf, const tensor x, RAJA::LaunchContext ctx, + const T&... inputs) { using return_type = decltype(qf(tensor{}, T{}[0]...)); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment x_range(0, n); tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::loop(ctx, x_range, [&](int i) { tensor x_q; for (int j = 0; j < dim; j++) { x_q[j] = x(j, i); } outputs[i] = qf(x_q, inputs[i]...); - } + }); return outputs; } template SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor x, qpt_data_type* qpt_data, - bool update_state, const T&... inputs) + bool update_state, RAJA::LaunchContext ctx, const T&... inputs) { using return_type = decltype(qf(tensor{}, qpt_data[0], T{}[0]...)); +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment x_range(0, n); tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::loop(ctx, x_range, [&](int i) { tensor x_q; for (int j = 0; j < dim; j++) { x_q[j] = x(j, i); @@ -179,7 +179,7 @@ SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor x, if (update_state) { qpt_data[i] = qdata; } - } + }); return outputs; } @@ -195,8 +195,8 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, te { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - using X_Type = typename batched_position::type; - using J_Type = typename batched_jacobian::type; + using X_Type = typename batched_position::type; + using J_Type = typename batched_jacobian::type; auto r = reinterpret_cast(outputs); auto x = const_cast(reinterpret_cast(positions)); auto J = const_cast(reinterpret_cast(jacobians)); @@ -206,95 +206,150 @@ void evaluation_kernel_impl(trial_element_tuple_type trial_elements, te [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; -#define USE_CUDA + + trial_element_tuple_type empty_trial_element{}; + using interpolate_out_type = + decltype(tuple{get(empty_trial_element).interpolate(get(u)[0], rule)...}); + + using type = decltype(tuple{promote_each_to_dual_when( + get(empty_trial_element).interpolate(get(u)[0], rule))...}); + #ifdef USE_CUDA + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator("DEVICE"); + type* qf_inputs = static_cast(dest_allocator.allocate(sizeof(type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(dest_allocator.allocate(sizeof(interpolate_out_type) * num_elements)); std::cout << "USING CUDA :)\n"; - printCUDAMemUsage(); - cudaSetDevice(0); - using policy = RAJA::cuda_exec<512>; - auto& rm = umpire::ResourceManager::getInstance(); - auto dest_allocator = rm.getAllocator("DEVICE"); - auto 
device_J = copy_data((J), serac::size(J), "DEVICE"); - auto device_x = copy_data((x), serac::size(x), "DEVICE"); - auto device_r = copy_data(r, serac::size(r), "DEVICE"); + auto device_J = copy_data(J, serac::size(*J) * sizeof(double), "DEVICE"); + auto device_x = copy_data(x, serac::size(*x) * sizeof(double), "DEVICE"); + auto device_r = copy_data(r, serac::size(*r) * sizeof(double), "DEVICE"); // These more complex types require a helper struct to deduce the data structure size. - decltype(u)* device_u = static_cast(dest_allocator.allocate(serac::size(u))); - rm.copy(device_u, &u); - auto device_qf_derivatives = static_cast(dest_allocator.allocate(serac::size(*qf_derivatives))); - rm.copy(device_qf_derivatives, qf_derivatives); + // decltype(u)* device_u = static_cast(dest_allocator.allocate(serac::size(u))); + + // vector of pointers to device inputs + + // umpire::register_external_allocation( + // &u, umpire::util::AllocationRecord(&u, serac::size(u) * sizeof(double), + // rm.getAllocator("HOST").getAllocationStrategy(), + // std::string("external array"))); + // rm.copy(device_u, &u); + // auto device_qf_derivatives = static_cast(dest_allocator.allocate(serac::size(*qf_derivatives) * + // sizeof(double))); umpire::register_external_allocation( + // qf_derivatives, umpire::util::AllocationRecord(qf_derivatives, serac::size(*qf_derivatives) * sizeof(double), + // rm.getAllocator("HOST").getAllocationStrategy(), + // std::string("external array"))); + // rm.copy(device_qf_derivatives, qf_derivatives); + + printCUDAMemUsage(); + cudaSetDevice(0); #else - using policy = RAJA::simd_exec; + type* qf_inputs = nullptr; + interpolate_out_type* interpolate_result = nullptr; + auto device_J = J; + auto device_x = x; + auto device_r = r; + // auto device_qf_derivatives = qf_derivatives; #endif + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + +#if defined(USE_CUDA) + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; +#else + using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; +#endif + std::cout << "NUM elements " << num_elements << std::endl; // for each element in the domain - RAJA::forall( - RAJA::TypedRangeSegment(0, num_elements), - [device_J, device_x, qf, device_u, qpts_per_elem, rule, device_r, qf_state, device_qf_derivatives, update_state] SERAC_HOST_DEVICE(uint32_t e) { - auto J_e = device_J[e]; - auto x_e = device_x[e]; - // load the jacobians and positions for each quadrature point in this element - - // Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list) - //(void)u; - (void)device_qf_derivatives; - (void)qpts_per_elem; - (void)update_state; - (void)qf_state; - - static constexpr trial_element_tuple_type empty_trial_element{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(empty_trial_element).interpolate(get(*device_u)[e], rule))...}; - - // use J_e to transform values / derivatives on the parent element - // to the to the corresponding values / derivatives on the physical element - (parent_to_physical(empty_trial_element).family>(get(qf_inputs), J_e), ...); - - // (batch) evalute the q-function at each quadrature point - // - // note: the weird immediately-invoked lambda expression is - // a workaround for a bug in GCC(<12.0) where it fails to - // decide which function overload to use, and crashes - - auto qf_outputs = [&]() { - if constexpr 
(std::is_same_v) { - return batch_apply_qf_no_qdata(qf, x_e, get(qf_inputs)...); - } else { - return batch_apply_qf(qf, x_e, &qf_state(e, 0), update_state, get(qf_inputs)...); - } - }(); - - // use J to transform sources / fluxes on the physical element - // back to the corresponding sources / fluxes on the parent element - physical_to_parent(qf_outputs, J_e); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - device_qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, e_range, + [&ctx, device_J, device_x, u, qf, qpts_per_elem, rule, device_r, qf_state, + /*device_*/ qf_derivatives, qf_inputs, interpolate_result, update_state](uint32_t e) { + if constexpr (test_element_type::geometry == mfem::Geometry::CUBE) { + // load the jacobians and positions for each quadrature point in this element + static constexpr trial_element_tuple_type empty_trial_element{}; + + // batch-calculate values / derivatives of each trial space, at each quadrature point + + (get(empty_trial_element) + .interpolate(get(u)[e], rule, &get(interpolate_result[e]), ctx), + ...); + + (promote_each_to_dual_when( + get(interpolate_result[e]), &get(qf_inputs[e]), ctx), + ...); + + //// use J_e to transform values / derivatives on the parent element + //// to the to the corresponding values / derivatives on the physical element + (parent_to_physical(empty_trial_element).family>( + get(qf_inputs[e]), device_J, e, ctx), + ...); + // parent_to_physical(empty_trial_element).family>(qf_inputs, device_J,e, ctx); + + ctx.teamSync(); + //// (batch) evalute the q-function at each quadrature point + //// + //// note: the weird immediately-invoked lambda expression is + //// a workaround for a bug in GCC(<12.0) where it fails to + //// decide which function overload to use, and crashes + // + auto qf_outputs = [&]() { + if constexpr (std::is_same_v) { + return batch_apply_qf_no_qdata(qf, device_x[e], ctx, get(qf_inputs[e])...); + } else { + return batch_apply_qf(qf, device_x[e], &qf_state(e, 0), update_state, ctx, + get(qf_inputs[e])...); + } + }(); - // (batch) integrate the material response against the test-space basis functions - test_element_type::integrate(get_value(qf_outputs), rule, &device_r[e]); - +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + // use J to transform sources / fluxes on the physical element + // back to the corresponding sources / fluxes on the parent element + // physical_to_parent(qf_outputs, J_e); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs)); + RAJA::loop(ctx, x_range, [&](int q) { + // qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = + // get_gradient(qf_outputs[q]); + }); + } + ctx.teamSync(); + + // (batch) integrate the material response 
against the test-space basis functions + test_element_type::integrate(get_value(qf_outputs), rule, &device_r[e], ctx); + } + }); }); - - rm.copy(&r, device_r); - - rm.copy(&u, device_u); - rm.copy(qf_derivatives, device_qf_derivatives); - - deallocate(device_J, "DEVICE"); - deallocate(device_x, "DEVICE"); - deallocate(device_u, "DEVICE"); - deallocate(device_r, "DEVICE"); - deallocate(device_qf_derivatives, "DEVICE"); +#ifdef USE_CUDA + rm.copy(r, device_r); + // rm.copy(&u, device_u); + // rm.copy(qf_derivatives, device_qf_derivatives); + deallocate(device_J, "DEVICE"); + deallocate(qf_inputs, "DEVICE"); + deallocate(interpolate_result, "DEVICE"); + deallocate(device_x, "DEVICE"); + deallocate(device_r, "DEVICE"); + std::cout << "312\n"; + printCUDAMemUsage(); + // deallocate(device_u, "DEVICE"); + // deallocate(device_qf_derivatives, "DEVICE"); +#endif return; } @@ -317,7 +372,8 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) //clang-format on template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +SERAC_HOST_DEVICE tensor(derivative_type{}, T{})), n> batch_apply_chain_rule( + derivative_type* qf_derivatives, const tensor& inputs, const RAJA::LaunchContext& ctx = RAJA::LaunchContext{}) { using return_type = decltype(chain_rule(derivative_type{}, T{})); tensor outputs{}; @@ -368,23 +424,34 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q auto dr = reinterpret_cast(dR); constexpr TensorProductQuadratureRule rule{}; + const int n_blocks_x = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_X); + const int n_blocks_y = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_Y); + const int n_blocks_z = RAJA_DIVIDE_CEILING_INT(Q, BLOCK_Z); + auto e_range = RAJA::RangeSegment(0, num_elements); + #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; #else - using policy = RAJA::simd_exec; + using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; #endif - // for each element in the domain - RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[e], rule); + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_X, BLOCK_Y, BLOCK_Z)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, e_range, [du, rule, &ctx, qf_derivatives, dr, num_qpts](int e) { + if constexpr (g == mfem::Geometry::CUBE) { + // (batch) interpolate each quadrature point's value + auto qf_inputs = trial_element::interpolate(du[e], rule, nullptr, ctx); - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); + auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[e]); - }); + test_element::integrate(qf_outputs, rule, &dr[e], ctx); + } + }); + }); } /** @@ -427,39 +494,58 @@ void element_gradient_kernel(ExecArrayView dK, constexpr int nquad = num_quadrature_points(g, Q); constexpr TensorProductQuadratureRule rule{}; #if defined(USE_CUDA) - std::cout << "USING CUDA :)\n"; - using policy = RAJA::cuda_exec<512>; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy>; #else 
- using policy = RAJA::simd_exec; + using teams_e = RAJA::LoopPolicy; + using launch_policy = RAJA::LaunchPolicy; #endif // for each element in the domain - RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(std::size_t e) { - static constexpr bool is_QOI_2 = test::family == Family::QOI; - [[maybe_unused]] auto* output_ptr = reinterpret_cast(&dK(e, 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - if constexpr (is_QOI_2) { - get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; - } else { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - }); -} +// RAJA::launch(RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)), +// [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { +// if constexpr (g == mfem::Geometry::CUBE) { +// static constexpr bool is_QOI_2 = test::family == Family::QOI; +// [[maybe_unused]] auto* output_ptr = +// reinterpret_cast(&dK(e, 0, 0)); +// +// (void*)qf_derivatives; +//#ifdef USE_CUDA +// using threads_x = RAJA::LoopPolicy; +//#else +// using threads_x = RAJA::LoopPolicy; +//#endif +// tensor derivatives{}; +// RAJA::RangeSegment x_range(0, nquad); +// RAJA::loop(ctx, x_range, [&](int q) { +// if constexpr (is_QOI_2) { +// get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; +// } else { +// derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; +// } +// }); +// printf("here 3, num elem %d\n", trial_element::ndof); +// +// RAJA::RangeSegment J_range(0, trial_element::ndof); +// RAJA::loop(ctx, x_range, [&](int J) { +// auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); +// test_element::integrate(source_and_flux, rule, output_ptr + J, +// trial_element::ndof); +// }); +// } +// }); +//}); +#if defined(USE_CUDA) + std::cout << "L480\n"; + printCUDAMemUsage(); +#endif +} // namespace domain_integral template std::function&, double*, bool)> evaluation_kernel( signature s, lambda_type qf, const double* positions, const double* jacobians, - std::shared_ptr > qf_state, std::shared_ptr qf_derivatives, + std::shared_ptr> qf_state, std::shared_ptr qf_derivatives, uint32_t num_elements) { auto trial_elements = trial_elements_tuple(s); diff --git a/src/serac/numerics/functional/dual.hpp b/src/serac/numerics/functional/dual.hpp index 949e0744c..a8c302a15 100644 --- a/src/serac/numerics/functional/dual.hpp +++ b/src/serac/numerics/functional/dual.hpp @@ -108,6 +108,8 @@ SERAC_HOST_DEVICE constexpr auto operator-(dual a, dual SERAC_HOST_DEVICE constexpr auto operator*(const dual& a, double b) { + // printf("dual operator* 3\n"); + // return a; return dual{a.value * b, a.gradient * b}; } @@ -115,6 +117,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(const dual& a, double template SERAC_HOST_DEVICE constexpr auto operator*(double a, const dual& b) { + // printf("dual operator* 2\n"); return dual{a * b.value, a * b.gradient}; } @@ -122,6 +125,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(double a, const dual& template SERAC_HOST_DEVICE constexpr auto operator*(dual a, dual b) { + // printf("dual operator* 1\n"); return dual{a.value * b.value, b.value * a.gradient + a.value * b.gradient}; } @@ -427,7 +431,7 @@ SERAC_HOST_DEVICE constexpr auto get_value(dual arg) /** @brief return the "gradient" part from a dual number type 
*/ template -SERAC_HOST_DEVICE constexpr auto get_gradient(dual arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(dual arg) { return arg.gradient; } diff --git a/src/serac/numerics/functional/element_restriction.hpp b/src/serac/numerics/functional/element_restriction.hpp index f082e9236..60edd891c 100644 --- a/src/serac/numerics/functional/element_restriction.hpp +++ b/src/serac/numerics/functional/element_restriction.hpp @@ -77,7 +77,7 @@ struct DoF { uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } /// get the index field of this `DoF` - + uint64_t index() const { return (bits & index_mask); } }; @@ -175,11 +175,11 @@ struct ElementRestriction { * @param i the index of the element * @param dofs (output) the DoFs associated with element `i` */ - + void GetElementVDofs(int i, DoF* vdofs) const; /// get the dof information for a given node / component - + DoF GetVDof(DoF node, uint64_t component) const; /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" diff --git a/src/serac/numerics/functional/finite_element.hpp b/src/serac/numerics/functional/finite_element.hpp index 28174f703..7c8221340 100644 --- a/src/serac/numerics/functional/finite_element.hpp +++ b/src/serac/numerics/functional/finite_element.hpp @@ -244,16 +244,25 @@ struct QOI { * @param jacobians the jacobians of the isoparametric map from parent to physical space of each quadrature point */ template -SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor& jacobians) +SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx = {}) { [[maybe_unused]] constexpr int VALUE = 0; [[maybe_unused]] constexpr int DERIVATIVE = 1; - - for (int k = 0; k < q; k++) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + RAJA::RangeSegment k_range(0, BLOCK_SZ); + RAJA::loop(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J[row][col] = jacobians(col, row, k); + J[row][col] = jacobians[block_idx](col, row, k); } } @@ -269,7 +278,7 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor(qf_input[k]) = dot(get(qf_input[k]), transpose(J)); } } - } + }); } /** diff --git a/src/serac/numerics/functional/functional.hpp b/src/serac/numerics/functional/functional.hpp index 67dafe6bd..a378754a7 100644 --- a/src/serac/numerics/functional/functional.hpp +++ b/src/serac/numerics/functional/functional.hpp @@ -33,7 +33,6 @@ #include #include - namespace serac { template @@ -519,6 +518,12 @@ class Functional { trial_space_(f.trial_space_[which]), df_(f.test_space_->GetTrueVSize()) { + // #ifdef USE_CUDA + // df_.UseDevice(true); + // input_L_.UseDevice(true); + // output_L_.UseDevice(true); + // output_T_.UseDevice(true); + // #endif } /** @@ -575,7 +580,10 @@ class Functional { integral.ComputeElementGradients(K_elem, which_argument); } - +#ifdef USE_CUDA + std::cout << "Printing USAGE before assemble\n"; + printCUDAMemUsage(); +#endif for (auto type : Integral::Types) { auto K_elem = element_gradients[type]; auto test_restrictions = form_.G_test_[type].restrictions; @@ -628,6 +636,10 @@ class Functional { } } } +#ifdef USE_CUDA + std::cout << "Printing USAGE AFTER assemble\n"; + printCUDAMemUsage(); +#endif // Copy the column indices to an auxilliary array as MFEM can mutate these during 
HypreParMatrix construction col_ind_copy_ = lookup_tables.col_ind; diff --git a/src/serac/numerics/functional/geometric_factors.cpp b/src/serac/numerics/functional/geometric_factors.cpp index d67ace5a8..fdb921497 100644 --- a/src/serac/numerics/functional/geometric_factors.cpp +++ b/src/serac/numerics/functional/geometric_factors.cpp @@ -45,13 +45,13 @@ void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobian auto J_q = reinterpret_cast(jacobians_q.ReadWrite()); auto X = reinterpret_cast(positions_e.Read()); #if defined(USE_CUDA) - using policy = RAJA::cuda_exec<512>; + using policy = RAJA::cuda_exec<32>; #else using policy = RAJA::simd_exec; #endif // for each element in the domain - //RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { - for (uint32_t e = 0; e < num_elements; ++e) { + // RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [=] SERAC_HOST_DEVICE(uint32_t e) { + for (uint32_t e = 0; e < num_elements; ++e) { // load the positions for the nodes in this element auto X_e = X[e]; @@ -73,7 +73,7 @@ void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobian } } } - } + } //}); } diff --git a/src/serac/numerics/functional/integral.hpp b/src/serac/numerics/functional/integral.hpp index 0b7889de4..977e78432 100644 --- a/src/serac/numerics/functional/integral.hpp +++ b/src/serac/numerics/functional/integral.hpp @@ -88,9 +88,30 @@ struct Integral { for (auto& [geometry, func] : kernels) { std::vector inputs(active_trial_spaces_.size()); for (std::size_t i = 0; i < active_trial_spaces_.size(); i++) { - inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); + if (input_E[uint32_t(active_trial_spaces_[i])].UseDevice()) { + std::cout << "is device " << std::endl; + } + input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).UseDevice(true); +#ifdef USE_CUDA + const auto& mfem_vec = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry); + inputs[i] = const_cast( + copy_data(const_cast(mfem_vec.Read()), mfem_vec.Size() * sizeof(double), "DEVICE")); +#else + inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); +#endif } +#ifdef USE_CUDA + std::cout << "L103" << std::endl; + printCUDAMemUsage(); +#endif func(inputs, output_E.GetBlock(geometry).ReadWrite(), update_state); +// Deallocate +#ifdef USE_CUDA + for (auto input : inputs) { + deallocate(const_cast(input), "DEVICE"); + } + printCUDAMemUsage(); +#endif } } @@ -111,7 +132,25 @@ struct Integral { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { for (auto& [geometry, func] : jvp_[functional_to_integral_index_.at(differentiation_index)]) { - func(input_E.GetBlock(geometry).Read(), output_E.GetBlock(geometry).ReadWrite()); + const auto& mfem_vec = input_E.GetBlock(geometry); +#ifdef USE_CUDA + auto device_input = const_cast( + copy_data(const_cast(mfem_vec.Read()), mfem_vec.Size() * sizeof(double), "DEVICE")); + auto device_output = copy_data(const_cast(output_E.GetBlock(geometry).ReadWrite()), + output_E.GetBlock(geometry).Size() * sizeof(double), "DEVICE"); +#else + auto device_input = mfem_vec.Read(); + auto device_output = output_E.GetBlock(geometry).ReadWrite(); +#endif + func(device_input, device_output); +#ifdef USE_CUDA + auto& rm = umpire::ResourceManager::getInstance(); + deallocate(const_cast(device_input), "DEVICE"); + rm.copy(output_E.GetBlock(geometry).ReadWrite(), device_output); + 
deallocate(device_output, "DEVICE"); + std::cout << "L148" << std::endl; + printCUDAMemUsage(); +#endif } } } diff --git a/src/serac/numerics/functional/tensor.hpp b/src/serac/numerics/functional/tensor.hpp index 3150550a8..36385842d 100644 --- a/src/serac/numerics/functional/tensor.hpp +++ b/src/serac/numerics/functional/tensor.hpp @@ -12,12 +12,40 @@ #pragma once +#define BLOCK_SZ 128 +#define BLOCK_X 8 +#define BLOCK_Y 8 +#define BLOCK_Z 4 + #include "serac/infrastructure/accelerator.hpp" #include "detail/metaprogramming.hpp" +#include +#include #include +namespace { +template +DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + + DataType* dest_data = static_cast(dest_allocator.allocate(size)); + + umpire::register_external_allocation( + source_data, umpire::util::AllocationRecord(source_data, size, rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + + // _sphinx_tag_tut_copy_start + rm.copy(dest_data, source_data); + // _sphinx_tag_tut_copy_end + + return dest_data; +} +} // namespace + namespace serac { /** @@ -702,6 +730,7 @@ template SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tensor& B) { tensor AB{}; + for (int i = 0; i < m; i++) { for (int j = 0; j < p; j++) { for (int k = 0; k < n; k++) { @@ -1248,7 +1277,7 @@ SERAC_HOST_DEVICE constexpr auto detApIm1(const tensor& A) // clang-format off // equivalent to tr(A) + I2(A) + det(A) - return A(0, 0) + A(1, 1) + A(2, 2) + return A(0, 0) + A(1, 1) + A(2, 2) - A(0, 1) * A(1, 0) * (1 + A(2, 2)) + A(0, 0) * A(1, 1) * (1 + A(2, 2)) - A(0, 2) * A(2, 0) * (1 + A(1, 1)) @@ -1324,12 +1353,8 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; - if constexpr (d3 != 0) return tensor{}; - }(); - if constexpr (d3 == 0) { + auto C = tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { U sum{}; @@ -1342,7 +1367,10 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { for (int k = 0; k < d3; k++) { @@ -1359,11 +1387,88 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor +SERAC_DEVICE constexpr auto deduce_contract_return_type(const tensor& A, const tensor& B) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + constexpr int new_dim = (i2 == 0) ? q : p; + constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + constexpr int d2 = (i1 == 1) ? new_dim : Adims[1]; + constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + if constexpr (d3 == 0) { + return serac::tensor{}; + } + return tensor{}; +} + +template +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const zero&, const tensor&) +{ + return zero{}; +} + +#ifdef USE_CUDA +template +SERAC_DEVICE void contract(const tensor& A, const tensor& B, tensor* C, int qx, + int qy, int qz, bool accumulate = false) +{ + constexpr int Adims[] = {m, n...}; + for (int i0 = qz; i0 < n0; i0 += BLOCK_Z) { + for (int i1 = qy; i1 < n1; i1 += BLOCK_Y) { + for (int i2 = qx; i2 < n2; i2 += BLOCK_X) { + double sum = (accumulate) ? 
(*C)(i0, i1, i2) : 0.0; + + for (int j = 0; j < Adims[l1]; j++) { + if constexpr (l1 == 0 && l2 == 1) { + sum += A(j, i1, i2) * B(i0, j); + } + if constexpr (l1 == 1 && l2 == 1) { + sum += A(i0, j, i2) * B(i1, j); + } + if constexpr (l1 == 2 && l2 == 1) { + sum += A(i0, i1, j) * B(i2, j); + } + } + + (*C)(i0, i1, i2) = sum; + } + } + } +} + +template +SERAC_DEVICE void contract(const zero&, const tensor&, zero*, int, int, int, bool accumulate = false) +{ + return; +} + +template +SERAC_DEVICE void contract(const zero&, const tensor&, tensor*, int, int, int, + bool accumulate = false) +{ + return; +} + +template +SERAC_DEVICE void contract(const tensor&, const tensor&, zero*, int, int, int, + bool accumulate = false) +{ + return; } +#endif + /// @overload template SERAC_HOST_DEVICE auto contract(const zero&, const T&) @@ -1591,6 +1696,62 @@ SERAC_HOST_DEVICE constexpr tensor inv(const tensor& return invA; } + +/** + * @overload + * @note Compute ith jth inverse element + */ +template +SERAC_HOST_DEVICE constexpr double inv_elem(const tensor& A) +{ + double inv_detA(1.0 / det(A)); + + if constexpr (i == 0 && j == 0) { + return (A[1][1] * A[2][2] - A[1][2] * A[2][1]) * inv_detA; + } else if constexpr (i == 0 && j == 1) { + return (A[0][2] * A[2][1] - A[0][1] * A[2][2]) * inv_detA; + } else if constexpr (i == 0 && j == 2) { + return (A[0][1] * A[1][2] - A[0][2] * A[1][1]) * inv_detA; + } else if constexpr (i == 1 && j == 0) { + return (A[1][2] * A[2][0] - A[1][0] * A[2][2]) * inv_detA; + } else if constexpr (i == 1 && j == 1) { + return (A[0][0] * A[2][2] - A[0][2] * A[2][0]) * inv_detA; + } else if constexpr (i == 1 && j == 2) { + return (A[0][2] * A[1][0] - A[0][0] * A[1][2]) * inv_detA; + } else if constexpr (i == 2 && j == 0) { + return (A[1][0] * A[2][1] - A[1][1] * A[2][0]) * inv_detA; + } else if constexpr (i == 2 && j == 1) { + return (A[0][1] * A[2][0] - A[0][0] * A[2][1]) * inv_detA; + } else if constexpr (i == 2 && j == 2) { + return (A[0][0] * A[1][1] - A[0][1] * A[1][0]) * inv_detA; + } + // error case + return 0; +} + +/** + * @overload + * @note Compute ith jth inverse element + */ +template +SERAC_HOST_DEVICE constexpr double inv_elem(const tensor& A) +{ + double inv_detA(1.0 / det(A)); + + if constexpr (i == 0 && j == 0) { + return A[1][1] * inv_detA; + } else if constexpr (i == 0 && j == 1) { + return -A[0][1] * inv_detA; + } else if constexpr (i == 1 && j == 0) { + return -A[1][0] * inv_detA; + } else if constexpr (i == 1 && j == 1) { + return A[0][0] * inv_detA; + } + + // error case + return 0; +} + /** * @overload * @note For N-by-N matrices with N > 3, requires Gaussian elimination @@ -1644,8 +1805,8 @@ inline SERAC_HOST_DEVICE void print(double value) { printf("%f", value); } * @brief print a tensor using `printf`, so that it is suitable for use inside cuda kernels. 
* @param[in] A The tensor to write out */ -template -SERAC_HOST_DEVICE void print(const tensor& A) +template +SERAC_HOST_DEVICE void print(const tensor& A) { printf("{"); print(A[0]); @@ -1867,10 +2028,7 @@ SERAC_HOST_DEVICE constexpr int size(const tensor*) * @tparam n the extents of each dimension * @return the total number of values stored in the tensor */ -SERAC_HOST_DEVICE constexpr int size(const zero*) -{ - return 1; -} +SERAC_HOST_DEVICE constexpr int size(const zero*) { return 1; } /** * @overload @@ -2012,12 +2170,12 @@ inline void eig(const r2tensor < 3, 3 > & A, // then just use the basis for the orthogonal complement // found earlier if (fabs(delta) <= 1.0e-15) { - + for (int i = 0; i < 3; i++){ Q(i,1) = s0(i); Q(i,2) = s1(i); - } - + } + // otherwise compute the remaining eigenvectors } else { @@ -2143,7 +2301,7 @@ inline mat < 2, 2 > look_at(const vec < 2 > & direction) { inline mat < 3, 3 > R3_basis(const vec3 & n) { float sign = (n[2] >= 0.0f) ? 1.0f : -1.0f; - float a = -1.0f / (sign + n[2]); + float a = -1.0f / (sign + n[2]); float b = n[0] * n[1] * a; return mat < 3, 3 >{ diff --git a/src/serac/numerics/functional/tests/CMakeLists.txt b/src/serac/numerics/functional/tests/CMakeLists.txt index b69210de6..a37f02c38 100644 --- a/src/serac/numerics/functional/tests/CMakeLists.txt +++ b/src/serac/numerics/functional/tests/CMakeLists.txt @@ -74,6 +74,4 @@ target_compile_definitions(functional_cuda PUBLIC RAJA_ENABLE_CUDA) target_compile_definitions(functional_basic_h1_scalar_cuda PUBLIC ENABLE_CUDA) target_compile_definitions(functional_basic_h1_vector_cuda PUBLIC USE_CUDA) - - target_compile_definitions(functional_basic_h1_vector_cuda PUBLIC RAJA_ENABLE_CUDA) endif() diff --git a/src/serac/numerics/functional/tests/check_gradient.hpp b/src/serac/numerics/functional/tests/check_gradient.hpp index 27f8c8e29..870b151e1 100644 --- a/src/serac/numerics/functional/tests/check_gradient.hpp +++ b/src/serac/numerics/functional/tests/check_gradient.hpp @@ -19,15 +19,24 @@ void check_gradient(serac::Functional& f, mfem::Vector& U, double epsil int seed = 42; mfem::Vector dU(U.Size()); + // Set memory backend to device if using GPU execution. 
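+  // (The device flag is set before dU is filled by Randomize() below, matching the
+  //  UseDevice-before-Randomize ordering used in the CUDA test drivers further down.)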
+ if constexpr (exec == serac::ExecutionSpace::GPU) { + dU.UseDevice(true); + } dU.Randomize(seed); auto [value, dfdU] = f(serac::differentiate_wrt(U)); std::unique_ptr dfdU_matrix = assemble(dfdU); // jacobian vector products - mfem::Vector df_jvp1 = dfdU(dU); // matrix-free + mfem::Vector df_jvp1; + df_jvp1.UseDevice(true); + df_jvp1 = dfdU(dU); // matrix-free mfem::Vector df_jvp2(df_jvp1.Size()); + if constexpr (exec == serac::ExecutionSpace::GPU) { + df_jvp2.UseDevice(true); + } dfdU_matrix->Mult(dU, df_jvp2); // sparse matvec if (df_jvp1.Norml2() != 0) { diff --git a/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp b/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp index a2a7a05ef..5fd3cd387 100644 --- a/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp +++ b/src/serac/numerics/functional/tests/functional_basic_h1_vector.cpp @@ -6,7 +6,7 @@ #include #include -#define USE_CUDA + #include "mfem.hpp" #include @@ -89,20 +89,21 @@ void weird_mixed_test(std::unique_ptr& mesh) auto [test_fes, test_col] = generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fes->TrueVSize()); - U.Randomize(); #ifdef USE_CUDA + U.UseDevice(true); Functional residual(test_fes.get(), {trial_fes.get()}); #else Functional residual(test_fes.get(), {trial_fes.get()}); #endif + U.Randomize(); // note: this is not really an elasticity problem, it's testing source and flux // terms that have the appropriate shapes to ensure that all the differentiation // code works as intended residual.AddDomainIntegral(Dimension{}, DependsOn<0>{}, MixedModelOne{}, *mesh); - residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, MixedModelTwo{}, *mesh); + // residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, MixedModelTwo{}, *mesh); check_gradient(residual, U); } @@ -118,20 +119,22 @@ void elasticity_test(std::unique_ptr& mesh) auto [test_fes, test_col] = generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fes->TrueVSize()); - U.Randomize(); #ifdef USE_CUDA + U.UseDevice(true); Functional residual(test_fes.get(), {trial_fes.get()}); #else Functional residual(test_fes.get(), {trial_fes.get()}); #endif + U.Randomize(); + // note: this is not really an elasticity problem, it's testing source and flux // terms that have the appropriate shapes to ensure that all the differentiation // code works as intended residual.AddDomainIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelOne{}, *mesh); - residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelTwo{}, *mesh); + // residual.AddBoundaryIntegral(Dimension{}, DependsOn<0>{}, ElasticityTestModelTwo{}, *mesh); check_gradient(residual, U); } @@ -157,16 +160,15 @@ void test_suite(std::string meshfile) } } - //TEST(VectorValuedH1, test_suite_tris) { test_suite("/data/meshes/patch2D_tris.mesh"); } - //TEST(VectorValuedH1, test_suite_quads) { test_suite("/data/meshes/patch2D_quads.mesh"); } -//TEST(VectorValuedH1, test_suite_tris_and_quads) { test_suite("/data/meshes/patch2D_tris_and_quads.mesh"); } - -//TEST(VectorValuedH1, test_suite_tets) { test_suite("/data/meshes/patch3D_tets.mesh"); } +// TEST(VectorValuedH1, test_suite_tris) { test_suite("/data/meshes/patch2D_tris.mesh"); } +// TEST(VectorValuedH1, test_suite_quads) { test_suite("/data/meshes/patch2D_quads.mesh"); } +// TEST(VectorValuedH1, test_suite_tris_and_quads) { test_suite("/data/meshes/patch2D_tris_and_quads.mesh"); } +// TEST(VectorValuedH1, test_suite_tets) { test_suite("/data/meshes/patch3D_tets.mesh"); } - 
TEST(VectorValuedH1, test_suite_hexes) { test_suite("/data/meshes/patch3D_hexes.mesh"); } +TEST(VectorValuedH1, test_suite_hexes) { test_suite("/data/meshes/patch3D_hexes.mesh"); } - TEST(VectorValuedH1, test_suite_tets_and_hexes) { test_suite("/data/meshes/patch3D_tets_and_hexes.mesh"); } +// TEST(VectorValuedH1, test_suite_tets_and_hexes) { test_suite("/data/meshes/patch3D_tets_and_hexes.mesh"); } int main(int argc, char* argv[]) { @@ -179,18 +181,18 @@ int main(int argc, char* argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &myid); axom::slic::SimpleLogger logger; -//cudaSetDevice(0); -//cudaDeviceReset(); -//#ifdef USE_CUDA -//printCUDAMemUsage(); -//#endif -//cudaSetDevice(2); -cudaDeviceSynchronize(); + // cudaSetDevice(0); + // cudaDeviceReset(); + //#ifdef USE_CUDA + // printCUDAMemUsage(); + //#endif + // cudaSetDevice(2); + int result = RUN_ALL_TESTS(); -cudaDeviceSynchronize(); -//#ifdef USE_CUDA -//printCUDAMemUsage(); -//#endif + + //#ifdef USE_CUDA + // printCUDAMemUsage(); + //#endif MPI_Finalize(); return result; diff --git a/src/serac/numerics/functional/tests/interpolate_device.cpp b/src/serac/numerics/functional/tests/interpolate_device.cpp new file mode 100644 index 000000000..db19f749c --- /dev/null +++ b/src/serac/numerics/functional/tests/interpolate_device.cpp @@ -0,0 +1,42 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + +#include +#include + +#include "mfem.hpp" + +#include + +#include "axom/slic/core/SimpleLogger.hpp" +#include "serac/infrastructure/input.hpp" +#include "serac/numerics/functional/finite_element.hpp" +#include "serac/serac_config.hpp" +#include "serac/mesh/mesh_utils_base.hpp" +#include "serac/numerics/stdfunction_operator.hpp" +#include "serac/numerics/functional/functional.hpp" +#include "serac/numerics/functional/tensor.hpp" + +#include "serac/numerics/functional/tests/check_gradient.hpp" + + + +int main { + using element_type = serac::finite_element>; + double* U_e; //input buffer, 2D array + double* U_q; //output buffer, 2D array + double* dU_dxi_q; //gradient 3D + // allocate buffers with umpire + + RAJA::forall() .. 
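+  // (The execution policy and range are elided in this sketch; an illustrative form,
+  //  with `num_elements` a hypothetical element count, would be
+  //  RAJA::forall<RAJA::cuda_exec<BLOCK_SZ>>(RAJA::RangeSegment(0, num_elements),
+  //                                          [=] RAJA_DEVICE (int e) { ... });)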
{ + using local_array_mem_policy = RAJA::cuda_shared_mem; + RAJA::LocalArray shared_buffer; + // copy input buffer into shared_buffer + element_type::interpolate() + // check U_q and gradient dU_dxi_q + } + return 0; +} \ No newline at end of file diff --git a/src/serac/numerics/functional/tuple.hpp b/src/serac/numerics/functional/tuple.hpp index 49c790e99..93244aa24 100644 --- a/src/serac/numerics/functional/tuple.hpp +++ b/src/serac/numerics/functional/tuple.hpp @@ -205,7 +205,7 @@ struct tuple_size_ptr> : std::integral_constant +template SERAC_HOST_DEVICE constexpr int size(const serac::tuple) { return tuple_size>::value; @@ -218,7 +218,7 @@ SERAC_HOST_DEVICE constexpr int size(const serac::tuple) * @tparam n the extents of each dimension * @return the total number of values stored in the tensor */ - template +template SERAC_HOST_DEVICE constexpr int size(const serac::tuple) { return tuple_size_ptr>::value; @@ -574,6 +574,7 @@ template SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const tuple& y, std::integer_sequence) { + // printf("mult helper* 3\n"); return tuple{get(x) * get(y)...}; } @@ -587,6 +588,7 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const tuple SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple& y) { + // printf("operator* 1\n"); static_assert(sizeof...(S) == sizeof...(T)); return mult_helper(x, y, std::make_integer_sequence(sizeof...(S))>()); } @@ -600,9 +602,10 @@ SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple -SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& x, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const TupleType& x, std::integer_sequence) { + // printf("mult helper* 1\n"); return tuple{a * get(x)...}; } @@ -615,9 +618,10 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& * @param a a constant multiplier * @return the returned tuple product */ -template -SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double a, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const TupleType& x, const double a, std::integer_sequence) { + // printf("mult helper* 2\n"); return tuple{get(x) * a...}; } @@ -630,6 +634,7 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double template SERAC_HOST_DEVICE constexpr auto operator*(const double a, const tuple& x) { + // printf("operator* 2\n"); return mult_helper(a, x, std::make_integer_sequence(sizeof...(T))>()); } @@ -642,6 +647,7 @@ SERAC_HOST_DEVICE constexpr auto operator*(const double a, const tuple& x) template SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const double a) { + // printf("operator* 3\n"); return mult_helper(x, a, std::make_integer_sequence(sizeof...(T))>()); } diff --git a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp index a1ef69f75..1de092542 100644 --- a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp +++ b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include "serac/numerics/functional/tuple.hpp" #include "serac/numerics/functional/tensor.hpp" #include "serac/numerics/functional/dual.hpp" @@ -220,17 +221,37 @@ SERAC_HOST_DEVICE auto promote_to_dual_when(const T& x) * @param x the values to be promoted */ template -SERAC_HOST_DEVICE auto promote_each_to_dual_when(const tensor& x) +SERAC_HOST_DEVICE auto 
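+// (The two extra parameters below support the GPU path: `output_ptr` points at a
+//  caller-allocated output tensor that is written through `casted_output_ptr` when
+//  USE_CUDA is defined, and `ctx` is the RAJA::LaunchContext consumed by RAJA::loop.)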
promote_each_to_dual_when(const tensor& x, void* output_ptr = nullptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) { if constexpr (dualify) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; +#else + using threads_x = RAJA::LoopPolicy; +#endif + + RAJA::RangeSegment x_range(0, n); using return_type = decltype(make_dual(T{})); tensor output; - for (int i = 0; i < n; i++) { + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop(ctx, x_range, [&](int i) { +#ifndef USE_CUDA output[i] = make_dual(x[i]); - } +#else + (*casted_output_ptr)[i] = make_dual(x[i]); +#endif + }); return output; } if constexpr (!dualify) { +#ifdef USE_CUDA + using threads_x = RAJA::LoopPolicy; + + RAJA::RangeSegment x_range(0, n); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop(ctx, x_range, [&](int i) { (*casted_output_ptr)[i] = x[i]; }); +#endif return x; } } @@ -503,7 +524,7 @@ SERAC_HOST_DEVICE auto get_value(const tensor, n...>& arg) * @param[in] arg The tensor of dual numbers */ template -SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor, n...>& arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(const tensor, n...>& arg) { tensor g{}; for_constexpr([&](auto... i) { g(i...) = arg(i...).gradient; }); @@ -512,7 +533,7 @@ SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor, n...>& /// @overload template -SERAC_HOST_DEVICE constexpr auto get_gradient(const tensor>, n...>& arg) +SERAC_HOST_DEVICE /*constexpr*/ auto get_gradient(const tensor>, n...>& arg) { tensor g{}; for_constexpr([&](auto... i) { g(i...) = arg(i...).gradient; });
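The anonymous `copy_data` helper added to tensor.hpp above is a thin wrapper over Umpire's allocator and copy machinery. For readers unfamiliar with that API, here is a minimal, self-contained sketch of the same host-to-device round trip; the buffer length `N` is illustrative, and the "DEVICE" allocator assumes a GPU-enabled Umpire build. Unlike `copy_data`, both buffers are allocated through Umpire, so no external-allocation registration step is needed.

#include <cstddef>

#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

int main()
{
  constexpr std::size_t N = 128;  // illustrative buffer length

  auto& rm = umpire::ResourceManager::getInstance();
  umpire::Allocator host_allocator   = rm.getAllocator("HOST");
  umpire::Allocator device_allocator = rm.getAllocator("DEVICE");  // GPU-enabled build only

  // allocating both buffers through Umpire registers them with the ResourceManager,
  // so rm.copy() can work out the direction of the transfer on its own
  auto* host_data   = static_cast<double*>(host_allocator.allocate(N * sizeof(double)));
  auto* device_data = static_cast<double*>(device_allocator.allocate(N * sizeof(double)));

  for (std::size_t i = 0; i < N; i++) {
    host_data[i] = static_cast<double>(i);
  }

  rm.copy(device_data, host_data);  // host -> device
  rm.copy(host_data, device_data);  // device -> host

  device_allocator.deallocate(device_data);
  host_allocator.deallocate(host_data);
  return 0;
}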
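The check_gradient.hpp changes above exercise the matrix-free Jacobian-vector product and the assembled matrix on device-flagged vectors. A central-difference probe is the usual independent cross-check; the sketch below shows that pattern for a generic residual callable `f` returning an mfem::Vector (the callable, the step `epsilon`, and the helper name are illustrative, not part of the Serac API):

#include "mfem.hpp"

// Central-difference check of a directional derivative, sketched for an arbitrary
// callable f : mfem::Vector -> mfem::Vector (e.g. a residual evaluation).
template <typename Residual>
double jvp_relative_error(Residual&& f, const mfem::Vector& U, const mfem::Vector& dU,
                          const mfem::Vector& df_jvp, double epsilon = 1.0e-6)
{
  mfem::Vector U_plus(U), U_minus(U);
  U_plus.Add(epsilon, dU);    // U + epsilon * dU
  U_minus.Add(-epsilon, dU);  // U - epsilon * dU

  mfem::Vector df_fd = f(U_plus);   // (f(U + eps dU) - f(U - eps dU)) / (2 eps)
  df_fd -= f(U_minus);
  df_fd /= 2.0 * epsilon;

  df_fd -= df_jvp;                  // compare against the analytic jacobian-vector product
  return df_fd.Norml2() / (df_jvp.Norml2() + 1.0e-30);
}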