From ea492d5aa51f69e473a45c77a94a36bf036cab3f Mon Sep 17 00:00:00 2001 From: stgeke Date: Sat, 19 Oct 2024 15:57:03 +0200 Subject: [PATCH 1/2] Import next changes SHA[8fc4e8dc84, 6a36ad3dda] --- 3rd_party/hypre/.gitignore | 11 +- .../hypre/AUTOTEST/check-license.filters | 1 + 3rd_party/hypre/AUTOTEST/check-mem.sh | 4 +- 3rd_party/hypre/AUTOTEST/machine-lassen.sh | 32 +- ...{machine-rztopaz.sh => machine-rzhound.sh} | 4 +- 3rd_party/hypre/AUTOTEST/machine-tux-spack.sh | 18 +- 3rd_party/hypre/CHANGELOG | 10 + 3rd_party/hypre/src/CMakeLists.txt | 43 +- 3rd_party/hypre/src/FEI_mv/fei-hypre/Makefile | 2 + 3rd_party/hypre/src/FEI_mv/femli/Makefile | 1 + 3rd_party/hypre/src/IJ_mv/HYPRE_IJMatrix.c | 267 ++++- 3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h | 74 +- 3rd_party/hypre/src/IJ_mv/IJMatrix_parcsr.c | 50 +- 3rd_party/hypre/src/IJ_mv/_hypre_IJ_mv.h | 4 +- 3rd_party/hypre/src/IJ_mv/aux_parcsr_matrix.c | 4 +- 3rd_party/hypre/src/IJ_mv/protos.h | 4 +- .../hypre/src/config/HYPREConfig.cmake.in | 1 + .../hypre/src/config/HYPRE_config.h.cmake.in | 3 +- 3rd_party/hypre/src/config/configure.in | 6 +- 3rd_party/hypre/src/config/update-cmake.py | 115 +++ 3rd_party/hypre/src/config/update-cmake.sh | 19 + 3rd_party/hypre/src/config/version.sh | 4 +- 3rd_party/hypre/src/configure | 24 +- .../distributed_ls/Euclid/mat_dh_private.c | 2 +- 3rd_party/hypre/src/lib/Makefile | 5 +- 3rd_party/hypre/src/multivector/interpreter.h | 4 +- .../parcsr_block_mv/par_csr_block_interp.c | 10 +- .../src/parcsr_block_mv/par_csr_block_rap.c | 2 +- 3rd_party/hypre/src/parcsr_ls/CMakeLists.txt | 8 +- .../hypre/src/parcsr_ls/HYPRE_parcsr_Euclid.c | 18 + .../hypre/src/parcsr_ls/HYPRE_parcsr_amg.c | 33 +- .../hypre/src/parcsr_ls/HYPRE_parcsr_ls.h | 117 ++- .../hypre/src/parcsr_ls/HYPRE_parcsr_mgr.c | 466 ++++++++- 3rd_party/hypre/src/parcsr_ls/Makefile | 1 + .../hypre/src/parcsr_ls/_hypre_parcsr_ls.h | 55 +- 3rd_party/hypre/src/parcsr_ls/ams.c | 14 +- 3rd_party/hypre/src/parcsr_ls/dsuperlu.c | 24 +- 3rd_party/hypre/src/parcsr_ls/par_amg.c | 56 +- 3rd_party/hypre/src/parcsr_ls/par_amg.h | 4 +- 3rd_party/hypre/src/parcsr_ls/par_amg_setup.c | 52 +- .../hypre/src/parcsr_ls/par_cg_relax_wt.c | 2 +- 3rd_party/hypre/src/parcsr_ls/par_coarsen.c | 86 +- .../hypre/src/parcsr_ls/par_coarsen_device.c | 29 +- .../hypre/src/parcsr_ls/par_fsai_setup.c | 29 +- .../hypre/src/parcsr_ls/par_gauss_elim.c | 6 + 3rd_party/hypre/src/parcsr_ls/par_gsmg.c | 2 +- 3rd_party/hypre/src/parcsr_ls/par_ilu.c | 2 +- 3rd_party/hypre/src/parcsr_ls/par_ilu_setup.c | 100 +- .../src/parcsr_ls/par_ilu_setup_device.c | 24 +- 3rd_party/hypre/src/parcsr_ls/par_ilu_solve.c | 2 +- 3rd_party/hypre/src/parcsr_ls/par_interp.c | 10 +- 3rd_party/hypre/src/parcsr_ls/par_lr_interp.c | 14 +- 3rd_party/hypre/src/parcsr_ls/par_mgr.c | 592 +++++------ 3rd_party/hypre/src/parcsr_ls/par_mgr.h | 50 +- .../hypre/src/parcsr_ls/par_mgr_device.c | 129 --- .../hypre/src/parcsr_ls/par_mgr_interp.c | 374 +++---- 3rd_party/hypre/src/parcsr_ls/par_mgr_rap.c | 665 +++++++++++++ 3rd_party/hypre/src/parcsr_ls/par_mgr_setup.c | 386 ++------ 3rd_party/hypre/src/parcsr_ls/par_mgr_solve.c | 99 +- 3rd_party/hypre/src/parcsr_ls/par_mgr_stats.c | 85 +- .../hypre/src/parcsr_ls/par_multi_interp.c | 9 +- .../hypre/src/parcsr_ls/par_nongalerkin.c | 8 + 3rd_party/hypre/src/parcsr_ls/par_rap.c | 17 +- 3rd_party/hypre/src/parcsr_ls/par_stats.c | 14 + 3rd_party/hypre/src/parcsr_ls/par_strength.c | 112 ++- 3rd_party/hypre/src/parcsr_ls/partial.c | 12 +- 3rd_party/hypre/src/parcsr_ls/protos.h | 51 +- 
3rd_party/hypre/src/parcsr_ls/schwarz.c | 14 +- 3rd_party/hypre/src/parcsr_mv/CMakeLists.txt | 6 + 3rd_party/hypre/src/parcsr_mv/Makefile | 4 + .../hypre/src/parcsr_mv/_hypre_parcsr_mv.h | 16 + .../hypre/src/parcsr_mv/par_csr_filter.c | 218 +++++ .../src/parcsr_mv/par_csr_filter_device.c | 356 +++++++ .../hypre/src/parcsr_mv/par_csr_matmat.c | 159 +++ .../src/parcsr_mv/par_csr_matmat_device.c | 146 +++ 3rd_party/hypre/src/parcsr_mv/par_csr_matop.c | 10 +- .../src/parcsr_mv/par_csr_matop_marked.c | 13 +- .../hypre/src/parcsr_mv/par_csr_matrix.c | 18 +- 3rd_party/hypre/src/parcsr_mv/par_vector.c | 2 +- 3rd_party/hypre/src/parcsr_mv/protos.h | 16 + 3rd_party/hypre/src/seq_block_mv/Makefile | 2 +- 3rd_party/hypre/src/seq_mv/CMakeLists.txt | 1 + 3rd_party/hypre/src/seq_mv/Makefile | 1 + 3rd_party/hypre/src/seq_mv/csr_filter.c | 52 + 3rd_party/hypre/src/seq_mv/csr_matop.c | 2 +- 3rd_party/hypre/src/seq_mv/csr_matop_device.c | 6 +- 3rd_party/hypre/src/seq_mv/protos.h | 3 + 3rd_party/hypre/src/seq_mv/seq_mv.h | 3 + .../hypre/src/sstruct_ls/HYPRE_sstruct_ls.h | 4 +- .../src/sstruct_ls/HYPRE_sstruct_maxwell.c | 4 +- .../hypre/src/sstruct_ls/_hypre_sstruct_ls.h | 4 +- .../hypre/src/sstruct_ls/fac_amr_fcoarsen.c | 1 + 3rd_party/hypre/src/sstruct_ls/fac_interp2.c | 4 +- 3rd_party/hypre/src/sstruct_ls/maxwell_TV.c | 2 +- .../src/sstruct_ls/maxwell_semi_interp.c | 5 + .../src/sstruct_ls/nd1_amge_interpolation.c | 1 + 3rd_party/hypre/src/sstruct_ls/node_relax.c | 8 +- 3rd_party/hypre/src/struct_ls/CMakeLists.txt | 2 + .../hypre/src/struct_ls/sparse_msg_solve.c | 4 +- .../hypre/src/struct_mv/_hypre_struct_mv.h | 6 +- 3rd_party/hypre/src/struct_mv/box.h | 6 +- 3rd_party/hypre/src/utilities/CMakeLists.txt | 2 + 3rd_party/hypre/src/utilities/HYPRE_handle.c | 15 + .../hypre/src/utilities/HYPRE_utilities.h | 143 ++- 3rd_party/hypre/src/utilities/Makefile | 3 +- .../hypre/src/utilities/_hypre_utilities.h | 315 +++--- 3rd_party/hypre/src/utilities/device_utils.c | 8 +- 3rd_party/hypre/src/utilities/general.c | 182 +--- 3rd_party/hypre/src/utilities/handle.c | 72 +- 3rd_party/hypre/src/utilities/handle.h | 16 + 3rd_party/hypre/src/utilities/headers | 2 +- .../hypre/src/utilities/int_array_device.c | 1 + 3rd_party/hypre/src/utilities/matrix_stats.c | 5 +- 3rd_party/hypre/src/utilities/memory.c | 739 +++++++++++++- 3rd_party/hypre/src/utilities/memory.h | 28 +- 3rd_party/hypre/src/utilities/protos.h | 7 +- 3rd_party/hypre/src/utilities/stl_ops.c | 53 + 3rd_party/hypre/src/utilities/utilities.c | 63 ++ .../src/occa/internal/lang/modes/dpcpp.cpp | 3 +- 3rd_party/update.sh | 2 +- CMakeLists.txt | 9 +- RELEASE.md | 17 +- cmake/nekrs.cmake | 1 + doc/envHelp.txt | 1 + doc/parHelp.txt | 1 + examples/CMakeLists.txt | 18 +- examples/channel/channel.udf | 5 - examples/eddyNekNek/ci.inc | 8 +- examples/eddyNekNek/eddy.udf | 5 - examples/ethier/ci.inc | 76 +- examples/ethier/ethier.par | 2 +- examples/ethier/ethier.udf | 9 +- examples/kershaw/kershaw.par | 8 +- examples/kershaw/kershaw.udf | 15 +- examples/ktauChannel/ci.inc | 2 +- examples/lowMach/lowMach.udf | 5 - examples/mv_cyl/mv_cyl.udf | 5 - examples/shlChannel/channel.udf | 5 - examples/turbPipe/turbPipe.udf | 2 +- scripts/nrsqsub_aurora | 8 +- scripts/nrsqsub_frontier | 61 +- scripts/nrsqsub_sunspot | 82 -- scripts/nrsqsub_utils | 15 +- src/bench/advsub/benchmark.cpp | 100 +- src/bench/advsub/main.cpp | 11 +- src/bench/axHelm/benchmark.cpp | 2 +- src/bench/core/kernelBenchmarker.cpp | 2 +- src/core/LVector.cpp | 2 +- src/core/avm.cpp | 93 +- src/core/comm.cpp | 79 +- 
src/core/comm.hpp | 11 +- src/core/device.cpp | 104 +- src/core/deviceMemory.hpp | 224 +++-- src/core/io/iofld.hpp | 23 +- src/core/io/iofldAdios.cpp | 304 ++++-- src/core/io/iofldAdios.hpp | 3 + src/core/io/iofldFactory.cpp | 2 +- src/core/io/iofldNek.cpp | 2 + src/core/kernelRequestManager.cpp | 149 +-- src/core/linAlg/kernels/absoluteError.okl | 4 +- src/core/linAlg/kernels/relativeError.okl | 4 +- src/core/linAlg/linAlg.cpp | 38 +- src/core/linAlg/linAlg.hpp | 4 +- src/core/nekrsSys.hpp.in | 4 +- src/core/ogs/oogs.cpp | 11 - src/core/opSEM.cpp | 44 +- src/core/platform.cpp | 33 +- src/core/platform.hpp | 7 +- src/core/registerCoreKernels.cpp | 28 +- src/core/threadPool.cpp | 43 + src/core/threadPool.hpp | 75 ++ src/elliptic/MG/MGSolver.cpp | 141 +-- src/elliptic/MG/ellipticBuildFEM.cpp | 3 - src/elliptic/MG/ellipticMultiGridLevel.cpp | 110 ++- .../MG/ellipticMultiGridLevelSetup.cpp | 24 +- src/elliptic/MG/ellipticMultiGridSetup.cpp | 8 +- src/elliptic/SEMFEMSolver.cpp | 4 +- src/elliptic/ellipticPreconditionerSetup.cpp | 1 - src/elliptic/ellipticSetup.cpp | 4 +- src/elliptic/ellipticSolutionProjection.cpp | 6 +- src/elliptic/ellipticSolve.cpp | 4 +- src/elliptic/ellipticWorkspace.cpp | 4 +- .../ellipticBlockPartialAxCoeffHex3D.okl | 363 +++++++ src/elliptic/linearSolver/PCG.cpp | 65 +- src/elliptic/linearSolver/PGMRES.cpp | 16 +- .../kernels/combinedPCGPostMatVec.c | 0 .../kernels/combinedPCGPostMatVec.okl | 0 .../kernels/combinedPCGPreMatVec.c | 2 +- .../kernels/combinedPCGPreMatVec.okl | 0 .../combinedPCGUpdateConvergedSolution.c | 4 +- .../combinedPCGUpdateConvergedSolution.okl | 0 .../kernels/ellipticBlockUpdatePCG.c | 0 .../kernels/ellipticBlockUpdatePCG.okl | 0 .../kernels/fusedResidualAndNorm.c | 0 .../kernels/fusedResidualAndNorm.okl | 0 .../kernels/gramSchmidtOrthogonalization.c | 0 .../kernels/gramSchmidtOrthogonalization.okl | 0 .../kernels/updatePGMRESSolution.c | 0 .../kernels/updatePGMRESSolution.okl | 0 src/elliptic/registerEllipticKernels.cpp | 9 +- src/mesh/mesh.h | 2 +- src/mesh/meshDistance.cpp | 6 +- src/mesh/meshGeometricFactorsHex3D.cpp | 2 +- src/mesh/meshIntp.cpp | 7 +- src/mesh/meshSurface.cpp | 2 +- src/mesh/planarAvg.cpp | 4 +- src/nekInterface/nekInterfaceAdapter.cpp | 391 +++++--- src/nrs/advectionSubCycling.cpp | 14 +- src/nrs/bdry/applyDirichlet.cpp | 124 +-- src/nrs/cds/cvode/cbGMRES.cpp | 14 +- src/nrs/cds/solve.cpp | 14 +- src/nrs/cfl.cpp | 19 +- src/nrs/constantFlowRate.cpp | 112 ++- src/nrs/neknek/fixCoupledSurfaceFlux.cpp | 7 +- src/nrs/neknek/multirateNekNek.cpp | 2 +- src/nrs/neknek/neknek.cpp | 33 +- src/nrs/nrs.cpp | 261 ++--- src/nrs/nrs.hpp | 4 +- src/nrs/plugins/RANSktau.cpp | 27 +- src/nrs/plugins/lowMach.cpp | 23 +- src/nrs/plugins/lpm.cpp | 6 +- src/nrs/plugins/velRecycling.cpp | 1 - src/nrs/postProcessing/Qcriterion.cpp | 4 +- src/nrs/postProcessing/aeroForces.cpp | 2 +- src/nrs/postProcessing/strainRotationRate.cpp | 4 +- src/nrs/timeStepper.cpp | 6 +- src/nrs/tombo.cpp | 106 +- src/plugins/nekAscent.hpp | 31 +- src/plugins/tavg.cpp | 2 - src/pointInterpolation/findpts/findpts.cpp | 925 +++++------------- src/pointInterpolation/findpts/findpts.hpp | 65 +- .../findpts/kernels/findptsLocal.okl | 2 + .../findpts/kernels/findptsLocalEval.okl | 2 +- .../findpts/kernels/findptsLocalEvalMask.okl | 2 +- src/pointInterpolation/pointInterpolation.cpp | 158 ++- src/pointInterpolation/pointInterpolation.hpp | 43 +- .../registerPointInterpolationKernels.cpp | 19 +- src/udf/udf.cpp | 10 +- 238 files changed, 8085 insertions(+), 4144 deletions(-) 
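Illustration (not part of the applied diff): among the hypre 2.32.0 changes imported below are new IJ-matrix query routines such as HYPRE_IJMatrixGetGlobalInfo and HYPRE_IJMatrixGetValues2, declared in HYPRE_IJ_mv.h further down in this patch. A minimal usage sketch, assuming an already assembled HYPRE_IJMatrix handle A and omitting error handling:

#include <stdio.h>
#include "HYPRE_utilities.h"
#include "HYPRE_IJ_mv.h"

/* Query the global shape and nonzero count of an assembled IJ matrix
 * via the HYPRE_IJMatrixGetGlobalInfo routine added in this import.
 * The call is collective: every rank owning part of A must participate. */
void report_global_info(HYPRE_IJMatrix A)
{
   HYPRE_BigInt nrows, ncols, nnz;

   HYPRE_IJMatrixGetGlobalInfo(A, &nrows, &ncols, &nnz);

   printf("A: %lld x %lld, %lld nonzeros\n",
          (long long) nrows, (long long) ncols, (long long) nnz);
}
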
rename 3rd_party/hypre/AUTOTEST/{machine-rztopaz.sh => machine-rzhound.sh} (94%) create mode 100755 3rd_party/hypre/src/config/update-cmake.py create mode 100755 3rd_party/hypre/src/config/update-cmake.sh create mode 100644 3rd_party/hypre/src/parcsr_ls/par_mgr_rap.c create mode 100644 3rd_party/hypre/src/parcsr_mv/par_csr_filter.c create mode 100644 3rd_party/hypre/src/parcsr_mv/par_csr_filter_device.c create mode 100644 3rd_party/hypre/src/parcsr_mv/par_csr_matmat.c create mode 100644 3rd_party/hypre/src/parcsr_mv/par_csr_matmat_device.c create mode 100644 3rd_party/hypre/src/seq_mv/csr_filter.c create mode 100644 3rd_party/hypre/src/utilities/stl_ops.c delete mode 100755 scripts/nrsqsub_sunspot create mode 100644 src/core/threadPool.cpp create mode 100644 src/core/threadPool.hpp rename src/elliptic/{ => linearSolver}/kernels/combinedPCGPostMatVec.c (100%) rename src/elliptic/{ => linearSolver}/kernels/combinedPCGPostMatVec.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/combinedPCGPreMatVec.c (96%) rename src/elliptic/{ => linearSolver}/kernels/combinedPCGPreMatVec.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/combinedPCGUpdateConvergedSolution.c (92%) rename src/elliptic/{ => linearSolver}/kernels/combinedPCGUpdateConvergedSolution.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/ellipticBlockUpdatePCG.c (100%) rename src/elliptic/{ => linearSolver}/kernels/ellipticBlockUpdatePCG.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/fusedResidualAndNorm.c (100%) rename src/elliptic/{ => linearSolver}/kernels/fusedResidualAndNorm.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/gramSchmidtOrthogonalization.c (100%) rename src/elliptic/{ => linearSolver}/kernels/gramSchmidtOrthogonalization.okl (100%) rename src/elliptic/{ => linearSolver}/kernels/updatePGMRESSolution.c (100%) rename src/elliptic/{ => linearSolver}/kernels/updatePGMRESSolution.okl (100%) diff --git a/3rd_party/hypre/.gitignore b/3rd_party/hypre/.gitignore index 98a666278..4b1d9c762 100644 --- a/3rd_party/hypre/.gitignore +++ b/3rd_party/hypre/.gitignore @@ -7,6 +7,12 @@ *.err *.out *.perf +*.btr +*.patch +.gdbinit +build.sh +update.sh +make.log config.log config.status Makefile.config @@ -15,6 +21,9 @@ autom4te.cache src/TAGS hypre/ cmbuild/ +install/ +AUTOTEST/*.dir +.vscode ############### # Documentation @@ -46,4 +55,4 @@ src/test/zboxloop src/examples/ex[0-9] src/examples/ex1[0-6] src/examples/ex5f -src/examples/ex12f \ No newline at end of file +src/examples/ex12f diff --git a/3rd_party/hypre/AUTOTEST/check-license.filters b/3rd_party/hypre/AUTOTEST/check-license.filters index 72e32da10..24f5dc8b4 100644 --- a/3rd_party/hypre/AUTOTEST/check-license.filters +++ b/3rd_party/hypre/AUTOTEST/check-license.filters @@ -17,3 +17,4 @@ ./src/config.status .gitignore TVD.v3breakpoints +.vscode diff --git a/3rd_party/hypre/AUTOTEST/check-mem.sh b/3rd_party/hypre/AUTOTEST/check-mem.sh index b7fac7b6e..94274cc3f 100755 --- a/3rd_party/hypre/AUTOTEST/check-mem.sh +++ b/3rd_party/hypre/AUTOTEST/check-mem.sh @@ -38,7 +38,9 @@ find . 
-type f -print | egrep '[.]*[.](c|cc|cpp|cxx|C|h|hpp|hxx|H)$' | egrep -v '/FEI_mv' | egrep -v '/hypre/include' | egrep -v '/utilities/memory_tracker.c' | - egrep -v '/utilities/memory.c' > check-mem.files + egrep -v '/utilities/memory.c' | + egrep -v '/utilities/general.c' | + egrep -v '/utilities/device_utils.c' > check-mem.files egrep '(^|[^[:alnum:]_]+)malloc[[:space:]]*\(' `cat check-mem.files` >&2 egrep '(^|[^[:alnum:]_]+)calloc[[:space:]]*\(' `cat check-mem.files` >&2 diff --git a/3rd_party/hypre/AUTOTEST/machine-lassen.sh b/3rd_party/hypre/AUTOTEST/machine-lassen.sh index f43e73246..ff2f5c8f9 100755 --- a/3rd_party/hypre/AUTOTEST/machine-lassen.sh +++ b/3rd_party/hypre/AUTOTEST/machine-lassen.sh @@ -47,57 +47,57 @@ save="lassen" ###################### ## DEFAULT CUDA ## -## (cuda/10.1.243) ## +## (cuda/11.2.0) ## ###################### module -q load cuda module -q load xl # CUDA with UM in debug mode [ij, ams, struct, sstruct] -co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-ij-gpu -ams -struct -sstruct -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}" eo="-gpu -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro -eo: $eo ./renametest.sh basic $output_dir/basic-cuda-um # CUDA with UM in debug mode [ij, ams, struct, sstruct] -co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-print-errors --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-unified-memory --enable-persistent --enable-debug --with-print-errors --with-gpu-arch=70 --with-memory-tracker --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-error -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-cuda-um-with-errors # CUDA with UM and mixed-int -co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-unified-memory --enable-mixedint --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-ij-mixed -ams -struct -sstruct-mixed -rt -mpibind -save ${save} -rtol ${rtol} -atol ${atol}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-cuda-um-mixedint # CUDA with UM with shared library -co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-unified-memory --with-openmp --enable-hopscotch --enable-shared --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-gpumemcheck -rt -mpibind -cudamemcheck -save ${save}" 
./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-cuda-um-shared # CUDA with UM and single precision -co="--with-cuda --enable-unified-memory --enable-single --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-unified-memory --enable-single --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-single -rt -mpibind -save ${save}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: ${ro} ./renametest.sh basic $output_dir/basic-cuda-um-single # CUDA with UM without MPI [no run] -#co="--with-cuda --enable-unified-memory --without-MPI --with-gpu-arch=70 --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +#co="--with-cuda --enable-unified-memory --without-MPI --with-gpu-arch=70 --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" #./test.sh basic.sh $src_dir -co: $co -mo: $mo #./renametest.sh basic $output_dir/basic-cuda-um-without-MPI # CUDA without UM with device memory pool [struct] -co="--with-cuda --enable-device-memory-pool --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-cuda --enable-device-memory-pool --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-struct -rt -mpibind -save ${save}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-cuda-nonum # CUDA without UM with umpire [benchmark] -UMPIRE_DIR=/usr/workspace/hypre/ext-libs/Umpire/2022.03.1-nvcc10.1.243-sm_70-xl2021.09.22 -co="--with-cuda --with-gpu-arch=70 --with-umpire --with-umpire-include=${UMPIRE_DIR}/include --with-umpire-lib-dirs=${UMPIRE_DIR}/lib --with-umpire-libs=umpire --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +UMPIRE_DIR=/usr/workspace/hypre/ext-libs/Umpire/2022.03.1-nvcc11.2-sm_70-xl2023.06.28-cuda-11.2.0-gcc-8.3.1 +co="--with-cuda --with-gpu-arch=70 --with-umpire --with-umpire-include=${UMPIRE_DIR}/include --with-umpire-lib-dirs=${UMPIRE_DIR}/lib --with-umpire-libs=umpire --with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-bench -rt -mpibind -save ${save}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-cuda-bench @@ -113,12 +113,12 @@ ro="-ij-noilu -ams -struct -sstruct -rt -mpibind -save lassen_cpu" ############ # OMP 4.5 with UM with shared library [no run] -#co="--with-device-openmp --enable-unified-memory --enable-shared --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029:1500-030:1501-308\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029:1500-030:1501-308\\'" +#co="--with-device-openmp --enable-unified-memory --enable-shared --with-extra-CFLAGS=\\'-qsuppress=1500-029:1500-030:1501-308\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029:1500-030:1501-308\\'" #./test.sh basic.sh $src_dir -co: $co -mo: $mo #./renametest.sh basic $output_dir/basic-deviceomp-um-shared # OMP 4.5 without UM in debug mode [struct] -co="--with-device-openmp --enable-debug --with-gpu-arch=70 --with-extra-CFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qmaxmem=-1 -qsuppress=1500-029\\'" +co="--with-device-openmp --enable-debug --with-gpu-arch=70 
--with-extra-CFLAGS=\\'-qsuppress=1500-029\\' --with-extra-CXXFLAGS=\\'-qsuppress=1500-029\\'" ro="-struct -rt -mpibind -save ${save}" ./test.sh basic.sh $src_dir -co: $co -mo: $mo -ro: $ro ./renametest.sh basic $output_dir/basic-deviceomp-nonum-debug-struct @@ -126,21 +126,23 @@ ro="-struct -rt -mpibind -save ${save}" ##################################### ## CUDA + CMake build (only) tests ## ##################################### +module -q load cmake/3.16.8 +module list cmake/3.16.8 |& grep "None found" mo="-j" # CUDA with UM + CMake -co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_ENABLE_PERSISTENT_COMM=ON -DHYPRE_ENABLE_DEVICE_POOL=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70" +co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_ENABLE_PERSISTENT_COMM=ON -DHYPRE_ENABLE_DEVICE_POOL=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70" ./test.sh cmake.sh $src_dir -co: $co -mo: $mo ./renametest.sh cmake $output_dir/cmake-cuda-um-ij # CUDA with UM [shared library] + CMake -co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_OPENMP=ON -DHYPRE_ENABLE_HOPSCOTCH=ON -DHYPRE_ENABLE_SHARED=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029 "\'" -DHYPRE_CUDA_SM=70" +co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DHYPRE_ENABLE_UNIFIED_MEMORY=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_OPENMP=ON -DHYPRE_ENABLE_HOPSCOTCH=ON -DHYPRE_ENABLE_SHARED=ON -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029 "\'" -DHYPRE_CUDA_SM=70" ./test.sh cmake.sh $src_dir -co: $co -mo: $mo ./renametest.sh cmake $output_dir/cmake-cuda-um-shared # CUDA w.o UM + CMake -co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qmaxmem=-1 -qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70" +co="-DCMAKE_C_COMPILER=$(which xlc) -DCMAKE_CXX_COMPILER=$(which xlc++) -DCMAKE_CUDA_COMPILER=$(which nvcc) -DMPI_C_COMPILER=$(which mpicc) -DMPI_CXX_COMPILER=$(which mpicxx) -DHYPRE_WITH_CUDA=ON -DCMAKE_BUILD_TYPE=Debug -DHYPRE_WITH_EXTRA_CFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_WITH_EXTRA_CXXFLAGS="\'"-qsuppress=1500-029"\'" -DHYPRE_CUDA_SM=70" ./test.sh cmake.sh $src_dir -co: $co -mo: $mo ./renametest.sh cmake $output_dir/cmake-cuda-nonum-struct diff --git 
a/3rd_party/hypre/AUTOTEST/machine-rztopaz.sh b/3rd_party/hypre/AUTOTEST/machine-rzhound.sh similarity index 94% rename from 3rd_party/hypre/AUTOTEST/machine-rztopaz.sh rename to 3rd_party/hypre/AUTOTEST/machine-rzhound.sh index 9de900f22..98dd97980 100755 --- a/3rd_party/hypre/AUTOTEST/machine-rztopaz.sh +++ b/3rd_party/hypre/AUTOTEST/machine-rzhound.sh @@ -11,14 +11,14 @@ case $1 in -h|-help) cat < test.yaml -grep ' hash:' test.yaml | sed -e 's/^.*: /\//' | xargs spack mark -e -spack gc -y -grep ' hash:' test.yaml | sed -e 's/^.*: /\//' | xargs spack mark -i -rm -f test.yaml -spack clean --all +# RDF: Commenting out PR #481 (for now) to test the '--fresh' option above (faster) +# spack spec --yaml $spackspec > test.yaml +# grep ' hash:' test.yaml | sed -e 's/^.*: /\//' | xargs spack mark -e +# spack gc -y +# grep ' hash:' test.yaml | sed -e 's/^.*: /\//' | xargs spack mark -i +# rm -f test.yaml +# spack clean --all spack uninstall -yR $superludistspec # Echo to stderr all nonempty error files in $output_dir diff --git a/3rd_party/hypre/CHANGELOG b/3rd_party/hypre/CHANGELOG index 0de07d1af..9b71cbd33 100644 --- a/3rd_party/hypre/CHANGELOG +++ b/3rd_party/hypre/CHANGELOG @@ -7,6 +7,16 @@ # This file chronicles user-level changes, beginning with the most recent. # ============================================================================= +Version 2.32.0 released 2024/10/08 + +- New MGR features and updates +- New filtering option to AMG for block-diagonal preconditioning +- New memory usage monitoring functions +- Added non-UVM support for ILUT and HMIS coarsening +- Various bug fixes + +#==================================== + Version 2.31.0 released 2024/02/14 - Added iterative ILU0 option diff --git a/3rd_party/hypre/src/CMakeLists.txt b/3rd_party/hypre/src/CMakeLists.txt index b8da5f4be..c65faf06c 100644 --- a/3rd_party/hypre/src/CMakeLists.txt +++ b/3rd_party/hypre/src/CMakeLists.txt @@ -12,9 +12,9 @@ else () endif () # The version number. 
-set(HYPRE_VERSION 2.31.0) -set(HYPRE_NUMBER 23100) -set(HYPRE_DATE 2024/02/14) +set(HYPRE_VERSION 2.32.0) +set(HYPRE_NUMBER 23200) +set(HYPRE_DATE 2024/10/08) set(HYPRE_TIME 00:00:00) set(HYPRE_BUGS https://github.com/hypre-space/hypre/issues) set(HYPRE_SRCDIR "${PROJECT_SOURCE_DIR}") @@ -632,6 +632,30 @@ if (HYPRE_USING_UMPIRE) target_include_directories(${PROJECT_NAME} PUBLIC ${TPL_UMPIRE_INCLUDE_DIRS}) endif () +# Set MPI compile flags +if (NOT HYPRE_SEQUENTIAL) + find_program(MPIEXEC_EXECUTABLE NAMES mpiexec mpirun) + find_package(MPI REQUIRED) + target_link_libraries(${PROJECT_NAME} PUBLIC MPI::MPI_C) + set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} MPI::MPI_C) + + # Check if MPI supports the MPI_Comm_f2c function + include(CheckCSourceCompiles) + check_c_source_compiles(" + #include + int main() { + MPI_Comm c = MPI_Comm_f2c(0); + return 0; + } + " HYPRE_HAVE_MPI_COMM_F2C) +endif (NOT HYPRE_SEQUENTIAL) + +# Set OpenMP compile flags +if (HYPRE_USING_OPENMP) + find_package(OpenMP REQUIRED) + target_link_libraries(${PROJECT_NAME} PUBLIC OpenMP::OpenMP_C) +endif (HYPRE_USING_OPENMP) + # Configure a header file to pass CMake settings to the source code configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/config/HYPRE_config.h.cmake.in" @@ -663,19 +687,6 @@ if (HYPRE_USING_SYCL) set_source_files_properties(${HYPRE_GPU_SOURCES} PROPERTIES LANGUAGE CXX) endif () -# Set MPI compile flags -if (NOT HYPRE_SEQUENTIAL) - find_program(MPIEXEC_EXECUTABLE NAMES mpiexec mpirun) - find_package(MPI REQUIRED) - target_link_libraries(${PROJECT_NAME} PUBLIC MPI::MPI_C) -endif (NOT HYPRE_SEQUENTIAL) - -# Set OpenMP compile flags -if (HYPRE_USING_OPENMP) - find_package(OpenMP REQUIRED) - target_link_libraries(${PROJECT_NAME} PUBLIC OpenMP::OpenMP_C) -endif (HYPRE_USING_OPENMP) - if (MSVC) target_compile_definitions(${PROJECT_NAME} PRIVATE _CRT_SECURE_NO_WARNINGS) if (MSVC_VERSION LESS 1928) # Visual Studio 2019 version 16.8 claims full C11 support diff --git a/3rd_party/hypre/src/FEI_mv/fei-hypre/Makefile b/3rd_party/hypre/src/FEI_mv/fei-hypre/Makefile index e533d4865..ee6c1f347 100644 --- a/3rd_party/hypre/src/FEI_mv/fei-hypre/Makefile +++ b/3rd_party/hypre/src/FEI_mv/fei-hypre/Makefile @@ -30,6 +30,7 @@ C_COMPILE_FLAGS = \ -I$(srcdir)/../../parcsr_mv\ -I$(srcdir)/../../parcsr_ls\ -I$(srcdir)/../../seq_mv\ + -I$(srcdir)/../../seq_block_mv\ -I$(srcdir)/../../distributed_matrix\ -I$(srcdir)/../../distributed_ls\ -I$(srcdir)/../fei-base\ @@ -49,6 +50,7 @@ CXX_COMPILE_FLAGS = \ -I$(srcdir)/../../krylov\ -I$(srcdir)/../../parcsr_block_mv\ -I$(srcdir)/../../parcsr_mv\ + -I$(srcdir)/../../seq_block_mv\ -I$(srcdir)/../../parcsr_ls\ -I$(srcdir)/../../seq_mv\ -I$(srcdir)/../../distributed_matrix\ diff --git a/3rd_party/hypre/src/FEI_mv/femli/Makefile b/3rd_party/hypre/src/FEI_mv/femli/Makefile index 42f1a74b2..c931fada5 100644 --- a/3rd_party/hypre/src/FEI_mv/femli/Makefile +++ b/3rd_party/hypre/src/FEI_mv/femli/Makefile @@ -20,6 +20,7 @@ MLI_INCLUDES = \ -I$(srcdir)/../../parcsr_mv\ -I$(srcdir)/../../parcsr_ls\ -I$(srcdir)/../../seq_mv\ + -I$(srcdir)/../../seq_block_mv\ -I$(srcdir)/../../distributed_matrix\ -I$(srcdir)/../../distributed_ls\ -I$(srcdir)/../../FEI_mv/fei-hypre\ diff --git a/3rd_party/hypre/src/IJ_mv/HYPRE_IJMatrix.c b/3rd_party/hypre/src/IJ_mv/HYPRE_IJMatrix.c index 3b7cc08b9..041fc9b8b 100644 --- a/3rd_party/hypre/src/IJ_mv/HYPRE_IJMatrix.c +++ b/3rd_party/hypre/src/IJ_mv/HYPRE_IJMatrix.c @@ -117,6 +117,59 @@ HYPRE_IJMatrixCreate( MPI_Comm comm, return hypre_error_flag; } 
+/*-------------------------------------------------------------------------- + * HYPRE_IJMatrixPartialClone + * + * Creates a new IJMatrix with data copied from an existing matrix except + * for the members: + * 1) hypre_IJMatrixObject + * 2) hypre_IJMatrixTranslator + * 3) hypre_IJMatrixAssumedPart + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixPartialClone( HYPRE_IJMatrix matrix_in, + HYPRE_IJMatrix *matrix_out ) +{ + hypre_IJMatrix *ijmatrix_in = (hypre_IJMatrix *) matrix_in; + hypre_IJMatrix *ijmatrix_out; + HYPRE_BigInt ilower; + HYPRE_BigInt iupper; + HYPRE_BigInt jlower; + HYPRE_BigInt jupper; + + if (!ijmatrix_in) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + HYPRE_IJMatrixGetLocalRange(ijmatrix_in, &ilower, &iupper, &jlower, &jupper); + + ijmatrix_out = hypre_CTAlloc(hypre_IJMatrix, 1, HYPRE_MEMORY_HOST); + + hypre_IJMatrixComm(ijmatrix_out) = hypre_IJMatrixComm(ijmatrix_in); + hypre_IJMatrixObject(ijmatrix_out) = NULL; + hypre_IJMatrixTranslator(ijmatrix_out) = NULL; + hypre_IJMatrixAssumedPart(ijmatrix_out) = NULL; + hypre_IJMatrixObjectType(ijmatrix_out) = hypre_IJMatrixObjectType(ijmatrix_in); + hypre_IJMatrixAssembleFlag(ijmatrix_out) = 0; + hypre_IJMatrixPrintLevel(ijmatrix_out) = hypre_IJMatrixPrintLevel(ijmatrix_in); + hypre_IJMatrixOMPFlag(ijmatrix_out) = hypre_IJMatrixOMPFlag(ijmatrix_in); + hypre_IJMatrixGlobalFirstRow(ijmatrix_out) = hypre_IJMatrixGlobalFirstRow(ijmatrix_in); + hypre_IJMatrixGlobalFirstCol(ijmatrix_out) = hypre_IJMatrixGlobalFirstCol(ijmatrix_in); + hypre_IJMatrixGlobalNumRows(ijmatrix_out) = hypre_IJMatrixGlobalNumRows(ijmatrix_in); + hypre_IJMatrixGlobalNumCols(ijmatrix_out) = hypre_IJMatrixGlobalNumCols(ijmatrix_in); + hypre_IJMatrixRowPartitioning(ijmatrix_out)[0] = ilower; + hypre_IJMatrixRowPartitioning(ijmatrix_out)[1] = iupper + 1; + hypre_IJMatrixColPartitioning(ijmatrix_out)[0] = jlower; + hypre_IJMatrixColPartitioning(ijmatrix_out)[1] = jupper + 1; + + *matrix_out = (HYPRE_IJMatrix) ijmatrix_out; + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ @@ -177,7 +230,6 @@ HYPRE_IJMatrixInitialize( HYPRE_IJMatrix matrix ) } return hypre_error_flag; - } HYPRE_Int @@ -463,6 +515,8 @@ HYPRE_IJMatrixSetValues2( HYPRE_IJMatrix matrix, } } + HYPRE_PRINT_MEMORY_USAGE(hypre_IJMatrixComm(ijmatrix)); + return hypre_error_flag; } @@ -671,6 +725,8 @@ HYPRE_IJMatrixAddToValues2( HYPRE_IJMatrix matrix, } } + HYPRE_PRINT_MEMORY_USAGE(hypre_IJMatrixComm(ijmatrix)); + return hypre_error_flag; } @@ -688,19 +744,19 @@ HYPRE_IJMatrixAssemble( HYPRE_IJMatrix matrix ) return hypre_error_flag; } - if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + if (hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR) { #if defined(HYPRE_USING_GPU) - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_IJMatrixMemoryLocation(matrix) ); + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(hypre_IJMatrixMemoryLocation(matrix)); if (exec == HYPRE_EXEC_DEVICE) { - return ( hypre_IJMatrixAssembleParCSRDevice( ijmatrix ) ); + hypre_IJMatrixAssembleParCSRDevice(ijmatrix); } else #endif { - return ( hypre_IJMatrixAssembleParCSR( ijmatrix ) ); + hypre_IJMatrixAssembleParCSR(ijmatrix); } } else @@ -708,6 +764,8 @@ HYPRE_IJMatrixAssemble( HYPRE_IJMatrix matrix ) hypre_error_in_arg(1); } + HYPRE_PRINT_MEMORY_USAGE(hypre_IJMatrixComm(ijmatrix)); + return hypre_error_flag; } 
@@ -814,7 +872,7 @@ HYPRE_IJMatrixGetValues( HYPRE_IJMatrix matrix, if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) { hypre_IJMatrixGetValuesParCSR( ijmatrix, nrows, ncols, - rows, cols, values ); + rows, NULL, cols, values, 0 ); } else { @@ -825,6 +883,154 @@ HYPRE_IJMatrixGetValues( HYPRE_IJMatrix matrix, } +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixGetValues2( HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, + HYPRE_BigInt *cols, + HYPRE_Complex *values ) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (nrows == 0) + { + return hypre_error_flag; + } + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if (!ncols) + { + hypre_error_in_arg(3); + return hypre_error_flag; + } + + if (!rows) + { + hypre_error_in_arg(4); + return hypre_error_flag; + } + + if (!cols) + { + hypre_error_in_arg(5); + return hypre_error_flag; + } + + if (!values) + { + hypre_error_in_arg(6); + return hypre_error_flag; + } + +#if defined(HYPRE_USING_GPU) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_IJMatrixMemoryLocation(matrix) ); + + if (exec == HYPRE_EXEC_DEVICE) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "HYPRE_IJMatrixGetValues not implemented for GPUs!"); + } + else +#endif + { + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + hypre_IJMatrixGetValuesParCSR( ijmatrix, nrows, ncols, + rows, row_indexes, cols, values, 0 ); + } + else + { + hypre_error_in_arg(1); + } + } + + return hypre_error_flag; + +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixGetValuesAndZeroOut( HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, + HYPRE_BigInt *cols, + HYPRE_Complex *values ) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (nrows == 0) + { + return hypre_error_flag; + } + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if (!ncols) + { + hypre_error_in_arg(3); + return hypre_error_flag; + } + + if (!rows) + { + hypre_error_in_arg(4); + return hypre_error_flag; + } + + if (!cols) + { + hypre_error_in_arg(5); + return hypre_error_flag; + } + + if (!values) + { + hypre_error_in_arg(6); + return hypre_error_flag; + } + +#if defined(HYPRE_USING_GPU) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_IJMatrixMemoryLocation(matrix) ); + + if (exec == HYPRE_EXEC_DEVICE) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "HYPRE_IJMatrixGetValues not implemented for GPUs!"); + } + else +#endif + { + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + hypre_IJMatrixGetValuesParCSR( ijmatrix, nrows, ncols, + rows, row_indexes, cols, values, 1 ); + } + else + { + hypre_error_in_arg(1); + } + } + + return hypre_error_flag; + +} + /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ @@ -895,6 +1101,48 @@ HYPRE_IJMatrixGetLocalRange( HYPRE_IJMatrix matrix, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixGetGlobalInfo( 
HYPRE_IJMatrix matrix, + HYPRE_BigInt *global_num_rows, + HYPRE_BigInt *global_num_cols, + HYPRE_BigInt *global_num_nonzeros ) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + *global_num_rows = hypre_IJMatrixGlobalNumRows(ijmatrix); + *global_num_cols = hypre_IJMatrixGlobalNumCols(ijmatrix); + + if (hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR) + { + hypre_ParCSRMatrix *par_matrix = (hypre_ParCSRMatrix *) hypre_IJMatrixObject(ijmatrix); + + if (!par_matrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + hypre_ParCSRMatrixSetNumNonzeros(par_matrix); + *global_num_nonzeros = hypre_ParCSRMatrixNumNonzeros(par_matrix); + } + else + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ @@ -1070,6 +1318,8 @@ HYPRE_Int HYPRE_IJMatrixPrint( HYPRE_IJMatrix matrix, const char *filename ) { + void *object; + if (!matrix) { hypre_error_in_arg(1); @@ -1082,11 +1332,8 @@ HYPRE_IJMatrixPrint( HYPRE_IJMatrix matrix, return hypre_error_flag; } - void *object; HYPRE_IJMatrixGetObject(matrix, &object); - hypre_ParCSRMatrix *par_csr = (hypre_ParCSRMatrix*) object; - - hypre_ParCSRMatrixPrintIJ(par_csr, 0, 0, filename); + hypre_ParCSRMatrixPrintIJ((hypre_ParCSRMatrix*) object, 0, 0, filename); return hypre_error_flag; } diff --git a/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h b/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h index 2c9c1edf4..f6cda553f 100644 --- a/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h +++ b/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h @@ -222,6 +222,37 @@ HYPRE_Int HYPRE_IJMatrixGetValues(HYPRE_IJMatrix matrix, HYPRE_BigInt *cols, HYPRE_Complex *values); +/** + * Gets values for \e nrows rows or partial rows of the matrix. + * + * Same as IJMatrixGetValues, but with an additional \e row_indexes array + * that provides indexes into the \e cols and \e values arrays. Because + * of this, there can be gaps between the row data in these latter two arrays. + * + **/ +HYPRE_Int HYPRE_IJMatrixGetValues2(HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, + HYPRE_BigInt *cols, + HYPRE_Complex *values); + +/** + * Gets values for \e nrows rows or partial rows of the matrix + * and zeros out those entries in the matrix. + * + * Same as IJMatrixGetValues2, but zeros out the entries after getting them. + * + **/ +HYPRE_Int HYPRE_IJMatrixGetValuesAndZeroOut(HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, + HYPRE_BigInt *cols, + HYPRE_Complex *values); + /** * Set the storage type of the matrix object to be constructed. * Currently, \e type can only be \c HYPRE_PARCSR. @@ -249,6 +280,24 @@ HYPRE_Int HYPRE_IJMatrixGetLocalRange(HYPRE_IJMatrix matrix, HYPRE_BigInt *jlower, HYPRE_BigInt *jupper); +/** + * Gets global information about the matrix, including the total number of rows, + * columns, and nonzero elements across all processes. + * + * @param matrix The IJMatrix object to query. + * @param global_num_rows Pointer to store the total number of rows in the matrix. + * @param global_num_cols Pointer to store the total number of columns in the matrix. + * @param global_num_nonzeros Pointer to store the total number of nonzero elements in the matrix. + * + * @return HYPRE_Int Error code. 
+ * + * Collective (must be called by all processes). + **/ +HYPRE_Int HYPRE_IJMatrixGetGlobalInfo(HYPRE_IJMatrix matrix, + HYPRE_BigInt *global_num_rows, + HYPRE_BigInt *global_num_cols, + HYPRE_BigInt *global_num_nonzeros); + /** * Get a reference to the constructed matrix object. * @@ -343,6 +392,30 @@ HYPRE_Int HYPRE_IJMatrixReadMM(const char *filename, HYPRE_Int HYPRE_IJMatrixPrint(HYPRE_IJMatrix matrix, const char *filename); +/** + * Transpose an IJMatrix. + **/ +HYPRE_Int +HYPRE_IJMatrixTranspose( HYPRE_IJMatrix matrix_A, + HYPRE_IJMatrix *matrix_AT ); + +/** + * Computes the infinity norm of an IJMatrix + **/ +HYPRE_Int +HYPRE_IJMatrixNorm( HYPRE_IJMatrix matrix, + HYPRE_Real *norm ); + +/** + * Performs C = alpha*A + beta*B + **/ +HYPRE_Int +HYPRE_IJMatrixAdd( HYPRE_Complex alpha, + HYPRE_IJMatrix matrix_A, + HYPRE_Complex beta, + HYPRE_IJMatrix matrix_B, + HYPRE_IJMatrix *matrix_C ); + /** * Print the matrix to file in binary format. This is mainly for debugging purposes. **/ @@ -357,7 +430,6 @@ HYPRE_Int HYPRE_IJMatrixReadBinary(const char *filename, HYPRE_Int type, HYPRE_IJMatrix *matrix_ptr); - /**@}*/ /*-------------------------------------------------------------------------- diff --git a/3rd_party/hypre/src/IJ_mv/IJMatrix_parcsr.c b/3rd_party/hypre/src/IJ_mv/IJMatrix_parcsr.c index 15abab413..8ba824ba3 100644 --- a/3rd_party/hypre/src/IJ_mv/IJMatrix_parcsr.c +++ b/3rd_party/hypre/src/IJ_mv/IJMatrix_parcsr.c @@ -279,7 +279,8 @@ hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_MemoryLocation m if (!aux_matrix) { - hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, hypre_ParCSRMatrixNumCols(par_matrix), + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, + hypre_ParCSRMatrixNumCols(par_matrix), NULL); hypre_IJMatrixTranslator(matrix) = aux_matrix; } @@ -295,22 +296,22 @@ hypre_IJMatrixInitializeParCSR_v2(hypre_IJMatrix *matrix, HYPRE_MemoryLocation m { for (i = 0; i < local_num_rows; i++) { - hypre_CSRMatrixI(diag)[i + 1] = hypre_CSRMatrixI(diag)[i] + hypre_AuxParCSRMatrixDiagSizes( - aux_matrix)[i]; + hypre_CSRMatrixI(diag)[i + 1] = hypre_CSRMatrixI(diag)[i] + + hypre_AuxParCSRMatrixDiagSizes(aux_matrix)[i]; } hypre_CSRMatrixNumNonzeros(diag) = hypre_CSRMatrixI(diag)[local_num_rows]; - hypre_CSRMatrixInitialize(diag); + hypre_CSRMatrixInitialize_v2(diag, 0, memory_location); } if (hypre_AuxParCSRMatrixOffdSizes(aux_matrix)) { for (i = 0; i < local_num_rows; i++) { - hypre_CSRMatrixI(offd)[i + 1] = hypre_CSRMatrixI(offd)[i] + hypre_AuxParCSRMatrixOffdSizes( - aux_matrix)[i]; + hypre_CSRMatrixI(offd)[i + 1] = hypre_CSRMatrixI(offd)[i] + + hypre_AuxParCSRMatrixOffdSizes(aux_matrix)[i]; } hypre_CSRMatrixNumNonzeros(offd) = hypre_CSRMatrixI(offd)[local_num_rows]; - hypre_CSRMatrixInitialize(offd); + hypre_CSRMatrixInitialize_v2(offd, 0, memory_location); } } @@ -414,8 +415,10 @@ hypre_IJMatrixGetValuesParCSR( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_Int *ncols, HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, HYPRE_BigInt *cols, - HYPRE_Complex *values) + HYPRE_Complex *values, + HYPRE_Int zero_out) { MPI_Comm comm = hypre_IJMatrixComm(matrix); hypre_ParCSRMatrix *par_matrix = (hypre_ParCSRMatrix *) hypre_IJMatrixObject(matrix); @@ -509,11 +512,19 @@ hypre_IJMatrixGetValuesParCSR( hypre_IJMatrix *matrix, { cols[indx] = (HYPRE_BigInt)diag_j[j] + col_0; values[indx++] = diag_data[j]; + if (zero_out) + { + diag_data[j] = 0.0; + } } for (j = offd_i[row_local]; j < offd_i[row_local + 1]; j++) { cols[indx] = col_map_offd[offd_j[j]]; values[indx++] = 
offd_data[j]; + if (zero_out) + { + offd_data[j] = 0.0; + } } counter[i + 1] = indx; } @@ -549,6 +560,7 @@ hypre_IJMatrixGetValuesParCSR( hypre_IJMatrix *matrix, { continue; } + indx = (row_indexes) ? row_indexes[ii] : indx; if (row >= row_partitioning[0] && row < row_partitioning[1]) { row_local = (HYPRE_Int)(row - row_partitioning[0]); @@ -565,6 +577,10 @@ hypre_IJMatrixGetValuesParCSR( hypre_IJMatrix *matrix, if (col_map_offd[offd_j[j]] == col_indx) { values[indx] = offd_data[j]; + if (zero_out) + { + offd_data[j] = 0.0; + } break; } } @@ -577,6 +593,10 @@ hypre_IJMatrixGetValuesParCSR( hypre_IJMatrix *matrix, if (diag_j[j] == (HYPRE_Int)col_indx) { values[indx] = diag_data[j]; + if (zero_out) + { + diag_data[j] = 0.0; + } break; } } @@ -829,6 +849,7 @@ hypre_IJMatrixSetValuesParCSR( hypre_IJMatrix *matrix, if (row >= row_partitioning[0] && row < row_partitioning[1]) { row_local = (HYPRE_Int)(row - row_partitioning[0]); + /* compute local row number */ if (need_aux) { @@ -1016,6 +1037,8 @@ hypre_IJMatrixSetValuesParCSR( hypre_IJMatrix *matrix, } } + HYPRE_PRINT_MEMORY_USAGE(comm); + return hypre_error_flag; } @@ -1605,6 +1628,8 @@ hypre_IJMatrixAddToValuesParCSR( hypre_IJMatrix *matrix, } } + HYPRE_PRINT_MEMORY_USAGE(comm); + return hypre_error_flag; } @@ -2933,6 +2958,7 @@ hypre_IJMatrixAssembleParCSR(hypre_IJMatrix *matrix) hypre_AuxParCSRMatrixDestroy(aux_matrix); hypre_IJMatrixTranslator(matrix) = NULL; + HYPRE_PRINT_MEMORY_USAGE(comm); HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; @@ -2978,8 +3004,8 @@ hypre_IJMatrixSetValuesOMPParCSR( hypre_IJMatrix *matrix, HYPRE_Int num_procs, my_id; HYPRE_BigInt col_0, col_n, first; //HYPRE_Int cancel_indx; - HYPRE_BigInt **aux_j; - HYPRE_Complex **aux_data; + HYPRE_BigInt **aux_j = NULL; + HYPRE_Complex **aux_data = NULL; HYPRE_Int *row_length, *row_space; HYPRE_Int need_aux; HYPRE_Int *diag_i = NULL; @@ -3625,8 +3651,8 @@ hypre_IJMatrixAddToValuesOMPParCSR( hypre_IJMatrix *matrix, MPI_Comm comm = hypre_IJMatrixComm(matrix); HYPRE_Int num_procs, my_id; HYPRE_BigInt col_0, col_n, first; - HYPRE_BigInt **aux_j; - HYPRE_Complex **aux_data; + HYPRE_BigInt **aux_j = NULL; + HYPRE_Complex **aux_data = NULL; HYPRE_Int *row_length, *row_space; HYPRE_Int need_aux; HYPRE_Int *diag_i = NULL; diff --git a/3rd_party/hypre/src/IJ_mv/_hypre_IJ_mv.h b/3rd_party/hypre/src/IJ_mv/_hypre_IJ_mv.h index f69c5b6ba..a90dd340f 100644 --- a/3rd_party/hypre/src/IJ_mv/_hypre_IJ_mv.h +++ b/3rd_party/hypre/src/IJ_mv/_hypre_IJ_mv.h @@ -441,7 +441,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); HYPRE_Int hypre_IJMatrixGetValuesParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_Int *ncols, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Complex *values ); + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, HYPRE_BigInt *cols, HYPRE_Complex *values, HYPRE_Int zero_out ); HYPRE_Int hypre_IJMatrixSetValuesParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_Int *ncols, const HYPRE_BigInt *rows, const HYPRE_Int *row_indexes, const HYPRE_BigInt *cols, const HYPRE_Complex *values ); @@ -539,6 +540,7 @@ HYPRE_Int hypre_IJVectorUpdateValuesDevice( hypre_IJVector *vector, HYPRE_Int nu /* HYPRE_IJMatrix.c */ HYPRE_Int HYPRE_IJMatrixCreate ( MPI_Comm comm, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_IJMatrix *matrix ); +HYPRE_Int HYPRE_IJMatrixPartialClone ( HYPRE_IJMatrix matrix_in, 
HYPRE_IJMatrix *matrix_out ); HYPRE_Int HYPRE_IJMatrixDestroy ( HYPRE_IJMatrix matrix ); HYPRE_Int HYPRE_IJMatrixInitialize ( HYPRE_IJMatrix matrix ); HYPRE_Int HYPRE_IJMatrixSetPrintLevel ( HYPRE_IJMatrix matrix, HYPRE_Int print_level ); diff --git a/3rd_party/hypre/src/IJ_mv/aux_parcsr_matrix.c b/3rd_party/hypre/src/IJ_mv/aux_parcsr_matrix.c index 472108327..07efeb864 100644 --- a/3rd_party/hypre/src/IJ_mv/aux_parcsr_matrix.c +++ b/3rd_party/hypre/src/IJ_mv/aux_parcsr_matrix.c @@ -358,8 +358,8 @@ hypre_AuxParCSRMatrixInitialize_v2( hypre_AuxParCSRMatrix *matrix, for (i = 0; i < local_num_rows; i++) { row_space[i] = 30; - aux_j[i] = hypre_CTAlloc(HYPRE_BigInt, 30, HYPRE_MEMORY_HOST); - aux_data[i] = hypre_CTAlloc(HYPRE_Complex, 30, HYPRE_MEMORY_HOST); + aux_j[i] = hypre_CTAlloc(HYPRE_BigInt, row_space[i], HYPRE_MEMORY_HOST); + aux_data[i] = hypre_CTAlloc(HYPRE_Complex, row_space[i], HYPRE_MEMORY_HOST); } hypre_AuxParCSRMatrixRowSpace(matrix) = row_space; } diff --git a/3rd_party/hypre/src/IJ_mv/protos.h b/3rd_party/hypre/src/IJ_mv/protos.h index 524277c12..d1baa59d8 100644 --- a/3rd_party/hypre/src/IJ_mv/protos.h +++ b/3rd_party/hypre/src/IJ_mv/protos.h @@ -71,7 +71,8 @@ HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_BigInt *rows, HYPRE_Int *ncols ); HYPRE_Int hypre_IJMatrixGetValuesParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_Int *ncols, - HYPRE_BigInt *rows, HYPRE_BigInt *cols, HYPRE_Complex *values ); + HYPRE_BigInt *rows, + HYPRE_Int *row_indexes, HYPRE_BigInt *cols, HYPRE_Complex *values, HYPRE_Int zero_out ); HYPRE_Int hypre_IJMatrixSetValuesParCSR ( hypre_IJMatrix *matrix, HYPRE_Int nrows, HYPRE_Int *ncols, const HYPRE_BigInt *rows, const HYPRE_Int *row_indexes, const HYPRE_BigInt *cols, const HYPRE_Complex *values ); @@ -169,6 +170,7 @@ HYPRE_Int hypre_IJVectorUpdateValuesDevice( hypre_IJVector *vector, HYPRE_Int nu /* HYPRE_IJMatrix.c */ HYPRE_Int HYPRE_IJMatrixCreate ( MPI_Comm comm, HYPRE_BigInt ilower, HYPRE_BigInt iupper, HYPRE_BigInt jlower, HYPRE_BigInt jupper, HYPRE_IJMatrix *matrix ); +HYPRE_Int HYPRE_IJMatrixPartialClone ( HYPRE_IJMatrix matrix_in, HYPRE_IJMatrix *matrix_out ); HYPRE_Int HYPRE_IJMatrixDestroy ( HYPRE_IJMatrix matrix ); HYPRE_Int HYPRE_IJMatrixInitialize ( HYPRE_IJMatrix matrix ); HYPRE_Int HYPRE_IJMatrixSetPrintLevel ( HYPRE_IJMatrix matrix, HYPRE_Int print_level ); diff --git a/3rd_party/hypre/src/config/HYPREConfig.cmake.in b/3rd_party/hypre/src/config/HYPREConfig.cmake.in index abf6a99bf..717ba85c1 100644 --- a/3rd_party/hypre/src/config/HYPREConfig.cmake.in +++ b/3rd_party/hypre/src/config/HYPREConfig.cmake.in @@ -45,6 +45,7 @@ set(HYPRE_WITH_UMPIRE_HOST @HYPRE_WITH_UMPIRE_HOST@) set(HYPRE_WITH_UMPIRE_DEVICE @HYPRE_WITH_UMPIRE_DEVICE@) set(HYPRE_WITH_UMPIRE_UM @HYPRE_WITH_UMPIRE_UM@) set(HYPRE_WITH_UMPIRE_PINNED @HYPRE_WITH_UMPIRE_PINNED@) +set(HYPRE_HAVE_MPI_COMM_F2C @HYPRE_HAVE_MPI_COMM_F2C@) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") diff --git a/3rd_party/hypre/src/config/HYPRE_config.h.cmake.in b/3rd_party/hypre/src/config/HYPRE_config.h.cmake.in index 21fcc5bf6..9f4278dac 100644 --- a/3rd_party/hypre/src/config/HYPRE_config.h.cmake.in +++ b/3rd_party/hypre/src/config/HYPRE_config.h.cmake.in @@ -155,7 +155,8 @@ /* Use MPI */ #cmakedefine HYPRE_HAVE_MPI 1 -/* #undef HYPRE_HAVE_MPI_COMM_F2C */ +/* Use MPI_Comm_f2c */ +#cmakedefine HYPRE_HAVE_MPI_COMM_F2C 1 /* Define as follows to set the Fortran name mangling 
scheme: * 0 = unspecified diff --git a/3rd_party/hypre/src/config/configure.in b/3rd_party/hypre/src/config/configure.in index a58fcc2f6..9591698f8 100644 --- a/3rd_party/hypre/src/config/configure.in +++ b/3rd_party/hypre/src/config/configure.in @@ -48,9 +48,9 @@ dnl * Set package information so it only has to be modified in one place dnl ********************************************************************* m4_define([M4_HYPRE_NAME], [hypre]) -m4_define([M4_HYPRE_VERSION], [2.31.0]) -m4_define([M4_HYPRE_NUMBER], [23100]) -m4_define([M4_HYPRE_DATE], [2024/02/14]) +m4_define([M4_HYPRE_VERSION], [2.32.0]) +m4_define([M4_HYPRE_NUMBER], [23200]) +m4_define([M4_HYPRE_DATE], [2024/10/08]) m4_define([M4_HYPRE_TIME], [00:00:00]) m4_define([M4_HYPRE_BUGS], [https://github.com/hypre-space/hypre/issues]) m4_define([M4_HYPRE_SRCDIR], [`pwd`]) diff --git a/3rd_party/hypre/src/config/update-cmake.py b/3rd_party/hypre/src/config/update-cmake.py new file mode 100755 index 000000000..11b6fd386 --- /dev/null +++ b/3rd_party/hypre/src/config/update-cmake.py @@ -0,0 +1,115 @@ +# Copyright (c) 1998 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +import argparse +import os + +def extract_files_from_section(input_text, section_start): + files = [] + collect_files = False + for line in input_text.splitlines(): + if line.strip().startswith(section_start): + collect_files = True + continue + + if collect_files: + if line.strip() == "": + break # Assuming an empty line marks the end of the section + + # Extract the file name, assuming no spaces in file names + file_name = line.strip().split("\\")[0] + + # Filter only source files + if file_name.endswith('.c') or file_name.endswith('.cpp'): + files.append(file_name) + + return files + +def are_files_in_list(file_list_a, file_list_b): + return [fn for fn in file_list_a if fn not in file_list_b] + +def add_missing_files(file_content, section_start, missing_files): + lines = file_content.split('\n') + updated_lines = [] + in_section_start = -1 + in_section_end = -1 + + # Identify the start and end of the SRCS block + for i, line in enumerate(lines): + trimmed_line = line.strip() + if trimmed_line.startswith(section_start): + in_section_start = i + + if in_section_start != -1 and trimmed_line.endswith(")") and in_section_end == -1: + in_section_end = i + break + + if in_section_start == -1 or in_section_end == -1: + print("Error: Could not find a complete SRCS block in file B.") + return file_content + + # Add all lines up to the end of the section, excluding the closing parenthesis + updated_lines.extend(lines[:in_section_end]) + + # Add missing files + for missing_file in missing_files: + updated_lines.append(f" {missing_file}") + + # Add closing parenthesis + updated_lines.append(lines[in_section_end]) + + # Add remaining lines + updated_lines.extend(lines[in_section_end + 1:]) + + updated_content = '\n'.join(updated_lines) + return updated_content + +def process(args, section_A, section_B): + # Read file contents + file_A_path = os.path.join(args.folder, "Makefile") + file_B_path = os.path.join(args.folder, "CMakeLists.txt") + with open(file_A_path, 'r') as file_A, open(file_B_path, 'r') as file_B: + file_A_content = file_A.read() + file_B_content = file_B.read() + + # Extract files + files_A = extract_files_from_section(file_A_content, section_A) + files_B = extract_files_from_section(file_B_content, section_B) + if args.verbose: + 
print(f"{files_A = }\n") + print(f"{files_B = }") + + # Check if all files in A are in B + missing = are_files_in_list(files_A, files_B) + + if missing: + print("\nAdded to FILE B:", missing) + new_file_B_content = add_missing_files(file_B_content, section_B, missing) + with open(file_B_path, 'w') as file_B: + file_B.write(new_file_B_content) + +def main(): + parser = argparse.ArgumentParser(description="Check and update CMakeLists based on the contents of Makefile") + parser.add_argument("-f", "--folder", required=True, help="Folder path") + parser.add_argument("-v", "--verbose", action="store_true", help="Turn on verbose mode") + args = parser.parse_args() + + # Validate folder path + args.folder = os.path.normpath(args.folder) + if not os.path.isdir(args.folder): + print("The specified folder does not exist or is not a directory.") + return + + # Process source files + process(args, "FILES =", "set(SRCS") + + # Process GPU source files + process(args, "CUFILES =", "set(GPU_SRCS") + + # Done! + print(f"Done with {args.folder = }...") + +if __name__ == "__main__": + main() diff --git a/3rd_party/hypre/src/config/update-cmake.sh b/3rd_party/hypre/src/config/update-cmake.sh new file mode 100755 index 000000000..18b344847 --- /dev/null +++ b/3rd_party/hypre/src/config/update-cmake.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 1998 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +# Usage: +# ./update-cmake.sh +# +# The script checks for missing source files listed in various CMakeLists.txt +# by looking at the respective Makefile. If any files are missing, they are added +# to CMakeLists.txt + +DIRNAME=$(dirname $0) +FOLDERS=(blas examples IJ_mv krylov lapack parcsr_block_mv parcsr_ls parcsr_mv seq_block_mv seq_mv sstruct_ls sstruct_mv test utilities) + +for FOLDER in ${FOLDERS[@]}; do + python3 ${DIRNAME}/update-cmake.py -f ${DIRNAME}/../${FOLDER} +done diff --git a/3rd_party/hypre/src/config/version.sh b/3rd_party/hypre/src/config/version.sh index 9ba58eb56..418815345 100755 --- a/3rd_party/hypre/src/config/version.sh +++ b/3rd_party/hypre/src/config/version.sh @@ -4,8 +4,8 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) -hypre_version="2.31.0" -hypre_reldate="2024/02/14" +hypre_version="2.32.0" +hypre_reldate="2024/10/08" hypre_major=`echo $hypre_version | cut -d. -f 1` hypre_minor=`echo $hypre_version | cut -d. -f 2` diff --git a/3rd_party/hypre/src/configure b/3rd_party/hypre/src/configure index b3e8d5444..b9bcf5fbe 100755 --- a/3rd_party/hypre/src/configure +++ b/3rd_party/hypre/src/configure @@ -1,7 +1,7 @@ #! /bin/sh # From configure.in Id. # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for hypre 2.31.0. +# Generated by GNU Autoconf 2.71 for hypre 2.32.0. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -612,8 +612,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='hypre' PACKAGE_TARNAME='hypre' -PACKAGE_VERSION='2.31.0' -PACKAGE_STRING='hypre 2.31.0' +PACKAGE_VERSION='2.32.0' +PACKAGE_STRING='hypre 2.32.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1473,7 +1473,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures hypre 2.31.0 to adapt to many kinds of systems. 
+\`configure' configures hypre 2.32.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1539,7 +1539,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of hypre 2.31.0:";; + short | recursive ) echo "Configuration of hypre 2.32.0:";; esac cat <<\_ACEOF @@ -1904,7 +1904,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -hypre configure 2.31.0 +hypre configure 2.32.0 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -2249,7 +2249,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by hypre $as_me 2.31.0, which was +It was created by hypre $as_me 2.32.0, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3230,9 +3230,9 @@ ac_config_headers="$ac_config_headers HYPRE_config.h:config/HYPRE_config.h.in" HYPRE_NAME="hypre" -HYPRE_VERSION="2.31.0" -HYPRE_NUMBER=23100 -HYPRE_DATE="2024/02/14" +HYPRE_VERSION="2.32.0" +HYPRE_NUMBER=23200 +HYPRE_DATE="2024/10/08" HYPRE_TIME="00:00:00" HYPRE_BUGS="https://github.com/hypre-space/hypre/issues" HYPRE_SRCDIR="`pwd`" @@ -11852,7 +11852,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by hypre $as_me 2.31.0, which was +This file was extended by hypre $as_me 2.32.0, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -11916,7 +11916,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -hypre config.status 2.31.0 +hypre config.status 2.32.0 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/3rd_party/hypre/src/distributed_ls/Euclid/mat_dh_private.c b/3rd_party/hypre/src/distributed_ls/Euclid/mat_dh_private.c index 6e53cc45d..853cb3e7f 100644 --- a/3rd_party/hypre/src/distributed_ls/Euclid/mat_dh_private.c +++ b/3rd_party/hypre/src/distributed_ls/Euclid/mat_dh_private.c @@ -884,7 +884,7 @@ void mat_dh_transpose_reuse_private_private(bool allocateMem, HYPRE_Int m, START_FUNC_DH HYPRE_Int *rp, *cval, *tmp; HYPRE_Int i, j, nz = RP[m]; - HYPRE_Real *aval; + HYPRE_Real *aval = NULL; if (allocateMem) { rp = *rpOUT = (HYPRE_Int *)MALLOC_DH((1+m)*sizeof(HYPRE_Int)); CHECK_V_ERROR; diff --git a/3rd_party/hypre/src/lib/Makefile b/3rd_party/hypre/src/lib/Makefile index 5c56107e1..b77c431b0 100644 --- a/3rd_party/hypre/src/lib/Makefile +++ b/3rd_party/hypre/src/lib/Makefile @@ -50,8 +50,7 @@ FILES_HYPRE = \ $(STRUCTMVFILES)\ $(UTILITIESFILES)\ $(BLASFILES)\ - $(LAPACKFILES)\ - $(IJMVOBJFILES) + $(LAPACKFILES) SONAME = libHYPRE-${HYPRE_RELEASE_VERSION}${HYPRE_LIB_SUFFIX} SOLIBS = ${DSUPERLU_LIBS} ${HYPRE_MAGMA_LIB_DIR} ${HYPRE_MAGMA_LIB} ${MPILIBDIRS} ${MPILIBS} ${LAPACKLIBDIRS} ${LAPACKLIBS}\ @@ -83,8 +82,8 @@ distclean: clean libHYPRE.a: ${FILES_HYPRE} @echo "Building libHYPRE ... 
" rm -f $@ + ${AR} $@ $(FEIHYPREFILES) $(FEMLIFILES) $(IJMVFILES) ${AR} $@ $(EUCLIDFILES) $(PARASAILSFILES) $(PILUTFILES) $(DISTMATRIXFILES) - ${AR} $@ $(IJMVFILES) ${AR} $@ $(KRYLOVFILES) ${AR} $@ $(MATMATFILES) ${AR} $@ $(MULTIVECFILES) diff --git a/3rd_party/hypre/src/multivector/interpreter.h b/3rd_party/hypre/src/multivector/interpreter.h index ec4b3cb4b..7ea698ee9 100644 --- a/3rd_party/hypre/src/multivector/interpreter.h +++ b/3rd_party/hypre/src/multivector/interpreter.h @@ -38,8 +38,8 @@ typedef struct void (*CopyMultiVector) ( void *x, void *y ); void (*ClearMultiVector) ( void *x ); void (*SetRandomVectors) ( void *x, HYPRE_Int seed ); - void (*MultiInnerProd) ( void *x, void *y, HYPRE_BigInt, HYPRE_Int, HYPRE_Int, HYPRE_Real* ); - void (*MultiInnerProdDiag) ( void *x, void *y, HYPRE_Int*, HYPRE_Int, HYPRE_Real* ); + void (*MultiInnerProd) ( void *x, void *y, HYPRE_BigInt, HYPRE_Int, HYPRE_Int, HYPRE_Complex* ); + void (*MultiInnerProdDiag) ( void *x, void *y, HYPRE_Int*, HYPRE_Int, HYPRE_Complex* ); void (*MultiVecMat) ( void *x, HYPRE_BigInt, HYPRE_Int, HYPRE_Int, HYPRE_Complex*, void *y ); void (*MultiVecMatDiag) ( void *x, HYPRE_Int*, HYPRE_Int, HYPRE_Complex*, void *y ); diff --git a/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_interp.c b/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_interp.c index 40e90432f..b2d112013 100644 --- a/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_interp.c +++ b/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_interp.c @@ -131,7 +131,7 @@ hypre_BoomerAMGBuildBlockInterp( hypre_ParCSRBlockMatrix *A, HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ HYPRE_Real *identity_block; HYPRE_Real *zero_block; @@ -1700,7 +1700,7 @@ hypre_BoomerAMGBuildBlockInterpDiag( hypre_ParCSRBlockMatrix *A, HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ HYPRE_Real *identity_block; @@ -2826,7 +2826,7 @@ hypre_BoomerAMGBuildBlockInterpRV( hypre_ParCSRBlockMatrix *A, HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ HYPRE_Real *identity_block; @@ -3884,7 +3884,7 @@ hypre_BoomerAMGBuildBlockInterpRV2( hypre_ParCSRBlockMatrix *A, HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ HYPRE_Real *identity_block; @@ -4899,7 +4899,7 @@ hypre_BoomerAMGBuildBlockDirInterp( hypre_ParCSRBlockMatrix *A, HYPRE_Int *int_buf_data = NULL; HYPRE_BigInt *big_buf_data = NULL; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ HYPRE_Real *identity_block; HYPRE_Real *zero_block; diff --git a/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_rap.c b/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_rap.c index 5ecde47b1..40974ade2 100644 --- a/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_rap.c +++ 
b/3rd_party/hypre/src/parcsr_block_mv/par_csr_block_rap.c @@ -234,7 +234,7 @@ hypre_ParCSRBlockMatrixRAP(hypre_ParCSRBlockMatrix *RT, HYPRE_Int *RAP_int_i; HYPRE_BigInt *RAP_int_j; - hypre_CSRBlockMatrix *RAP_ext; + hypre_CSRBlockMatrix *RAP_ext = NULL; HYPRE_Complex *RAP_ext_data = NULL; HYPRE_Int *RAP_ext_i = NULL; diff --git a/3rd_party/hypre/src/parcsr_ls/CMakeLists.txt b/3rd_party/hypre/src/parcsr_ls/CMakeLists.txt index fde1573f0..d46cfc1f7 100644 --- a/3rd_party/hypre/src/parcsr_ls/CMakeLists.txt +++ b/3rd_party/hypre/src/parcsr_ls/CMakeLists.txt @@ -27,6 +27,8 @@ set(SRCS F90_HYPRE_parcsr_pcg.c F90_HYPRE_parcsr_pilut.c F90_HYPRE_parcsr_schwarz.c + F90_HYPRE_parcsr_mgr.c + F90_HYPRE_parcsr_ilu.c F90_HYPRE_ams.c gen_redcs_mat.c HYPRE_parcsr_amg.c @@ -96,6 +98,7 @@ set(SRCS par_mgr.c par_mgr_coarsen.c par_mgr_interp.c + par_mgr_rap.c par_mgr_setup.c par_mgr_solve.c par_mgr_stats.c @@ -147,10 +150,6 @@ set(SRCS par_mgr_device.c ) -if (HYPRE_USING_DSUPERLU) - list(APPEND SRCS dsuperlu.c) -endif() - target_sources(${PROJECT_NAME} PRIVATE ${SRCS} ${HDRS} @@ -185,6 +184,7 @@ if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) par_relax_device.c par_mgr_device.c par_fsai_device.c + dsuperlu.c ) convert_filenames_to_full_paths(GPU_SRCS) set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) diff --git a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_Euclid.c b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_Euclid.c index c7b17ed36..38bff44f2 100644 --- a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_Euclid.c +++ b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_Euclid.c @@ -90,6 +90,8 @@ HYPRE_EuclidCreate( MPI_Comm comm, HYPRE_Solver *solver ) { #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(comm); + HYPRE_UNUSED_VAR(solver); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -149,6 +151,7 @@ HYPRE_Int HYPRE_EuclidDestroy( HYPRE_Solver solver ) { #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(solver); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -271,6 +274,8 @@ HYPRE_EuclidSetup( HYPRE_Solver solver, HYPRE_UNUSED_VAR(x); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(solver); + HYPRE_UNUSED_VAR(A); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -325,6 +330,9 @@ HYPRE_EuclidSolve( HYPRE_Solver solver, HYPRE_UNUSED_VAR(A); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(solver); + HYPRE_UNUSED_VAR(bb); + HYPRE_UNUSED_VAR(xx); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -355,6 +363,8 @@ HYPRE_EuclidSetParams(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(argc); + HYPRE_UNUSED_VAR(argv); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -381,6 +391,7 @@ HYPRE_EuclidSetParamsFromFile(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(filename); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -398,6 +409,7 @@ HYPRE_EuclidSetLevel(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(level); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -417,6 +429,7 @@ HYPRE_EuclidSetBJ(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + 
HYPRE_UNUSED_VAR(bj); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -436,6 +449,7 @@ HYPRE_EuclidSetStats(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(eu_stats); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -455,6 +469,7 @@ HYPRE_EuclidSetMem(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(eu_mem); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -474,6 +489,7 @@ HYPRE_EuclidSetSparseA(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(sparse_A); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -494,6 +510,7 @@ HYPRE_EuclidSetRowScale(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(row_scale); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else @@ -514,6 +531,7 @@ HYPRE_EuclidSetILUT(HYPRE_Solver solver, HYPRE_UNUSED_VAR(solver); #ifdef HYPRE_MIXEDINT + HYPRE_UNUSED_VAR(ilut); hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Euclid cannot be used in mixedint mode!"); return hypre_error_flag; #else diff --git a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_amg.c b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_amg.c index 80d808d41..5c8dfd456 100644 --- a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_amg.c +++ b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_amg.c @@ -1509,23 +1509,49 @@ HYPRE_BoomerAMGSetFSAIKapTolerance( HYPRE_Solver solver, } /*-------------------------------------------------------------------------- - * HYPRE_BoomerAMGSetNumFunctions, HYPRE_BoomerAMGGetNumFunctions + * HYPRE_BoomerAMGSetNumFunctions *--------------------------------------------------------------------------*/ HYPRE_Int HYPRE_BoomerAMGSetNumFunctions( HYPRE_Solver solver, - HYPRE_Int num_functions ) + HYPRE_Int num_functions ) { return ( hypre_BoomerAMGSetNumFunctions( (void *) solver, num_functions ) ); } +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGGetNumFunctions + *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_BoomerAMGGetNumFunctions( HYPRE_Solver solver, - HYPRE_Int * num_functions ) + HYPRE_Int *num_functions ) { return ( hypre_BoomerAMGGetNumFunctions( (void *) solver, num_functions ) ); } +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGSetFilterFunctions + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGSetFilterFunctions( HYPRE_Solver solver, + HYPRE_Int filter_functions ) +{ + return ( hypre_BoomerAMGSetFilterFunctions( (void *) solver, filter_functions ) ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_BoomerAMGGetFilterFunctions + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_BoomerAMGGetFilterFunctions( HYPRE_Solver solver, + HYPRE_Int *filter_functions ) +{ + return ( hypre_BoomerAMGGetFilterFunctions( (void *) solver, filter_functions ) ); +} + /*-------------------------------------------------------------------------- * HYPRE_BoomerAMGSetNodal 
*--------------------------------------------------------------------------*/ @@ -1547,7 +1573,6 @@ HYPRE_BoomerAMGSetNodalLevels( HYPRE_Solver solver, return ( hypre_BoomerAMGSetNodalLevels( (void *) solver, nodal_levels ) ); } - /*-------------------------------------------------------------------------- * HYPRE_BoomerAMGSetNodalDiag *--------------------------------------------------------------------------*/ diff --git a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h index 7da26a8c0..5f0d0620c 100644 --- a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h +++ b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h @@ -170,7 +170,24 @@ HYPRE_Int HYPRE_BoomerAMGGetFinalRelativeResidualNorm(HYPRE_Solver solver, * The default is 1, i.e. a scalar system. **/ HYPRE_Int HYPRE_BoomerAMGSetNumFunctions(HYPRE_Solver solver, - HYPRE_Int num_functions); + HYPRE_Int num_functions); + +/** + * (Optional) Sets filtering for system of PDEs (\e num_functions > 1). + * + * \param filter_functions An integer flag to enable or disable filtering of inter-variable + * connections in the input matrix used for preconditioning. + * - A value of 0 (default) indicates no filtering, preserving all inter-variable connections. + * - A value of 1 enables filtering, removing inter-variable connections to lower + * operator and memory complexities. + * + * @note This option assumes that variables are stored in an interleaved format, + * where multiple variables are combined in a single vector. Enabling filtering + * can be beneficial when the problem has multiple coupled variables (functions) + * that are not strongly coupled. + **/ +HYPRE_Int HYPRE_BoomerAMGSetFilterFunctions(HYPRE_Solver solver, + HYPRE_Int filter_functions); /** * (Optional) Sets the mapping that assigns the function to each variable, @@ -4085,10 +4102,10 @@ HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, * The default is 0 (no reduction, i.e. keep the reserved cpoints in the coarse grid solve). * * The default setup for the reduction is as follows: - * interp_type = 2 - * restrict_type = 0 - * F-relax method = 99 - * Galerkin coarse grid + * - Interpolation type: Jacobi (2) + * - Restriction type: Injection (0) + * - F-relaxation type: LU factorization with pivoting (99) + * - Coarse grid type: galerkin (0) **/ HYPRE_Int HYPRE_MGRSetReservedCpointsLevelToKeep( HYPRE_Solver solver, HYPRE_Int level); @@ -4145,17 +4162,38 @@ HYPRE_MGRSetLevelFRelaxType(HYPRE_Solver solver, * Options for \e cg_method are: * * - 0 : Galerkin coarse grid computation using RAP. - * - 5 : Galerkin coarse grid computation using RAI (injective prolongation). - * - 1 - 4 : Non-Galerkin coarse grid computation with dropping strategy. + * - 1 - 5 : Non-Galerkin coarse grid computation with dropping strategy. * - 1: inv(A_FF) approximated by its (block) diagonal inverse * - 2: CPR-like approximation with inv(A_FF) approximated by its diagonal inverse * - 3: CPR-like approximation with inv(A_FF) approximated by its block diagonal inverse * - 4: inv(A_FF) approximated by sparse approximate inverse + * - 5: inv(A_FF) is an empty matrix and coarse level matrix is set to A_CC **/ HYPRE_Int HYPRE_MGRSetCoarseGridMethod(HYPRE_Solver solver, HYPRE_Int *cg_method ); +/** + * (Optional) Set the maximum number of nonzeros per row of the coarse grid correction + * operator computed in the Non-Galerkin approach. Options for \e max_elmts are: + * + * - 0: keep only the (block) diagonal portion of the correction matrix (default). 
+ * - k > 0: keep the (block) diagonal plus the k-th largest entries per row + * of the correction matrix. + **/ +HYPRE_Int +HYPRE_MGRSetNonGalerkinMaxElmts(HYPRE_Solver solver, + HYPRE_Int max_elmts); + +/** + * (Optional) Set the maximum number of nonzeros per row of the coarse grid correction + * operator computed in the Non-Galerkin approach at each MGR level. For options, see + * \e HYPRE_MGRSetNonGalerkinMaxElmts. + **/ +HYPRE_Int +HYPRE_MGRSetLevelNonGalerkinMaxElmts(HYPRE_Solver solver, + HYPRE_Int *max_elmts); + /** * (Optional) Set the number of functions for F-relaxation V-cycle. * For problems like elasticity, one may want to perform coarsening and @@ -4279,13 +4317,13 @@ HYPRE_Int HYPRE_MGRSetFSolver(HYPRE_Solver solver, /** * (Optional) Set the F-relaxation solver at a given level. * - * @param level [IN] MGR solver level * @param solver [IN] MGR solver/preconditioner object * @param fsolver [IN] F-relaxation solver object + * @param level [IN] MGR solver level **/ -HYPRE_Int HYPRE_MGRSetFSolverAtLevel(HYPRE_Int level, - HYPRE_Solver solver, - HYPRE_Solver fsolver ); +HYPRE_Int HYPRE_MGRSetFSolverAtLevel(HYPRE_Solver solver, + HYPRE_Solver fsolver, + HYPRE_Int level ); /** * (Optional) Extract A_FF block from matrix A. @@ -4436,12 +4474,59 @@ HYPRE_MGRSetGlobalSmoothType( HYPRE_Solver solver, HYPRE_Int smooth_type ); /** - * (Optional) Determines type of global smoother for each level. - * See \e HYPRE_MGRSetGlobalSmoothType for global smoother options. - **/ + * @brief Sets the type of global smoother for each level in the multigrid reduction (MGR) solver. + * + * This function allows the user to specify the type of global smoother to be used at each level + * of the multigrid reduction process. The types of smoothers available can be found in the + * documentation for \e HYPRE_MGRSetGlobalSmoothType. The smoother type for each level is indicated + * by the \e smooth_type array, which should have a size equal to \e max_num_coarse_levels. + * + * @note This function does not take ownership of the \e smooth_type array. + * @note If \e smooth_type is a NULL pointer, a default global smoother (Jacobi) is used for all levels. + * @note This call is optional. It is intended for advanced users who need specific control over the + * smoothing process at different levels of the solver. If not called, the solver will proceed + * with default smoothing parameters. + * + * @param[in] \e solver The HYPRE solver object to configure. + * @param[in] \e smooth_type An array of integers where each value specifies the type of smoother to + * be used at the corresponding level. + * + * @return HYPRE_Int Error code (0 for success, non-zero for failure). + * + * @see HYPRE_MGRSetGlobalSmoothType for details on global smoother options. + */ + +HYPRE_Int +HYPRE_MGRSetLevelSmoothType(HYPRE_Solver solver, + HYPRE_Int *smooth_type); + +/** + * @brief Sets the global smoother method for a specified MGR level using a HYPRE solver object. + * + * This function enables solvers within hypre to be used as complex smoothers for a specific level + * within the multigrid reduction (MGR) scheme. Users can configure the solver options and pass the + * solver in as the smoother. Currently supported solver options via this interface are ILU and AMG. + * + * @note Unlike some other setup functions that might require an array to set options across multiple + * levels, this function focuses on a single level, identified by the \e level parameter. 
+ * + * @warning The smoother passed to function takes precedence over the smoother type set for that level + * in the MGR hierarchy. + * + * @param[in,out] \e solver A pointer to the MGR solver object. This object is modified to include the + * specified smoother for the given level. + * @param[in] \e smoother The HYPRE solver object that specifies the global relaxation method to be used + * at the specified level. Currently available choices are BoomerAMG and ILU. + * @param[in] \e level The level identifier for which the global relaxation method is to be set. + * Must be within the range of the number of levels in the MGR solver. + * + * @return HYPRE_Int Returns an error code. Success is indicated by 0, while any non-zero value signifies an error. + */ + HYPRE_Int -HYPRE_MGRSetLevelSmoothType( HYPRE_Solver solver, - HYPRE_Int *smooth_type ); +HYPRE_MGRSetGlobalSmootherAtLevel( HYPRE_Solver solver, + HYPRE_Solver smoother, + HYPRE_Int level ); /** * (Optional) Return the number of MGR iterations. diff --git a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_mgr.c b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_mgr.c index f74e9e10f..4123a6f68 100644 --- a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_mgr.c +++ b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_mgr.c @@ -16,7 +16,7 @@ HYPRE_MGRCreate( HYPRE_Solver *solver ) { if (!solver) { - hypre_error_in_arg(2); + hypre_error_in_arg(1); return hypre_error_flag; } *solver = ( (HYPRE_Solver) hypre_MGRCreate( ) ); @@ -99,7 +99,7 @@ HYPRE_MGRDirectSolverCreate( HYPRE_Solver *solver ) { if (!solver) { - hypre_error_in_arg(2); + hypre_error_in_arg(1); return hypre_error_flag; } *solver = ( (HYPRE_Solver) hypre_MGRDirectSolverCreate( ) ); @@ -211,11 +211,23 @@ HYPRE_MGRSetNonCpointsToFpoints( HYPRE_Solver solver, HYPRE_Int nonCptToFptFlag) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetFSolver(HYPRE_Solver solver, +HYPRE_MGRSetFSolver(HYPRE_Solver solver, HYPRE_PtrToParSolverFcn fine_grid_solver_solve, HYPRE_PtrToParSolverFcn fine_grid_solver_setup, - HYPRE_Solver fsolver ) + HYPRE_Solver fsolver ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (!fsolver) + { + hypre_error_in_arg(4); + return hypre_error_flag; + } + + return ( hypre_MGRSetFSolver( (void *) solver, (HYPRE_Int (*)(void*, void*, void*, void*)) fine_grid_solver_solve, (HYPRE_Int (*)(void*, void*, void*, void*)) fine_grid_solver_setup, @@ -227,13 +239,24 @@ HYPRE_MGRSetFSolver(HYPRE_Solver solver, *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetFSolverAtLevel(HYPRE_Int level, - HYPRE_Solver solver, - HYPRE_Solver fsolver ) +HYPRE_MGRSetFSolverAtLevel(HYPRE_Solver solver, + HYPRE_Solver fsolver, + HYPRE_Int level ) { - return ( hypre_MGRSetFSolverAtLevel( level, - (void *) solver, - (void *) fsolver ) ); + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (!fsolver) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + + return ( hypre_MGRSetFSolverAtLevel( (void *) solver, + (void *) fsolver, + level ) ); } /*-------------------------------------------------------------------------- @@ -254,11 +277,22 @@ HYPRE_MGRBuildAff(HYPRE_ParCSRMatrix A, *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetCoarseSolver(HYPRE_Solver solver, +HYPRE_MGRSetCoarseSolver(HYPRE_Solver solver, HYPRE_PtrToParSolverFcn coarse_grid_solver_solve, HYPRE_PtrToParSolverFcn 
coarse_grid_solver_setup, - HYPRE_Solver coarse_grid_solver ) + HYPRE_Solver coarse_grid_solver ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (!coarse_grid_solver) + { + hypre_error_in_arg(4); + return hypre_error_flag; + } + return ( hypre_MGRSetCoarseSolver( (void *) solver, (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_solve, (HYPRE_Int (*)(void*, void*, void*, void*)) coarse_grid_solver_setup, @@ -270,8 +304,15 @@ HYPRE_MGRSetCoarseSolver(HYPRE_Solver solver, *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev ) +HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, + HYPRE_Int maxlev ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetMaxCoarseLevels(solver, maxlev); } @@ -280,8 +321,15 @@ HYPRE_MGRSetMaxCoarseLevels( HYPRE_Solver solver, HYPRE_Int maxlev ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize ) +HYPRE_MGRSetBlockSize( HYPRE_Solver solver, + HYPRE_Int bsize ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetBlockSize(solver, bsize ); } @@ -290,10 +338,27 @@ HYPRE_MGRSetBlockSize( HYPRE_Solver solver, HYPRE_Int bsize ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_size, - HYPRE_BigInt *reserved_coarse_indexes ) +HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, + HYPRE_Int reserved_coarse_size, + HYPRE_BigInt *reserved_coarse_indices ) { - return hypre_MGRSetReservedCoarseNodes(solver, reserved_coarse_size, reserved_coarse_indexes ); + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (reserved_coarse_size < 0) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + else if (!reserved_coarse_indices) + { + hypre_error_in_arg(3); + return hypre_error_flag; + } + + return hypre_MGRSetReservedCoarseNodes(solver, reserved_coarse_size, reserved_coarse_indices); } /*-------------------------------------------------------------------------- @@ -301,8 +366,15 @@ HYPRE_MGRSetReservedCoarseNodes( HYPRE_Solver solver, HYPRE_Int reserved_coarse_ *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetReservedCpointsLevelToKeep( HYPRE_Solver solver, HYPRE_Int level) +HYPRE_MGRSetReservedCpointsLevelToKeep( HYPRE_Solver solver, + HYPRE_Int level) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetReservedCpointsLevelToKeep((void *) solver, level); } @@ -311,8 +383,15 @@ HYPRE_MGRSetReservedCpointsLevelToKeep( HYPRE_Solver solver, HYPRE_Int level) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetRestrictType(HYPRE_Solver solver, HYPRE_Int restrict_type ) +HYPRE_MGRSetRestrictType(HYPRE_Solver solver, + HYPRE_Int restrict_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetRestrictType(solver, restrict_type ); } @@ -321,8 +400,15 @@ HYPRE_MGRSetRestrictType(HYPRE_Solver solver, HYPRE_Int restrict_type ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelRestrictType( HYPRE_Solver solver, HYPRE_Int *restrict_type ) 
+HYPRE_MGRSetLevelRestrictType( HYPRE_Solver solver, + HYPRE_Int *restrict_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelRestrictType( solver, restrict_type ); } @@ -331,8 +417,15 @@ HYPRE_MGRSetLevelRestrictType( HYPRE_Solver solver, HYPRE_Int *restrict_type ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method ) +HYPRE_MGRSetFRelaxMethod( HYPRE_Solver solver, + HYPRE_Int relax_method ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetFRelaxMethod(solver, relax_method ); } @@ -341,8 +434,15 @@ HYPRE_MGRSetFRelaxMethod(HYPRE_Solver solver, HYPRE_Int relax_method ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelFRelaxMethod( HYPRE_Solver solver, HYPRE_Int *relax_method ) +HYPRE_MGRSetLevelFRelaxMethod( HYPRE_Solver solver, + HYPRE_Int *relax_method ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelFRelaxMethod( solver, relax_method ); } @@ -351,8 +451,15 @@ HYPRE_MGRSetLevelFRelaxMethod( HYPRE_Solver solver, HYPRE_Int *relax_method ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelFRelaxType( HYPRE_Solver solver, HYPRE_Int *relax_type ) +HYPRE_MGRSetLevelFRelaxType( HYPRE_Solver solver, + HYPRE_Int *relax_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelFRelaxType( solver, relax_type ); } @@ -363,16 +470,49 @@ HYPRE_MGRSetLevelFRelaxType( HYPRE_Solver solver, HYPRE_Int *relax_type ) HYPRE_Int HYPRE_MGRSetCoarseGridMethod( HYPRE_Solver solver, HYPRE_Int *cg_method ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetCoarseGridMethod( solver, cg_method ); } +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetNonGalerkinMaxElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetNonGalerkinMaxElmts( HYPRE_Solver solver, HYPRE_Int max_elmts ) +{ + return hypre_MGRSetNonGalerkinMaxElmts( solver, max_elmts ); +} + +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetLevelNonGalerkinMaxElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetLevelNonGalerkinMaxElmts( HYPRE_Solver solver, HYPRE_Int *max_elmts ) +{ + return hypre_MGRSetLevelNonGalerkinMaxElmts( solver, max_elmts ); +} + /*-------------------------------------------------------------------------- * HYPRE_MGRSetLevelFRelaxNumFunctions *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelFRelaxNumFunctions( HYPRE_Solver solver, HYPRE_Int *num_functions ) +HYPRE_MGRSetLevelFRelaxNumFunctions( HYPRE_Solver solver, + HYPRE_Int *num_functions ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelFRelaxNumFunctions( solver, num_functions ); } @@ -381,8 +521,15 @@ HYPRE_MGRSetLevelFRelaxNumFunctions( HYPRE_Solver solver, HYPRE_Int *num_functio *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type ) +HYPRE_MGRSetRelaxType( HYPRE_Solver solver, + HYPRE_Int 
relax_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetRelaxType(solver, relax_type ); } @@ -391,17 +538,32 @@ HYPRE_MGRSetRelaxType(HYPRE_Solver solver, HYPRE_Int relax_type ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) +HYPRE_MGRSetNumRelaxSweeps( HYPRE_Solver solver, + HYPRE_Int nsweeps ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetNumRelaxSweeps(solver, nsweeps); } /*-------------------------------------------------------------------------- * HYPRE_MGRSetLevelNumRelaxSweeps *--------------------------------------------------------------------------*/ + HYPRE_Int -HYPRE_MGRSetLevelNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int *nsweeps ) +HYPRE_MGRSetLevelNumRelaxSweeps( HYPRE_Solver solver, + HYPRE_Int *nsweeps ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelNumRelaxSweeps(solver, nsweeps); } @@ -410,8 +572,15 @@ HYPRE_MGRSetLevelNumRelaxSweeps( HYPRE_Solver solver, HYPRE_Int *nsweeps ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interpType ) +HYPRE_MGRSetInterpType( HYPRE_Solver solver, + HYPRE_Int interpType ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetInterpType(solver, interpType); } @@ -420,8 +589,15 @@ HYPRE_MGRSetInterpType( HYPRE_Solver solver, HYPRE_Int interpType ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelInterpType( HYPRE_Solver solver, HYPRE_Int *interpType ) +HYPRE_MGRSetLevelInterpType( HYPRE_Solver solver, + HYPRE_Int *interpType ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelInterpType(solver, interpType); } @@ -430,8 +606,15 @@ HYPRE_MGRSetLevelInterpType( HYPRE_Solver solver, HYPRE_Int *interpType ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) +HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, + HYPRE_Int nsweeps ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetNumInterpSweeps(solver, nsweeps); } @@ -440,8 +623,15 @@ HYPRE_MGRSetNumInterpSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetNumRestrictSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) +HYPRE_MGRSetNumRestrictSweeps( HYPRE_Solver solver, + HYPRE_Int nsweeps ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetNumRestrictSweeps(solver, nsweeps); } @@ -450,17 +640,37 @@ HYPRE_MGRSetNumRestrictSweeps( HYPRE_Solver solver, HYPRE_Int nsweeps ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetTruncateCoarseGridThreshold( HYPRE_Solver solver, HYPRE_Real threshold) +HYPRE_MGRSetTruncateCoarseGridThreshold( HYPRE_Solver solver, + HYPRE_Real threshold) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetTruncateCoarseGridThreshold( solver, threshold ); } /*-------------------------------------------------------------------------- * HYPRE_MGRSetBlockJacobiBlockSize 
*--------------------------------------------------------------------------*/ + HYPRE_Int -HYPRE_MGRSetBlockJacobiBlockSize( HYPRE_Solver solver, HYPRE_Int blk_size ) +HYPRE_MGRSetBlockJacobiBlockSize( HYPRE_Solver solver, + HYPRE_Int blk_size ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (blk_size < 1) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + return hypre_MGRSetBlockJacobiBlockSize(solver, blk_size); } @@ -469,8 +679,15 @@ HYPRE_MGRSetBlockJacobiBlockSize( HYPRE_Solver solver, HYPRE_Int blk_size ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetFrelaxPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) +HYPRE_MGRSetFrelaxPrintLevel( HYPRE_Solver solver, + HYPRE_Int print_level ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetFrelaxPrintLevel( solver, print_level ); } @@ -479,8 +696,15 @@ HYPRE_MGRSetFrelaxPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetCoarseGridPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) +HYPRE_MGRSetCoarseGridPrintLevel( HYPRE_Solver solver, + HYPRE_Int print_level ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetCoarseGridPrintLevel( solver, print_level ); } @@ -489,8 +713,15 @@ HYPRE_MGRSetCoarseGridPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) +HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, + HYPRE_Int print_level ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetPrintLevel( solver, print_level ); } @@ -499,8 +730,15 @@ HYPRE_MGRSetPrintLevel( HYPRE_Solver solver, HYPRE_Int print_level ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging ) +HYPRE_MGRSetLogging( HYPRE_Solver solver, + HYPRE_Int logging ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLogging(solver, logging ); } @@ -509,8 +747,20 @@ HYPRE_MGRSetLogging( HYPRE_Solver solver, HYPRE_Int logging ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter ) +HYPRE_MGRSetMaxIter( HYPRE_Solver solver, + HYPRE_Int max_iter ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (max_iter < 0) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + return hypre_MGRSetMaxIter( solver, max_iter ); } @@ -519,8 +769,20 @@ HYPRE_MGRSetMaxIter( HYPRE_Solver solver, HYPRE_Int max_iter ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol ) +HYPRE_MGRSetTol( HYPRE_Solver solver, + HYPRE_Real tol ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (tol < 0.0) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + return hypre_MGRSetTol( solver, tol ); } @@ -529,54 +791,128 @@ HYPRE_MGRSetTol( HYPRE_Solver solver, HYPRE_Real tol ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetMaxGlobalSmoothIters( 
HYPRE_Solver solver, HYPRE_Int max_iter ) +HYPRE_MGRSetMaxGlobalSmoothIters( HYPRE_Solver solver, + HYPRE_Int max_iter ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (max_iter < 0) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + return hypre_MGRSetMaxGlobalSmoothIters(solver, max_iter); } + /*-------------------------------------------------------------------------- * HYPRE_MGRSetLevelsmoothIters *--------------------------------------------------------------------------*/ + HYPRE_Int -HYPRE_MGRSetLevelSmoothIters( HYPRE_Solver solver, - HYPRE_Int *smooth_iters ) +HYPRE_MGRSetLevelSmoothIters( HYPRE_Solver solver, + HYPRE_Int *smooth_iters ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelSmoothIters(solver, smooth_iters); } /*-------------------------------------------------------------------------- * HYPRE_MGRSetGlobalsmoothType *--------------------------------------------------------------------------*/ + HYPRE_Int -HYPRE_MGRSetGlobalSmoothType( HYPRE_Solver solver, HYPRE_Int smooth_type ) +HYPRE_MGRSetGlobalSmoothType( HYPRE_Solver solver, + HYPRE_Int smooth_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetGlobalSmoothType(solver, smooth_type); } + /*-------------------------------------------------------------------------- * HYPRE_MGRSetLevelsmoothType *--------------------------------------------------------------------------*/ + HYPRE_Int -HYPRE_MGRSetLevelSmoothType( HYPRE_Solver solver, - HYPRE_Int *smooth_type ) +HYPRE_MGRSetLevelSmoothType( HYPRE_Solver solver, + HYPRE_Int *smooth_type ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelSmoothType(solver, smooth_type); } + /*-------------------------------------------------------------------------- * HYPRE_MGRSetGlobalSmoothCycle *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_MGRSetGlobalSmoothCycle( HYPRE_Solver solver, - HYPRE_Int global_smooth_cycle ) + HYPRE_Int global_smooth_cycle ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetGlobalSmoothCycle(solver, global_smooth_cycle); } +/*-------------------------------------------------------------------------- + * HYPRE_MGRSetGlobalSmootherAtLevel + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_MGRSetGlobalSmootherAtLevel( HYPRE_Solver solver, + HYPRE_Solver smoother, + HYPRE_Int level ) +{ + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + else if (!smoother) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + + return hypre_MGRSetGlobalSmootherAtLevel((void*) solver, smoother, level); +} + /*-------------------------------------------------------------------------- * HYPRE_MGRSetPMaxElmts *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetPMaxElmts( HYPRE_Solver solver, HYPRE_Int P_max_elmts ) +HYPRE_MGRSetPMaxElmts( HYPRE_Solver solver, + HYPRE_Int P_max_elmts ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetPMaxElmts(solver, P_max_elmts); } @@ -585,8 +921,15 @@ HYPRE_MGRSetPMaxElmts( HYPRE_Solver solver, HYPRE_Int P_max_elmts ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRSetLevelPMaxElmts( HYPRE_Solver solver, HYPRE_Int 
*P_max_elmts ) +HYPRE_MGRSetLevelPMaxElmts( HYPRE_Solver solver, + HYPRE_Int *P_max_elmts ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRSetLevelPMaxElmts(solver, P_max_elmts); } @@ -595,8 +938,15 @@ HYPRE_MGRSetLevelPMaxElmts( HYPRE_Solver solver, HYPRE_Int *P_max_elmts ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRGetCoarseGridConvergenceFactor( HYPRE_Solver solver, HYPRE_Real *conv_factor ) +HYPRE_MGRGetCoarseGridConvergenceFactor( HYPRE_Solver solver, + HYPRE_Real *conv_factor ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRGetCoarseGridConvergenceFactor( solver, conv_factor ); } @@ -605,8 +955,15 @@ HYPRE_MGRGetCoarseGridConvergenceFactor( HYPRE_Solver solver, HYPRE_Real *conv_f *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations ) +HYPRE_MGRGetNumIterations( HYPRE_Solver solver, + HYPRE_Int *num_iterations ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRGetNumIterations( solver, num_iterations ); } @@ -615,7 +972,14 @@ HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations ) *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_MGRGetFinalRelativeResidualNorm( HYPRE_Solver solver, HYPRE_Real *res_norm ) +HYPRE_MGRGetFinalRelativeResidualNorm( HYPRE_Solver solver, + HYPRE_Real *res_norm ) { + if (!solver) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + return hypre_MGRGetFinalRelativeResidualNorm(solver, res_norm); } diff --git a/3rd_party/hypre/src/parcsr_ls/Makefile b/3rd_party/hypre/src/parcsr_ls/Makefile index b5e7b5ab8..9eaa82ce4 100644 --- a/3rd_party/hypre/src/parcsr_ls/Makefile +++ b/3rd_party/hypre/src/parcsr_ls/Makefile @@ -127,6 +127,7 @@ FILES =\ par_mgr.c\ par_mgr_coarsen.c\ par_mgr_interp.c\ + par_mgr_rap.c\ par_mgr_setup.c\ par_mgr_solve.c\ par_mgr_stats.c\ diff --git a/3rd_party/hypre/src/parcsr_ls/_hypre_parcsr_ls.h b/3rd_party/hypre/src/parcsr_ls/_hypre_parcsr_ls.h index 70bcfb5d6..9cef339a3 100644 --- a/3rd_party/hypre/src/parcsr_ls/_hypre_parcsr_ls.h +++ b/3rd_party/hypre/src/parcsr_ls/_hypre_parcsr_ls.h @@ -110,6 +110,7 @@ typedef struct hypre_ParCSRMatrix *A; HYPRE_Int num_variables; HYPRE_Int num_functions; + HYPRE_Int filter_functions; HYPRE_Int nodal; HYPRE_Int nodal_levels; HYPRE_Int nodal_diag; @@ -380,8 +381,9 @@ typedef struct #define hypre_ParAMGDataOuterWt(amg_data) ((amg_data)->outer_wt) /* problem data parameters */ -#define hypre_ParAMGDataNumVariables(amg_data) ((amg_data)->num_variables) +#define hypre_ParAMGDataNumVariables(amg_data) ((amg_data)->num_variables) #define hypre_ParAMGDataNumFunctions(amg_data) ((amg_data)->num_functions) +#define hypre_ParAMGDataFilterFunctions(amg_data) ((amg_data)->filter_functions) #define hypre_ParAMGDataNodal(amg_data) ((amg_data)->nodal) #define hypre_ParAMGDataNodalLevels(amg_data) ((amg_data)->nodal_levels) #define hypre_ParAMGDataNodalDiag(amg_data) ((amg_data)->nodal_diag) @@ -1980,6 +1982,8 @@ HYPRE_Int HYPRE_BoomerAMGSetFSAIThreshold ( HYPRE_Solver solver, HYPRE_Real thre HYPRE_Int HYPRE_BoomerAMGSetFSAIKapTolerance ( HYPRE_Solver solver, HYPRE_Real kap_tolerance ); HYPRE_Int HYPRE_BoomerAMGSetNumFunctions ( HYPRE_Solver solver, HYPRE_Int num_functions ); HYPRE_Int HYPRE_BoomerAMGGetNumFunctions ( HYPRE_Solver solver, 
HYPRE_Int *num_functions ); +HYPRE_Int HYPRE_BoomerAMGSetFilterFunctions ( HYPRE_Solver solver, HYPRE_Int filter_functions ); +HYPRE_Int HYPRE_BoomerAMGGetFilterFunctions ( HYPRE_Solver solver, HYPRE_Int *filter_functions ); HYPRE_Int HYPRE_BoomerAMGSetNodal ( HYPRE_Solver solver, HYPRE_Int nodal ); HYPRE_Int HYPRE_BoomerAMGSetNodalLevels ( HYPRE_Solver solver, HYPRE_Int nodal_levels ); HYPRE_Int HYPRE_BoomerAMGSetNodalDiag ( HYPRE_Solver solver, HYPRE_Int nodal ); @@ -2568,6 +2572,8 @@ HYPRE_Int hypre_BoomerAMGSetCoordinates ( void *data, float *coordinates ); HYPRE_Int hypre_BoomerAMGGetGridHierarchy(void *data, HYPRE_Int *cgrid ); HYPRE_Int hypre_BoomerAMGSetNumFunctions ( void *data, HYPRE_Int num_functions ); HYPRE_Int hypre_BoomerAMGGetNumFunctions ( void *data, HYPRE_Int *num_functions ); +HYPRE_Int hypre_BoomerAMGSetFilterFunctions ( void *data, HYPRE_Int filter_functions ); +HYPRE_Int hypre_BoomerAMGGetFilterFunctions ( void *data, HYPRE_Int *filter_functions ); HYPRE_Int hypre_BoomerAMGSetNodal ( void *data, HYPRE_Int nodal ); HYPRE_Int hypre_BoomerAMGSetNodalLevels ( void *data, HYPRE_Int nodal_levels ); HYPRE_Int hypre_BoomerAMGSetNodalDiag ( void *data, HYPRE_Int nodal ); @@ -3606,7 +3612,7 @@ HYPRE_Int hypre_MGRSetFSolver( void *mgr_vdata, HYPRE_Int (*fine_grid_solver_solve)(void*, void*, void*, void*), HYPRE_Int (*fine_grid_solver_setup)(void*, void*, void*, void*), void *fsolver ); -HYPRE_Int hypre_MGRSetFSolverAtLevel( HYPRE_Int level, void *mgr_vdata, void *fsolver ); +HYPRE_Int hypre_MGRSetFSolverAtLevel( void *mgr_vdata, void *fsolver, HYPRE_Int level ); HYPRE_Int hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ); HYPRE_Int hypre_MGRSolve( void *mgr_vdata, hypre_ParCSRMatrix *A, @@ -3648,14 +3654,6 @@ HYPRE_Int hypre_MGRAddVectorR( hypre_IntArray *CF_marker, HYPRE_Int point_type, hypre_ParVector **toVector ); HYPRE_Int hypre_MGRTruncateAcfCPRDevice( hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix **A_CF_new_ptr ); -HYPRE_Int hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, - hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, hypre_ParCSRMatrix *Wr, - HYPRE_Int bsize, HYPRE_Int ordering, - HYPRE_Int method, HYPRE_Int max_elmts, - hypre_ParCSRMatrix **A_H_ptr); HYPRE_Int hypre_MGRSetAffSolverType( void *systg_vdata, HYPRE_Int *aff_solver_type ); HYPRE_Int hypre_MGRSetCoarseSolverType( void *systg_vdata, HYPRE_Int coarse_solver_type ); HYPRE_Int hypre_MGRSetCoarseSolverIter( void *systg_vdata, HYPRE_Int coarse_solver_iter ); @@ -3669,6 +3667,8 @@ HYPRE_Int hypre_MGRSetLevelFRelaxMethod( void *mgr_vdata, HYPRE_Int *relax_metho HYPRE_Int hypre_MGRSetLevelFRelaxType( void *mgr_vdata, HYPRE_Int *relax_type ); HYPRE_Int hypre_MGRSetLevelFRelaxNumFunctions( void *mgr_vdata, HYPRE_Int *num_functions ); HYPRE_Int hypre_MGRSetCoarseGridMethod( void *mgr_vdata, HYPRE_Int *cg_method ); +HYPRE_Int hypre_MGRSetNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int max_elmts ); +HYPRE_Int hypre_MGRSetLevelNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int *max_elmts ); HYPRE_Int hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int restrictType ); HYPRE_Int hypre_MGRSetLevelRestrictType( void *mgr_vdata, HYPRE_Int *restrictType ); HYPRE_Int hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType ); @@ -3680,6 +3680,8 @@ HYPRE_Int hypre_MGRSetNumRestrictSweeps( void *mgr_vdata, HYPRE_Int nsweeps ); HYPRE_Int hypre_MGRSetLevelSmoothType( void *mgr_vdata, HYPRE_Int 
*level_smooth_type ); HYPRE_Int hypre_MGRSetLevelSmoothIters( void *mgr_vdata, HYPRE_Int *level_smooth_iters ); HYPRE_Int hypre_MGRSetGlobalSmoothCycle( void *mgr_vdata, HYPRE_Int global_smooth_cycle ); +HYPRE_Int hypre_MGRSetGlobalSmootherAtLevel( void *mgr_vdata, HYPRE_Solver smoother, + HYPRE_Int level ); HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); HYPRE_Int hypre_MGRSetFrelaxPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); HYPRE_Int hypre_MGRSetCoarseGridPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); @@ -3706,12 +3708,12 @@ HYPRE_Int hypre_MGRGetCoarseGridConvergenceFactor( void *mgr_data, HYPRE_Real *c /* par_mgr_interp.c */ HYPRE_Int hypre_MGRBuildInterp( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, HYPRE_Int *CF_marker, - hypre_ParCSRMatrix *S, HYPRE_BigInt *num_cpts_global, + hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *S, + hypre_IntArray *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Real trunc_factor, HYPRE_Int max_elmts, - HYPRE_Int block_jacobi_bsize, - hypre_ParCSRMatrix **P_tr, HYPRE_Int method, - HYPRE_Int num_sweeps_post ); + HYPRE_Int block_jacobi_bsize, HYPRE_Int method, + HYPRE_Int num_sweeps_post, hypre_ParCSRMatrix **Wp_ptr, + hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *A_CF, hypre_IntArray *CF_marker, HYPRE_BigInt *num_cpts_global, @@ -3726,32 +3728,39 @@ HYPRE_Int hypre_MGRBuildPFromWpHost( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix * HYPRE_Int *CF_marker, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildBlockJacobiWp( hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, HYPRE_Int blk_size, hypre_ParCSRMatrix **Wp_ptr ); -HYPRE_Int hypre_MGRBuildPBlockJacobi( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, +HYPRE_Int hypre_MGRBuildBlockJacobiP( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *Wp, HYPRE_Int blk_size, HYPRE_Int *CF_marker, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildP( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Int method, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_ptr ); -HYPRE_Int hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, +HYPRE_Int hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Int method, - hypre_ParCSRMatrix **P_ptr ); + hypre_ParCSRMatrix **Wp_ptr, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildInterpApproximateInverse( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRTruncateAcfCPR( hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix **A_CF_new_ptr ); -HYPRE_Int hypre_MGRBuildRFromW( HYPRE_Int *C_map, HYPRE_Int *F_map, +HYPRE_Int hypre_MGRBuildRFromW( hypre_IntArray *C_map, hypre_IntArray *F_map, HYPRE_BigInt global_num_rows_R, HYPRE_BigInt global_num_cols_R, HYPRE_BigInt *row_starts_R, HYPRE_BigInt *col_starts_R, hypre_ParCSRMatrix *W, hypre_ParCSRMatrix **R_ptr ); HYPRE_Int hypre_MGRBlockColLumpedRestrict( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_CF, hypre_IntArray *CF_marker, - HYPRE_Int block_dim, hypre_ParCSRMatrix **W_ptr, + HYPRE_Int blk_dim, hypre_ParCSRMatrix **W_ptr, hypre_ParCSRMatrix **R_ptr); HYPRE_Int hypre_MGRColLumpedRestrict(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_CF, 
hypre_IntArray *CF_marker, hypre_ParCSRMatrix **W_ptr, hypre_ParCSRMatrix **R_ptr); +/* par_mgr_rap.c */ +HYPRE_Int hypre_MGRBuildCoarseOperator(void *mgr_data, hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix **A_CC_ptr, hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, HYPRE_Int level); + /* par_mgr_coarsen.c */ HYPRE_Int hypre_MGRCoarseParms( MPI_Comm comm, HYPRE_Int num_rows, hypre_IntArray *CF_marker, HYPRE_BigInt *row_starts_cpts, HYPRE_BigInt *row_starts_fpts ); @@ -3776,12 +3785,6 @@ HYPRE_Int hypre_ParCSRMatrixBlockDiagMatrixDevice( hypre_ParCSRMatrix *A, HYPRE_ HYPRE_Int point_type, HYPRE_Int *CF_marker, HYPRE_Int diag_type, hypre_ParCSRMatrix **B_ptr ); -HYPRE_Int hypre_MGRComputeNonGalerkinCGDevice( hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, hypre_ParCSRMatrix *Wr, - HYPRE_Int blk_size, HYPRE_Int method, - HYPRE_Complex threshold, - hypre_ParCSRMatrix **A_H_ptr ); /* par_mgr_stats.c */ HYPRE_Int hypre_MGRSetupStats( void *mgr_vdata ); diff --git a/3rd_party/hypre/src/parcsr_ls/ams.c b/3rd_party/hypre/src/parcsr_ls/ams.c index b8271289b..ffd4262b0 100644 --- a/3rd_party/hypre/src/parcsr_ls/ams.c +++ b/3rd_party/hypre/src/parcsr_ls/ams.c @@ -1907,7 +1907,7 @@ hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, { HYPRE_UNUSED_VAR(A); - hypre_ParCSRMatrix *Pix, *Piy, *Piz = NULL; + hypre_ParCSRMatrix *Pix, *Piy = NULL, *Piz = NULL; #if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(G) ); @@ -2378,11 +2378,13 @@ hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, #endif { if (G_offd_ncols) + { for (i = 0; i < G_offd_nrows + 1; i++) { Pix_offd_I[i] = G_offd_I[i]; Piy_offd_I[i] = G_offd_I[i]; } + } for (i = 0; i < G_offd_nnz; i++) { @@ -2391,11 +2393,13 @@ hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, } for (i = 0; i < G_offd_nrows; i++) + { for (j = G_offd_I[i]; j < G_offd_I[i + 1]; j++) { *Pix_offd_data++ = hypre_abs(G_offd_data[j]) * 0.5 * Gx_data[i]; *Piy_offd_data++ = hypre_abs(G_offd_data[j]) * 0.5 * Gy_data[i]; } + } } for (i = 0; i < G_offd_ncols; i++) @@ -4550,7 +4554,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd); HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd); - HYPRE_Real diag = 1.0; + HYPRE_Real diag = 0.0; HYPRE_Real *l1_norm = hypre_TAlloc(HYPRE_Real, num_rows, hypre_ParCSRMatrixMemoryLocation(A)); HYPRE_Int ii, ns, ne, rest, size; @@ -4730,6 +4734,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, if (cf_marker == NULL) { /* Add the diagonal and the local off-thread part of the ith row */ + diag = 0.0; for (j = A_diag_I[i]; j < A_diag_I[i + 1]; j++) { ii = A_diag_J[j]; @@ -4760,6 +4765,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, { cf_diag = cf_marker[i]; /* Add the diagonal and the local off-thread part of the ith row */ + diag = 0.0; for (j = A_diag_I[i]; j < A_diag_I[i + 1]; j++) { ii = A_diag_J[j]; @@ -4792,7 +4798,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, } /* Truncate according to Remark 6.2 */ - if (l1_norm[i] <= 4.0 / 3.0 * diag) + if (l1_norm[i] <= (4.0 / 3.0) * diag) { l1_norm[i] = diag; } @@ -4816,6 +4822,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, if (cf_marker == NULL) { /* Add the diagonal and the local off-thread part of the ith row */ + diag = 0.0; for (j = A_diag_I[i]; j < A_diag_I[i + 1]; j++) { ii = A_diag_J[j]; @@ -4846,6 
+4853,7 @@ hypre_ParCSRComputeL1NormsThreads(hypre_ParCSRMatrix *A, { cf_diag = cf_marker[i]; /* Add the diagonal and the local off-thread part of the ith row */ + diag = 0.0; for (j = A_diag_I[i]; j < A_diag_I[i + 1]; j++) { ii = A_diag_J[j]; diff --git a/3rd_party/hypre/src/parcsr_ls/dsuperlu.c b/3rd_party/hypre/src/parcsr_ls/dsuperlu.c index 95864c9a4..3e044aec8 100644 --- a/3rd_party/hypre/src/parcsr_ls/dsuperlu.c +++ b/3rd_party/hypre/src/parcsr_ls/dsuperlu.c @@ -33,6 +33,11 @@ hypre_DSLUData; #endif */ + +/*-------------------------------------------------------------------------- + * hypre_SLUDistSetup + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SLUDistSetup(HYPRE_Solver *solver, hypre_ParCSRMatrix *A, @@ -46,7 +51,6 @@ hypre_SLUDistSetup(HYPRE_Solver *solver, HYPRE_Int pcols = 1; HYPRE_Int prows = 1; hypre_DSLUData *dslu_data = NULL; - HYPRE_Int info = 0; HYPRE_Int nrhs = 0; HYPRE_Int num_rows; @@ -54,6 +58,7 @@ hypre_SLUDistSetup(HYPRE_Solver *solver, HYPRE_Int i; /* SuperLU_Dist variables. Note it uses "int_t" to denote integer types */ + hypre_int slu_info = 0; int_t *slu_rowptr; int_t *slu_colidx; hypre_double *slu_data; @@ -185,7 +190,7 @@ hypre_SLUDistSetup(HYPRE_Solver *solver, pdgssvx(&(dslu_data->dslu_options), &(dslu_data->A_dslu), &(dslu_data->dslu_ScalePermstruct), NULL, num_rows, nrhs, &(dslu_data->dslu_data_grid), &(dslu_data->dslu_data_LU), - &(dslu_data->dslu_solve), dslu_data->berr, &(dslu_data->dslu_data_stat), &info); + &(dslu_data->dslu_solve), dslu_data->berr, &(dslu_data->dslu_data_stat), &slu_info); dslu_data->dslu_options.Fact = FACTORED; *solver = (HYPRE_Solver) dslu_data; @@ -193,24 +198,29 @@ hypre_SLUDistSetup(HYPRE_Solver *solver, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SLUDistSolve + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SLUDistSolve(void *solver, hypre_ParVector *b, hypre_ParVector *x) { hypre_DSLUData *dslu_data = (hypre_DSLUData *) solver; - HYPRE_Int info = 0; HYPRE_Real *x_data; - hypre_ParVector *x_host = NULL; HYPRE_Int size = hypre_VectorSize(hypre_ParVectorLocalVector(x)); HYPRE_Int nrhs = 1; HYPRE_Int i; + hypre_int slu_info; hypre_double *slu_data; hypre_ParVectorCopy(b, x); #if defined(HYPRE_USING_GPU) + hypre_ParVector *x_host = NULL; + if (hypre_GetActualMemLocation(hypre_ParVectorMemoryLocation(x)) != hypre_MEMORY_HOST) { x_host = hypre_ParVectorCloneDeep_v2(x, HYPRE_MEMORY_HOST); @@ -247,7 +257,7 @@ hypre_SLUDistSolve(void *solver, &(dslu_data->dslu_solve), dslu_data->berr, &(dslu_data->dslu_data_stat), - &info); + &slu_info); /* Free memory */ if ((void*) slu_data != (void*) x_data) @@ -266,6 +276,10 @@ hypre_SLUDistSolve(void *solver, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SLUDistDestroy + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SLUDistDestroy(void* solver) { diff --git a/3rd_party/hypre/src/parcsr_ls/par_amg.c b/3rd_party/hypre/src/parcsr_ls/par_amg.c index b81ca52f9..b10876e1d 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_amg.c +++ b/3rd_party/hypre/src/parcsr_ls/par_amg.c @@ -48,6 +48,7 @@ hypre_BoomerAMGCreate( void ) HYPRE_Int setup_type; HYPRE_Int P_max_elmts; HYPRE_Int num_functions; + HYPRE_Int filter_functions; HYPRE_Int nodal, nodal_levels, nodal_diag; HYPRE_Int keep_same_sign; HYPRE_Int num_paths; @@ -182,6 
+183,7 @@ hypre_BoomerAMGCreate( void ) agg_P_max_elmts = 0; agg_P12_max_elmts = 0; num_functions = 1; + filter_functions = 0; nodal = 0; nodal_levels = max_levels; nodal_diag = 0; @@ -359,6 +361,7 @@ hypre_BoomerAMGCreate( void ) hypre_BoomerAMGSetAggPMaxElmts(amg_data, agg_P_max_elmts); hypre_BoomerAMGSetAggP12MaxElmts(amg_data, agg_P12_max_elmts); hypre_BoomerAMGSetNumFunctions(amg_data, num_functions); + hypre_BoomerAMGSetFilterFunctions(amg_data, filter_functions); hypre_BoomerAMGSetNodal(amg_data, nodal); hypre_BoomerAMGSetNodalLevels(amg_data, nodal_levels); hypre_BoomerAMGSetNodal(amg_data, nodal_diag); @@ -3201,9 +3204,13 @@ hypre_BoomerAMGSetNumFunctions( void *data, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_BoomerAMGGetNumFunctions + *--------------------------------------------------------------------------*/ + HYPRE_Int -hypre_BoomerAMGGetNumFunctions( void *data, - HYPRE_Int * num_functions ) +hypre_BoomerAMGGetNumFunctions( void *data, + HYPRE_Int *num_functions ) { hypre_ParAMGData *amg_data = (hypre_ParAMGData*) data; @@ -3217,6 +3224,51 @@ hypre_BoomerAMGGetNumFunctions( void *data, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_BoomerAMGSetFilterFunctions + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_BoomerAMGSetFilterFunctions( void *data, + HYPRE_Int filter_functions ) +{ + hypre_ParAMGData *amg_data = (hypre_ParAMGData*) data; + + if (!amg_data) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + if (filter_functions < 0 || filter_functions > 1) + { + hypre_error_in_arg(2); + return hypre_error_flag; + } + hypre_ParAMGDataFilterFunctions(amg_data) = filter_functions; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_BoomerAMGGetFilterFunctions + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_BoomerAMGGetFilterFunctions( void *data, + HYPRE_Int *filter_functions ) +{ + hypre_ParAMGData *amg_data = (hypre_ParAMGData*) data; + + if (!amg_data) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + *filter_functions = hypre_ParAMGDataFilterFunctions(amg_data); + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- * Indicate whether to use nodal systems function *--------------------------------------------------------------------------*/ diff --git a/3rd_party/hypre/src/parcsr_ls/par_amg.h b/3rd_party/hypre/src/parcsr_ls/par_amg.h index 879aecded..715cf32cd 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_amg.h +++ b/3rd_party/hypre/src/parcsr_ls/par_amg.h @@ -92,6 +92,7 @@ typedef struct hypre_ParCSRMatrix *A; HYPRE_Int num_variables; HYPRE_Int num_functions; + HYPRE_Int filter_functions; HYPRE_Int nodal; HYPRE_Int nodal_levels; HYPRE_Int nodal_diag; @@ -362,8 +363,9 @@ typedef struct #define hypre_ParAMGDataOuterWt(amg_data) ((amg_data)->outer_wt) /* problem data parameters */ -#define hypre_ParAMGDataNumVariables(amg_data) ((amg_data)->num_variables) +#define hypre_ParAMGDataNumVariables(amg_data) ((amg_data)->num_variables) #define hypre_ParAMGDataNumFunctions(amg_data) ((amg_data)->num_functions) +#define hypre_ParAMGDataFilterFunctions(amg_data) ((amg_data)->filter_functions) #define hypre_ParAMGDataNodal(amg_data) ((amg_data)->nodal) #define hypre_ParAMGDataNodalLevels(amg_data) 
((amg_data)->nodal_levels) #define hypre_ParAMGDataNodalDiag(amg_data) ((amg_data)->nodal_diag) diff --git a/3rd_party/hypre/src/parcsr_ls/par_amg_setup.c b/3rd_party/hypre/src/parcsr_ls/par_amg_setup.c index 4b8525abb..279ae300e 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_amg_setup.c +++ b/3rd_party/hypre/src/parcsr_ls/par_amg_setup.c @@ -31,8 +31,9 @@ hypre_BoomerAMGSetup( void *amg_vdata, hypre_ParVector *f, hypre_ParVector *u ) { - MPI_Comm comm = hypre_ParCSRMatrixComm(A); - hypre_ParAMGData *amg_data = (hypre_ParAMGData*) amg_vdata; + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParAMGData *amg_data = (hypre_ParAMGData*) amg_vdata; + hypre_ParCSRMatrix *A_tilde = A; /* Data Structure variables */ HYPRE_Int num_vectors; @@ -89,6 +90,9 @@ hypre_BoomerAMGSetup( void *amg_vdata, HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); hypre_ParAMGDataMemoryLocation(amg_data) = memory_location; +#if defined(HYPRE_USING_GPU) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(memory_location); +#endif /* Local variables */ HYPRE_Int *CF_marker; @@ -121,7 +125,6 @@ hypre_BoomerAMGSetup( void *amg_vdata, HYPRE_Int setup_type; HYPRE_BigInt fine_size; HYPRE_Int offset; - HYPRE_Real size; HYPRE_Int not_finished_coarsening = 1; HYPRE_Int coarse_threshold = hypre_ParAMGDataMaxCoarseSize(amg_data); HYPRE_Int min_coarse_size = hypre_ParAMGDataMinCoarseSize(amg_data); @@ -133,6 +136,7 @@ hypre_BoomerAMGSetup( void *amg_vdata, #endif HYPRE_Int *grid_relax_type = hypre_ParAMGDataGridRelaxType(amg_data); HYPRE_Int num_functions = hypre_ParAMGDataNumFunctions(amg_data); + HYPRE_Int filter_functions = hypre_ParAMGDataFilterFunctions(amg_data); HYPRE_Int nodal = hypre_ParAMGDataNodal(amg_data); HYPRE_Int nodal_levels = hypre_ParAMGDataNodalLevels(amg_data); HYPRE_Int nodal_diag = hypre_ParAMGDataNodalDiag(amg_data); @@ -235,6 +239,9 @@ hypre_BoomerAMGSetup( void *amg_vdata, HYPRE_Real cum_nnz_AP = hypre_ParAMGDataCumNnzAP(amg_data); + HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("AMGsetup"); + hypre_MemoryPrintUsage(comm, hypre_HandleLogLevel(hypre_handle()), "BoomerAMG setup begin", 0); hypre_MPI_Comm_size(comm, &num_procs); hypre_MPI_Comm_rank(comm, &my_id); @@ -338,8 +345,6 @@ hypre_BoomerAMGSetup( void *amg_vdata, num_vectors = 1; } - HYPRE_ANNOTATE_FUNC_BEGIN; - /* change in definition of standard and multipass interpolation, by eliminating interp_type 9 and 5 and setting sep_weight instead when using separation of weights option */ @@ -697,7 +702,6 @@ hypre_BoomerAMGSetup( void *amg_vdata, if (num_C_points_coarse > 0) { #if defined(HYPRE_USING_GPU) - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(memory_location); if (exec == HYPRE_EXEC_DEVICE) { #if defined(HYPRE_USING_SYCL) @@ -753,7 +757,6 @@ hypre_BoomerAMGSetup( void *amg_vdata, offset = (HYPRE_Int) ( first_local_row % ((HYPRE_BigInt) num_functions) ); #if defined(HYPRE_USING_GPU) - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(memory_location); if (exec == HYPRE_EXEC_DEVICE) { hypre_BoomerAMGInitDofFuncDevice(hypre_IntArrayData(dof_func), local_size, offset, num_functions); @@ -768,7 +771,13 @@ hypre_BoomerAMGSetup( void *amg_vdata, } } - A_array[0] = A; + /* Eliminate inter-variable connections among functions for preconditioning purposes */ + if (num_functions > 1 && filter_functions) + { + hypre_ParCSRMatrixBlkFilter(A, num_functions, &A_tilde); + } + + A_array[0] = A_tilde; /* interp vectors setup */ if (interp_vec_variant == 1) @@ -776,7 +785,7 @@ hypre_BoomerAMGSetup( void *amg_vdata, 
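The par_amg_setup.c hunk above builds a filtered operator A_tilde via hypre_ParCSRMatrixBlkFilter when num_functions > 1 and the new filter_functions switch is on, uses it as A_array[0] during setup, and destroys it again afterwards. A rough sequential sketch of the filtering idea follows, assuming interleaved unknowns so that row i belongs to function i % num_functions; the actual hypre routine operates on hypre_ParCSRMatrix objects and its exact dropping rule may differ.

#include <stdlib.h>

/* Illustrative sketch: keep only CSR entries that couple unknowns of the
   same physical variable (interleaved ordering assumed).                 */
static void filter_same_function(int num_rows, int num_functions,
                                 const int *row_ptr, const int *col_ind,
                                 const double *values,
                                 int **f_row_ptr, int **f_col_ind, double **f_values)
{
   int     nnz = row_ptr[num_rows];
   int    *rp  = malloc((num_rows + 1) * sizeof(int));
   int    *ci  = malloc(nnz * sizeof(int));
   double *va  = malloc(nnz * sizeof(double));
   int     cnt = 0;

   rp[0] = 0;
   for (int i = 0; i < num_rows; i++)
   {
      for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++)
      {
         /* drop inter-variable couplings */
         if ((col_ind[j] % num_functions) == (i % num_functions))
         {
            ci[cnt] = col_ind[j];
            va[cnt] = values[j];
            cnt++;
         }
      }
      rp[i + 1] = cnt;
   }

   *f_row_ptr = rp;
   *f_col_ind = ci;
   *f_values  = va;   /* caller frees; arrays are over-allocated to nnz */
}

In the hunk above the filtered matrix only drives construction of the hierarchy; the original A is put back as the level-0 operator before the solve.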
num_levels_interp_vectors = interp_vec_first_level + 1; hypre_ParAMGNumLevelsInterpVectors(amg_data) = num_levels_interp_vectors; } - if ( interp_vec_variant > 0 && num_interp_vectors > 0) + if (interp_vec_variant > 0 && num_interp_vectors > 0) { interp_vectors_array = hypre_CTAlloc(hypre_ParVector**, num_levels_interp_vectors, HYPRE_MEMORY_HOST); @@ -3127,10 +3136,15 @@ hypre_BoomerAMGSetup( void *amg_vdata, A_array[level] = A_H; } - size = ((HYPRE_Real) fine_size ) * .75; - if (coarsen_type > 0 && coarse_size >= (HYPRE_BigInt) size) +#if defined(HYPRE_USING_GPU) + if (exec == HYPRE_EXEC_HOST) +#endif { - coarsen_type = 0; + HYPRE_Real size = ((HYPRE_Real)fine_size) * .75; + if (coarsen_type > 0 && coarse_size >= (HYPRE_BigInt)size) + { + coarsen_type = 0; + } } { @@ -3239,6 +3253,7 @@ hypre_BoomerAMGSetup( void *amg_vdata, * Setup of special smoothers when needed *-----------------------------------------------------------------------*/ + hypre_GpuProfilingPushRange("Relaxation"); if (addlvl > -1 || grid_relax_type[1] == 7 || grid_relax_type[2] == 7 || grid_relax_type[3] == 7 || grid_relax_type[1] == 8 || grid_relax_type[2] == 8 || grid_relax_type[3] == 8 || @@ -3549,8 +3564,6 @@ hypre_BoomerAMGSetup( void *amg_vdata, (HYPRE_ParCSRMatrix) A_array[j], (HYPRE_ParVector) F_array[j], (HYPRE_ParVector) U_array[j]); - - } if (relax_weight[j] == 0.0) @@ -3783,6 +3796,7 @@ hypre_BoomerAMGSetup( void *amg_vdata, hypre_GpuProfilingPopRange(); hypre_GpuProfilingPopRange(); } /* end of levels loop */ + hypre_GpuProfilingPopRange(); /* Relaxation */ if (amg_logging > 1) { @@ -3828,8 +3842,14 @@ hypre_BoomerAMGSetup( void *amg_vdata, hypre_BoomerAMGSetupStats(amg_data, A); } - /* print out CF info to plot grids in matlab (see 'tools/AMGgrids.m') */ + /* Destroy filtered matrix */ + if (A_tilde != A) + { + hypre_ParCSRMatrixDestroy(A_tilde); + A_array[0] = A; + } + /* Print out CF info to plot grids in matlab (see 'tools/AMGgrids.m') */ if (hypre_ParAMGDataPlotGrids(amg_data)) { HYPRE_Int *CF, *CFc, *itemp; @@ -4018,6 +4038,8 @@ hypre_BoomerAMGSetup( void *amg_vdata, } #endif + hypre_MemoryPrintUsage(comm, hypre_HandleLogLevel(hypre_handle()), "BoomerAMG setup end", 0); + hypre_GpuProfilingPopRange(); HYPRE_ANNOTATE_FUNC_END; return (hypre_error_flag); diff --git a/3rd_party/hypre/src/parcsr_ls/par_cg_relax_wt.c b/3rd_party/hypre/src/parcsr_ls/par_cg_relax_wt.c index 91a621996..07e7dece6 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_cg_relax_wt.c +++ b/3rd_party/hypre/src/parcsr_ls/par_cg_relax_wt.c @@ -27,7 +27,7 @@ hypre_BoomerAMGCGRelaxWt( void *amg_vdata, hypre_ParAMGData *amg_data = (hypre_ParAMGData*) amg_vdata; MPI_Comm comm; - HYPRE_Solver *smoother; + HYPRE_Solver *smoother = NULL; /* Data Structure variables */ /* hypre_ParCSRMatrix **A_array = hypre_ParAMGDataAArray(amg_data); */ diff --git a/3rd_party/hypre/src/parcsr_ls/par_coarsen.c b/3rd_party/hypre/src/parcsr_ls/par_coarsen.c index d2cafbb8d..1f1314326 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_coarsen.c +++ b/3rd_party/hypre/src/parcsr_ls/par_coarsen.c @@ -1169,7 +1169,7 @@ hypre_BoomerAMGCoarsenRuge( hypre_ParCSRMatrix *S, if (*CF_marker_ptr == NULL) { *CF_marker_ptr = hypre_IntArrayCreate(num_variables); - hypre_IntArrayInitialize(*CF_marker_ptr); + hypre_IntArrayInitialize_v2(*CF_marker_ptr, HYPRE_MEMORY_HOST); } CF_marker = hypre_IntArrayData(*CF_marker_ptr); @@ -2355,15 +2355,15 @@ hypre_BoomerAMGCoarsenPMISHost( hypre_ParCSRMatrix *S, cnt = 0; for (i = 0; i < num_variables; i++) { - if ( CF_marker[i] != SF_PT ) + if 
(CF_marker[i] != SF_PT) { - if ( S_offd_i[i + 1] - S_offd_i[i] > 0 || CF_marker[i] == -1 ) + if (S_offd_i[i + 1] - S_offd_i[i] > 0 || CF_marker[i] == -1) { CF_marker[i] = 0; } - if ( CF_marker[i] == Z_PT) + if (CF_marker[i] == Z_PT) { - if ( measure_array[i] >= 1.0 || S_diag_i[i + 1] - S_diag_i[i] > 0 ) + if (measure_array[i] >= 1.0 || S_diag_i[i + 1] - S_diag_i[i] > 0) { CF_marker[i] = 0; graph_array[cnt++] = i; @@ -2821,26 +2821,30 @@ hypre_BoomerAMGCoarsenPMIS( hypre_ParCSRMatrix *S, { hypre_GpuProfilingPushRange("PMIS"); - HYPRE_Int ierr = 0; - #if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { - ierr = hypre_BoomerAMGCoarsenPMISDevice( S, A, CF_init, debug_flag, CF_marker_ptr ); + hypre_BoomerAMGCoarsenPMISDevice(S, A, CF_init, debug_flag, CF_marker_ptr); } else #endif { - ierr = hypre_BoomerAMGCoarsenPMISHost( S, A, CF_init, debug_flag, CF_marker_ptr ); + hypre_BoomerAMGCoarsenPMISHost(S, A, CF_init, debug_flag, CF_marker_ptr); } hypre_GpuProfilingPopRange(); - return ierr; + return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_BoomerAMGCoarsenHMIS + * + * Ruge coarsening followed by CLJP coarsening + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_BoomerAMGCoarsenHMIS( hypre_ParCSRMatrix *S, hypre_ParCSRMatrix *A, @@ -2849,16 +2853,62 @@ hypre_BoomerAMGCoarsenHMIS( hypre_ParCSRMatrix *S, HYPRE_Int debug_flag, hypre_IntArray **CF_marker_ptr) { - HYPRE_Int ierr = 0; + hypre_ParCSRMatrix *h_A, *h_S; + hypre_IntArray *CF_marker = NULL; + HYPRE_MemoryLocation CF_memory_location; - /*------------------------------------------------------- - * Perform Ruge coarsening followed by CLJP coarsening - *-------------------------------------------------------*/ + /* Clone matrices on the host if needed */ + h_S = hypre_GetActualMemLocation(hypre_ParCSRMatrixMemoryLocation(S)) == hypre_MEMORY_DEVICE ? + hypre_ParCSRMatrixClone_v2(S, 0, HYPRE_MEMORY_HOST) : S; - ierr += hypre_BoomerAMGCoarsenRuge (S, A, measure_type, 10, cut_factor, - debug_flag, CF_marker_ptr); + h_A = hypre_GetActualMemLocation(hypre_ParCSRMatrixMemoryLocation(A)) == hypre_MEMORY_DEVICE ? + hypre_ParCSRMatrixClone_v2(A, 0, HYPRE_MEMORY_HOST) : A; + + /* Clone/Create CF_marker on the host if needed */ + if (*CF_marker_ptr) + { + CF_memory_location = hypre_IntArrayMemoryLocation(*CF_marker_ptr); + CF_marker = hypre_GetActualMemLocation(CF_memory_location) == hypre_MEMORY_DEVICE ? 
+ hypre_IntArrayCloneDeep_v2(*CF_marker_ptr, HYPRE_MEMORY_HOST) : *CF_marker_ptr; + } + else + { + CF_memory_location = HYPRE_MEMORY_HOST; + CF_marker = hypre_IntArrayCreate(hypre_ParCSRMatrixNumRows(A)); + hypre_IntArrayInitialize_v2(CF_marker, CF_memory_location); + } - ierr += hypre_BoomerAMGCoarsenPMISHost (S, A, 1, debug_flag, CF_marker_ptr); + /* Perform Ruge coarsening on the host */ + hypre_BoomerAMGCoarsenRuge(h_S, h_A, measure_type, 10, cut_factor, debug_flag, &CF_marker); - return (ierr); + /* Free cloned matrices on the host */ + if (h_S != S) { hypre_ParCSRMatrixDestroy(h_S); } + if (h_A != A) { hypre_ParCSRMatrixDestroy(h_A); } + + /* Move CF_marker to device if needed */ +#if defined(HYPRE_USING_GPU) + if (hypre_GetExecPolicy1(hypre_ParCSRMatrixMemoryLocation(A)) == HYPRE_EXEC_DEVICE) + { + if (*CF_marker_ptr && (*CF_marker_ptr != CF_marker)) + { + hypre_IntArrayCopy(CF_marker, *CF_marker_ptr); + hypre_IntArrayDestroy(CF_marker); + CF_marker = NULL; + } + else if (*CF_marker_ptr == NULL) + { + *CF_marker_ptr = hypre_IntArrayCloneDeep_v2(CF_marker, HYPRE_MEMORY_DEVICE); + hypre_IntArrayDestroy(CF_marker); + } + } + else +#endif + { + *CF_marker_ptr = CF_marker; + } + + /* Perform PMIS coarsening on the host or device */ + hypre_BoomerAMGCoarsenPMIS(S, A, 1, debug_flag, CF_marker_ptr); + + return hypre_error_flag; } diff --git a/3rd_party/hypre/src/parcsr_ls/par_coarsen_device.c b/3rd_party/hypre/src/parcsr_ls/par_coarsen_device.c index 42253bcd2..43658b5b9 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_coarsen_device.c +++ b/3rd_party/hypre/src/parcsr_ls/par_coarsen_device.c @@ -312,8 +312,32 @@ hypreGPUKernel_PMISCoarseningInit(hypre_DeviceItem &item, if (CF_init == 1) { - // TODO - hypre_device_assert(0); + CF_marker_i = read_only_load(&CF_marker_diag[i]); + + if (CF_marker_i != SF_PT) + { + if (read_only_load(&S_offd_i[i + 1]) - read_only_load(&S_offd_i[i]) > 0 || + CF_marker_i == F_PT) + { + CF_marker_i = 0; + } + if (CF_marker_i == Z_PT) + { + if (measure_diag[i] > 1.0 || + read_only_load(&S_diag_i[i + 1]) - read_only_load(&S_diag_i[i])) + { + CF_marker_i = 0; + } + else + { + CF_marker_i = SF_PT; + } + } + } + else + { + measure_diag[i] = 0.0; + } } else { @@ -626,4 +650,3 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in } #endif // #if defined(HYPRE_USING_GPU) - diff --git a/3rd_party/hypre/src/parcsr_ls/par_fsai_setup.c b/3rd_party/hypre/src/parcsr_ls/par_fsai_setup.c index 9794de592..f58df3e77 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_fsai_setup.c +++ b/3rd_party/hypre/src/parcsr_ls/par_fsai_setup.c @@ -422,8 +422,6 @@ hypre_FSAISetupNative( void *fsai_vdata, HYPRE_Int *A_i = hypre_CSRMatrixI(A_diag); HYPRE_Complex *A_a = hypre_CSRMatrixData(A_diag); HYPRE_Int num_rows_diag_A = hypre_CSRMatrixNumRows(A_diag); - HYPRE_Int num_nnzs_diag_A = hypre_CSRMatrixNumNonzeros(A_diag); - HYPRE_Int avg_nnzrow_diag_A; /* Matrix G variables */ hypre_ParCSRMatrix *G = hypre_ParFSAIDataGmat(fsai_data); @@ -432,16 +430,13 @@ hypre_FSAISetupNative( void *fsai_vdata, HYPRE_Int *G_j; HYPRE_Complex *G_a; HYPRE_Int max_nnzrow_diag_G; /* Max. number of nonzeros per row in G_diag */ - HYPRE_Int max_cand_size; /* Max size of kg_pos */ /* Local variables */ - char msg[512]; /* Warning message */ - HYPRE_Int *twspace; /* shared work space for omp threads */ + char msg[512]; /* Warning message */ + HYPRE_Int *twspace; /* shared work space for omp threads */ /* Initalize some variables */ - avg_nnzrow_diag_A = (num_rows_diag_A > 0) ? 
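The par_coarsen_device.c hunk above replaces the old TODO/assert in hypreGPUKernel_PMISCoarseningInit with a real implementation of the CF_init == 1 branch. The host-style paraphrase below shows the same marker logic as one sequential loop; it is not part of the patch, and the EX_* constants are stand-ins for hypre's F_PT/Z_PT/SF_PT codes whose values here are only illustrative.

/* Illustrative sketch: CF marker initialization for PMIS when CF_init == 1. */
enum { EX_F_PT = -1, EX_Z_PT = -2, EX_SF_PT = -3 };

static void pmis_init_markers(int num_rows,
                              const int *S_diag_i,   /* strength graph, local part  */
                              const int *S_offd_i,   /* strength graph, remote part */
                              double    *measure,
                              int       *CF_marker)
{
   for (int i = 0; i < num_rows; i++)
   {
      if (CF_marker[i] != EX_SF_PT)
      {
         /* rows with remote couplings, or rows already flagged F, go back
            into the undecided set                                          */
         if (S_offd_i[i + 1] - S_offd_i[i] > 0 || CF_marker[i] == EX_F_PT)
         {
            CF_marker[i] = 0;
         }
         if (CF_marker[i] == EX_Z_PT)
         {
            if (measure[i] > 1.0 || S_diag_i[i + 1] - S_diag_i[i] > 0)
            {
               CF_marker[i] = 0;
            }
            else
            {
               /* isolated row with no strong connections: keep it special-fine */
               CF_marker[i] = EX_SF_PT;
            }
         }
      }
      else
      {
         measure[i] = 0.0;
      }
   }
}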
num_nnzs_diag_A / num_rows_diag_A : 0; max_nnzrow_diag_G = max_steps * max_step_size + 1; - max_cand_size = avg_nnzrow_diag_A * max_nnzrow_diag_G; G_diag = hypre_ParCSRMatrixDiag(G); G_a = hypre_CSRMatrixData(G_diag); @@ -490,10 +485,10 @@ hypre_FSAISetupNative( void *fsai_vdata, /* Allocate and initialize local vector variables */ G_temp = hypre_SeqVectorCreate(max_nnzrow_diag_G); A_subrow = hypre_SeqVectorCreate(max_nnzrow_diag_G); - kap_grad = hypre_SeqVectorCreate(max_cand_size); + kap_grad = hypre_SeqVectorCreate(num_rows_diag_A); A_sub = hypre_SeqVectorCreate(max_nnzrow_diag_G * max_nnzrow_diag_G); pattern = hypre_CTAlloc(HYPRE_Int, max_nnzrow_diag_G, HYPRE_MEMORY_HOST); - kg_pos = hypre_CTAlloc(HYPRE_Int, max_cand_size, HYPRE_MEMORY_HOST); + kg_pos = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); kg_marker = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); marker = hypre_TAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); @@ -706,8 +701,6 @@ hypre_FSAISetupOMPDyn( void *fsai_vdata, HYPRE_Int *A_i = hypre_CSRMatrixI(A_diag); HYPRE_Complex *A_a = hypre_CSRMatrixData(A_diag); HYPRE_Int num_rows_diag_A = hypre_CSRMatrixNumRows(A_diag); - HYPRE_Int num_nnzs_diag_A = hypre_CSRMatrixNumNonzeros(A_diag); - HYPRE_Int avg_nnzrow_diag_A; /* Matrix G variables */ hypre_ParCSRMatrix *G = hypre_ParFSAIDataGmat(fsai_data); @@ -717,17 +710,14 @@ hypre_FSAISetupOMPDyn( void *fsai_vdata, HYPRE_Complex *G_a; HYPRE_Int *G_nnzcnt; /* Array holding number of nonzeros of row G[i,:] */ HYPRE_Int max_nnzrow_diag_G; /* Max. number of nonzeros per row in G_diag */ - HYPRE_Int max_cand_size; /* Max size of kg_pos */ /* Local variables */ - HYPRE_Int i, j, jj; - char msg[512]; /* Warning message */ - HYPRE_Complex *twspace; /* shared work space for omp threads */ + HYPRE_Int i, j, jj; + char msg[512]; /* Warning message */ + HYPRE_Complex *twspace; /* shared work space for omp threads */ /* Initalize some variables */ - avg_nnzrow_diag_A = num_nnzs_diag_A / num_rows_diag_A; max_nnzrow_diag_G = max_steps * max_step_size + 1; - max_cand_size = avg_nnzrow_diag_A * max_nnzrow_diag_G; G_diag = hypre_ParCSRMatrixDiag(G); G_a = hypre_CSRMatrixData(G_diag); @@ -765,14 +755,13 @@ hypre_FSAISetupOMPDyn( void *fsai_vdata, HYPRE_Complex *G_temp_data; HYPRE_Complex *A_subrow_data; - /* Allocate and initialize local vector variables */ G_temp = hypre_SeqVectorCreate(max_nnzrow_diag_G); A_subrow = hypre_SeqVectorCreate(max_nnzrow_diag_G); - kap_grad = hypre_SeqVectorCreate(max_cand_size); + kap_grad = hypre_SeqVectorCreate(num_rows_diag_A); A_sub = hypre_SeqVectorCreate(max_nnzrow_diag_G * max_nnzrow_diag_G); pattern = hypre_CTAlloc(HYPRE_Int, max_nnzrow_diag_G, HYPRE_MEMORY_HOST); - kg_pos = hypre_CTAlloc(HYPRE_Int, max_cand_size, HYPRE_MEMORY_HOST); + kg_pos = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); kg_marker = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); marker = hypre_TAlloc(HYPRE_Int, num_rows_diag_A, HYPRE_MEMORY_HOST); diff --git a/3rd_party/hypre/src/parcsr_ls/par_gauss_elim.c b/3rd_party/hypre/src/parcsr_ls/par_gauss_elim.c index 3eb87f3f6..c3a9ec6e4 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_gauss_elim.c +++ b/3rd_party/hypre/src/parcsr_ls/par_gauss_elim.c @@ -96,7 +96,13 @@ hypre_GaussElimSetup(hypre_ParAMGData *amg_data, } else { + /* Fallback to host execution when dependency libraries are not met (cuSOLVER/MAGMA) */ +#if defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_CUSOLVER) && !defined(HYPRE_USING_MAGMA) ||\ + 
(defined(HYPRE_USING_HIP) && !defined(HYPRE_USING_MAGMA)) + ge_memory_location = HYPRE_MEMORY_HOST; +#else ge_memory_location = memory_location; +#endif } hypre_ParAMGDataGEMemoryLocation(amg_data) = ge_memory_location; diff --git a/3rd_party/hypre/src/parcsr_ls/par_gsmg.c b/3rd_party/hypre/src/parcsr_ls/par_gsmg.c index da9e3af3c..1c40dc143 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_gsmg.c +++ b/3rd_party/hypre/src/parcsr_ls/par_gsmg.c @@ -78,7 +78,7 @@ hypre_ParCSRMatrixFillSmooth(HYPRE_Int nsamples, HYPRE_Real *samples, HYPRE_Int i, j, k, ii, index, start; HYPRE_Int num_cols_offd; HYPRE_Int num_sends; - HYPRE_Int *dof_func_offd; + HYPRE_Int *dof_func_offd = NULL; HYPRE_Int *int_buf_data; HYPRE_Real temp; HYPRE_Real *p; diff --git a/3rd_party/hypre/src/parcsr_ls/par_ilu.c b/3rd_party/hypre/src/parcsr_ls/par_ilu.c index b87aac095..52ec7ad8d 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_ilu.c +++ b/3rd_party/hypre/src/parcsr_ls/par_ilu.c @@ -4432,7 +4432,7 @@ hypre_ILUParCSRInverseNSH(hypre_ParCSRMatrix *A, hypre_CSRMatrix *M_offd; HYPRE_Int *M_offd_i; - HYPRE_Real time_s, time_e; + HYPRE_Real time_s = 0.0, time_e; HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag); HYPRE_Int i; diff --git a/3rd_party/hypre/src/parcsr_ls/par_ilu_setup.c b/3rd_party/hypre/src/parcsr_ls/par_ilu_setup.c index c8d23c271..b207b8541 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_ilu_setup.c +++ b/3rd_party/hypre/src/parcsr_ls/par_ilu_setup.c @@ -330,7 +330,6 @@ hypre_ILUSetup( void *ilu_vdata, else #endif { - hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); } @@ -1465,8 +1464,8 @@ hypre_ParILUExtractEBFC(hypre_CSRMatrix *A_diag, } for (; j < A_diag_i[i + 1]; j++) { - col = A_diag_j[j]; - col = col - nLU; + col = A_diag_j[j] - nLU; + hypre_assert(col >= 0); F_j[ctrF] = col; F_data[ctrF++] = A_diag_data[j]; if (ctrF >= capacity_F) @@ -1513,8 +1512,8 @@ hypre_ParILUExtractEBFC(hypre_CSRMatrix *A_diag, } for (; j < A_diag_i[i + 1]; j++) { - col = A_diag_j[j]; - col = col - nLU; + col = A_diag_j[j] - nLU; + hypre_assert(col >= 0); C_j[ctrC] = col; C_data[ctrC++] = A_diag_data[j]; if (ctrC >= capacity_C) @@ -1733,40 +1732,29 @@ hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix *L, hypre_ParCSRMatrix *U, hypre_ParCSRMatrix **LDUp) { - /* data slots */ - HYPRE_Int i, j, pos; - - hypre_CSRMatrix *L_diag = hypre_ParCSRMatrixDiag(L); - hypre_CSRMatrix *U_diag = hypre_ParCSRMatrixDiag(U); - HYPRE_Int *L_diag_i = hypre_CSRMatrixI(L_diag); - HYPRE_Int *L_diag_j = hypre_CSRMatrixJ(L_diag); - HYPRE_Real *L_diag_data = hypre_CSRMatrixData(L_diag); - HYPRE_Int *U_diag_i = hypre_CSRMatrixI(U_diag); - HYPRE_Int *U_diag_j = hypre_CSRMatrixJ(U_diag); - HYPRE_Real *U_diag_data = hypre_CSRMatrixData(U_diag); - HYPRE_Int n = hypre_ParCSRMatrixNumRows(L); - HYPRE_Int nnz_L = L_diag_i[n]; - HYPRE_Int nnz_U = U_diag_i[n]; - HYPRE_Int nnz_LDU = n + nnz_L + nnz_U; + MPI_Comm comm = hypre_ParCSRMatrixComm(L); + hypre_CSRMatrix *L_diag = hypre_ParCSRMatrixDiag(L); + hypre_CSRMatrix *U_diag = hypre_ParCSRMatrixDiag(U); + HYPRE_Int *L_diag_i = hypre_CSRMatrixI(L_diag); + HYPRE_Int *L_diag_j = hypre_CSRMatrixJ(L_diag); + HYPRE_Real *L_diag_a = hypre_CSRMatrixData(L_diag); + HYPRE_Int *U_diag_i = hypre_CSRMatrixI(U_diag); + HYPRE_Int *U_diag_j = hypre_CSRMatrixJ(U_diag); + HYPRE_Real *U_diag_a = hypre_CSRMatrixData(U_diag); + HYPRE_Int n = hypre_ParCSRMatrixNumRows(L); + HYPRE_Int nnz_L = L_diag_i[n]; + HYPRE_Int nnz_U = U_diag_i[n]; + HYPRE_Int nnz_LDU = n + nnz_L + nnz_U; hypre_ParCSRMatrix 
*LDU; hypre_CSRMatrix *LDU_diag; HYPRE_Int *LDU_diag_i; HYPRE_Int *LDU_diag_j; - HYPRE_Real *LDU_diag_data; - - /* MPI */ - MPI_Comm comm = hypre_ParCSRMatrixComm(L); - HYPRE_Int num_procs, my_id; - - hypre_MPI_Comm_size(comm, &num_procs); - hypre_MPI_Comm_rank(comm, &my_id); + HYPRE_Real *LDU_diag_a; + HYPRE_Int i, j, pos; - /* cuda data slot */ - - /* create matrix */ - + /* Create matrix */ LDU = hypre_ParCSRMatrixCreate(comm, hypre_ParCSRMatrixGlobalNumRows(L), hypre_ParCSRMatrixGlobalNumRows(L), @@ -1775,41 +1763,37 @@ hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix *L, 0, nnz_LDU, 0); - + hypre_ParCSRMatrixInitialize_v2(LDU, HYPRE_MEMORY_HOST); LDU_diag = hypre_ParCSRMatrixDiag(LDU); - LDU_diag_i = hypre_TAlloc(HYPRE_Int, n + 1, HYPRE_MEMORY_DEVICE); - LDU_diag_j = hypre_TAlloc(HYPRE_Int, nnz_LDU, HYPRE_MEMORY_DEVICE); - LDU_diag_data = hypre_TAlloc(HYPRE_Real, nnz_LDU, HYPRE_MEMORY_DEVICE); + LDU_diag_i = hypre_CSRMatrixI(LDU_diag); + LDU_diag_j = hypre_CSRMatrixJ(LDU_diag); + LDU_diag_a = hypre_CSRMatrixData(LDU_diag); pos = 0; - - for (i = 1; i <= n; i++) + for (i = 0; i < n; i++) { - LDU_diag_i[i - 1] = pos; - for (j = L_diag_i[i - 1]; j < L_diag_i[i]; j++) + LDU_diag_i[i] = pos; + for (j = L_diag_i[i]; j < L_diag_i[i + 1]; j++) { - LDU_diag_j[pos] = L_diag_j[j]; - LDU_diag_data[pos++] = L_diag_data[j]; + LDU_diag_j[pos] = L_diag_j[j]; + LDU_diag_a[pos++] = L_diag_a[j]; } - LDU_diag_j[pos] = i - 1; - LDU_diag_data[pos++] = 1.0 / D[i - 1]; - for (j = U_diag_i[i - 1]; j < U_diag_i[i]; j++) + LDU_diag_j[pos] = i; + LDU_diag_a[pos++] = 1.0 / D[i]; + for (j = U_diag_i[i]; j < U_diag_i[i + 1]; j++) { - LDU_diag_j[pos] = U_diag_j[j]; - LDU_diag_data[pos++] = U_diag_data[j]; + LDU_diag_j[pos] = U_diag_j[j]; + LDU_diag_a[pos++] = U_diag_a[j]; } } LDU_diag_i[n] = pos; - hypre_CSRMatrixI(LDU_diag) = LDU_diag_i; - hypre_CSRMatrixJ(LDU_diag) = LDU_diag_j; - hypre_CSRMatrixData(LDU_diag) = LDU_diag_data; + /* Migrate to device (abstract memory space) */ + hypre_ParCSRMatrixMigrate(LDU, HYPRE_MEMORY_DEVICE); - /* now sort */ #if defined(HYPRE_USING_GPU) - hypre_CSRMatrixSortRow(LDU_diag); + hypre_CSRMatrixSortRow(hypre_ParCSRMatrixDiag(LDU)); #endif - hypre_ParCSRMatrixDiag(LDU) = LDU_diag; *LDUp = LDU; @@ -1848,7 +1832,7 @@ hypre_ILUSetupRAPMILU0(hypre_ParCSRMatrix *A, /* Free memory */ hypre_ParCSRMatrixDestroy(L); - hypre_TFree(D, HYPRE_MEMORY_DEVICE); + hypre_TFree(D, hypre_ParCSRMatrixMemoryLocation(A)); hypre_ParCSRMatrixDestroy(U); *ALUp = ALU; @@ -3975,9 +3959,9 @@ hypre_ILUSetupILUT(hypre_ParCSRMatrix *A, HYPRE_Int *S_diag_j = NULL; HYPRE_Int *S_offd_i = NULL; HYPRE_Int *S_offd_j = NULL; - HYPRE_BigInt *S_offd_colmap = NULL; + HYPRE_BigInt *S_offd_colmap = NULL; HYPRE_Real *S_offd_data; - HYPRE_BigInt *send_buf = NULL; + HYPRE_BigInt *send_buf = NULL; HYPRE_Int *u_end_array; /* reverse permutation */ @@ -4669,6 +4653,8 @@ hypre_ILUSetupILUT(hypre_ParCSRMatrix *A, 0, L_diag_i[n], 0 ); + hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixDiag(matL)) = memory_location; + hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixOffd(matL)) = memory_location; L_diag = hypre_ParCSRMatrixDiag(matL); hypre_CSRMatrixI(L_diag) = L_diag_i; @@ -4696,6 +4682,8 @@ hypre_ILUSetupILUT(hypre_ParCSRMatrix *A, 0, U_diag_i[n], 0 ); + hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixDiag(matU)) = memory_location; + hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixOffd(matU)) = memory_location; U_diag = hypre_ParCSRMatrixDiag(matU); hypre_CSRMatrixI(U_diag) = U_diag_i; diff --git 
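The rewritten hypre_ILUSetupLDUtoCusparse above now assembles the combined matrix on the host with a 0-based loop, row i being the strict lower part of L, then one diagonal entry holding 1/D[i], then the strict upper part of U, and only then migrates the result to the device. A self-contained sketch of that merge on raw CSR arrays, with names local to the example:

#include <stdlib.h>

/* Illustrative sketch: merge a unit lower factor L, a diagonal D and an upper
   factor U into one CSR matrix; L and U hold only their strict triangles.    */
static void merge_ldu(int n,
                      const int *L_i, const int *L_j, const double *L_a,
                      const double *D,
                      const int *U_i, const int *U_j, const double *U_a,
                      int **LDU_i_out, int **LDU_j_out, double **LDU_a_out)
{
   int     nnz   = n + L_i[n] + U_i[n];    /* one diagonal entry per row */
   int    *LDU_i = malloc((n + 1) * sizeof(int));
   int    *LDU_j = malloc(nnz * sizeof(int));
   double *LDU_a = malloc(nnz * sizeof(double));
   int     pos   = 0;

   for (int i = 0; i < n; i++)
   {
      LDU_i[i] = pos;
      for (int j = L_i[i]; j < L_i[i + 1]; j++)   /* strict lower part  */
      {
         LDU_j[pos]   = L_j[j];
         LDU_a[pos++] = L_a[j];
      }
      LDU_j[pos]   = i;                            /* diagonal holds 1/D */
      LDU_a[pos++] = 1.0 / D[i];
      for (int j = U_i[i]; j < U_i[i + 1]; j++)   /* strict upper part  */
      {
         LDU_j[pos]   = U_j[j];
         LDU_a[pos++] = U_a[j];
      }
   }
   LDU_i[n] = pos;

   *LDU_i_out = LDU_i;
   *LDU_j_out = LDU_j;
   *LDU_a_out = LDU_a;
}

Assembling on the host and calling hypre_ParCSRMatrixMigrate afterwards lets the same code path serve CPU and GPU builds, which is also why the explicit HYPRE_MEMORY_DEVICE allocations were dropped.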
a/3rd_party/hypre/src/parcsr_ls/par_ilu_setup_device.c b/3rd_party/hypre/src/parcsr_ls/par_ilu_setup_device.c index 7a55e171e..f626b67ca 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_ilu_setup_device.c +++ b/3rd_party/hypre/src/parcsr_ls/par_ilu_setup_device.c @@ -114,24 +114,12 @@ hypre_ILUSetupDevice(hypre_ParILUData *ilu_data, "ILUK setup on device runs requires unified memory!"); return hypre_error_flag; } - else if (ilu_type == 1) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, - "ILUT setup on device runs requires unified memory!"); - return hypre_error_flag; - } else if (ilu_type == 10 && fill_level) { hypre_error_w_msg(HYPRE_ERROR_GENERIC, "GMRES+ILUK setup on device runs requires unified memory!"); return hypre_error_flag; } - else if (ilu_type == 11) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, - "GMRES+ILUT setup on device runs requires unified memory!"); - return hypre_error_flag; - } #endif /* Build the inverse permutation arrays */ @@ -210,6 +198,7 @@ hypre_ILUSetupDevice(hypre_ParILUData *ilu_data, } else if ((ilu_type % 10) == 1) { + hypre_ParCSRMatrixMigrate(Apq, HYPRE_MEMORY_HOST); hypre_ILUSetupILUT(Apq, max_row_nnz, droptol, NULL, NULL, n, n, &parL, &parD, &parU, &parS, &uend); } @@ -219,14 +208,19 @@ hypre_ILUSetupDevice(hypre_ParILUData *ilu_data, hypre_ParCSRMatrixDestroy(parS); hypre_ILUSetupLDUtoCusparse(parL, parD, parU, &ALU); - + if ((ilu_type % 10) == 1) + { + hypre_TFree(parD, HYPRE_MEMORY_HOST); + } + else + { + hypre_TFree(parD, HYPRE_MEMORY_DEVICE); + } hypre_ParCSRMatrixDestroy(parL); hypre_ParCSRMatrixDestroy(parU); - hypre_TFree(parD, HYPRE_MEMORY_DEVICE); hypre_ParILUExtractEBFC(hypre_ParCSRMatrixDiag(ALU), nLU, BLUptr, &SLU, Eptr, Fptr); - hypre_ParCSRMatrixDestroy(ALU); } } diff --git a/3rd_party/hypre/src/parcsr_ls/par_ilu_solve.c b/3rd_party/hypre/src/parcsr_ls/par_ilu_solve.c index bf63ab342..45b113fce 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_ilu_solve.c +++ b/3rd_party/hypre/src/parcsr_ls/par_ilu_solve.c @@ -1380,7 +1380,7 @@ hypre_ILUSolveRAPGMRESHost(hypre_ParCSRMatrix *A, hypre_Vector *rhs_local; HYPRE_Real *rhs_data; hypre_Vector *x_local = NULL; - HYPRE_Real *x_data; + HYPRE_Real *x_data = NULL; /* xtemp might be null when we have no Schur complement */ if (xtemp) diff --git a/3rd_party/hypre/src/parcsr_ls/par_interp.c b/3rd_party/hypre/src/parcsr_ls/par_interp.c index 6decad964..09e2fc596 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_interp.c +++ b/3rd_party/hypre/src/parcsr_ls/par_interp.c @@ -123,7 +123,7 @@ hypre_BoomerAMGBuildInterp( hypre_ParCSRMatrix *A, HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ hypre_MPI_Comm_size(comm, &num_procs); hypre_MPI_Comm_rank(comm, &my_id); @@ -1090,7 +1090,7 @@ hypre_BoomerAMGBuildInterpHE( hypre_ParCSRMatrix *A, HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ hypre_MPI_Comm_size(comm, &num_procs); hypre_MPI_Comm_rank(comm, &my_id); @@ -2237,6 +2237,8 @@ hypre_BoomerAMGBuildDirInterpHost( hypre_ParCSRMatrix *A, { HYPRE_Int *P_marker, *P_marker_offd; + alfa = 1.0; + beta = 1.0; size = n_fine / num_threads; rest = n_fine - size * num_threads; if (jl < rest) @@ -2623,7 +2625,7 @@ hypre_BoomerAMGInterpTruncation( hypre_ParCSRMatrix 
*P, { if (trunc_factor <= 0.0 && max_elmts == 0) { - return 0; + return hypre_error_flag; } #if defined(HYPRE_USING_GPU) @@ -2759,7 +2761,7 @@ hypre_BoomerAMGBuildInterpModUnk( hypre_ParCSRMatrix *A, HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag); HYPRE_BigInt col_n = col_1 + local_numrows; - HYPRE_Real wall_time; /* for debugging instrumentation */ + HYPRE_Real wall_time = 0.0; /* for debugging instrumentation */ hypre_MPI_Comm_size(comm, &num_procs); hypre_MPI_Comm_rank(comm, &my_id); diff --git a/3rd_party/hypre/src/parcsr_ls/par_lr_interp.c b/3rd_party/hypre/src/parcsr_ls/par_lr_interp.c index 7ca6380a7..5311f81bb 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_lr_interp.c +++ b/3rd_party/hypre/src/parcsr_ls/par_lr_interp.c @@ -132,11 +132,10 @@ hypre_BoomerAMGBuildStdInterp(hypre_ParCSRMatrix *A, /* Definitions */ HYPRE_Real zero = 0.0; HYPRE_Real one = 1.0; - HYPRE_Real wall_time; - HYPRE_Real wall_1 = 0; - HYPRE_Real wall_2 = 0; - HYPRE_Real wall_3 = 0; - + HYPRE_Real wall_time = 0.0; + HYPRE_Real wall_1 = 0.0; + HYPRE_Real wall_2 = 0.0; + HYPRE_Real wall_3 = 0.0; hypre_ParCSRCommPkg *extend_comm_pkg = NULL; @@ -1246,6 +1245,8 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, */ /* initialize thread-wise variables */ + P_marker = NULL; + P_marker_offd = NULL; strong_f_marker = -2; coarse_counter = 0; jj_counter = start_indexing; @@ -4876,8 +4877,7 @@ hypre_BoomerAMGBuildExtInterpHost(hypre_ParCSRMatrix *A, /* Definitions */ HYPRE_Real zero = 0.0; HYPRE_Real one = 1.0; - HYPRE_Real wall_time; - + HYPRE_Real wall_time = 0.0; hypre_ParCSRCommPkg *extend_comm_pkg = NULL; diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr.c b/3rd_party/hypre/src/parcsr_ls/par_mgr.c index 85b446fd1..b5bdeb691 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr.c @@ -135,7 +135,8 @@ hypre_MGRCreate(void) (mgr_data -> Frelax_num_functions) = NULL; (mgr_data -> max_local_lvls) = 10; - (mgr_data -> mgr_coarse_grid_method) = NULL; + (mgr_data -> coarse_grid_method) = NULL; + (mgr_data -> nonglk_max_elmts) = NULL; (mgr_data -> print_coarse_system) = 0; @@ -415,16 +416,21 @@ hypre_MGRDestroy( void *data ) } hypre_TFree(mgr_data -> FrelaxVcycleData, HYPRE_MEMORY_HOST); } + /* data for reserved coarse nodes */ hypre_TFree(mgr_data -> reserved_coarse_indexes, HYPRE_MEMORY_HOST); + /* index array for setting Cpoints by global block */ if ((mgr_data -> set_c_points_method) == 1) { hypre_TFree(mgr_data -> idx_array, HYPRE_MEMORY_HOST); } - /* array for setting option to use non-Galerkin coarse grid */ - hypre_TFree(mgr_data -> mgr_coarse_grid_method, HYPRE_MEMORY_HOST); - /* coarse level matrix - RAP */ + + /* Coarse grid options */ + hypre_TFree(mgr_data -> coarse_grid_method, HYPRE_MEMORY_HOST); + hypre_TFree(mgr_data -> nonglk_max_elmts, HYPRE_MEMORY_HOST); + + /* coarsest level matrix - RAP */ if ((mgr_data -> RAP)) { hypre_ParCSRMatrixDestroy((mgr_data -> RAP)); @@ -444,6 +450,11 @@ hypre_MGRDestroy( void *data ) { HYPRE_ILUDestroy((mgr_data -> level_smoother)[i]); } + else if ((mgr_data -> level_smoother)[i]) + { + hypre_Solver *smoother_base = (hypre_Solver*) (mgr_data -> level_smoother)[i]; + hypre_SolverDestroy(smoother_base)((mgr_data -> level_smoother)[i]); + } } } hypre_TFree(mgr_data -> level_smoother, HYPRE_MEMORY_HOST); @@ -852,17 +863,6 @@ hypre_MGRSetReservedCoarseNodes(void *mgr_vdata, HYPRE_BigInt *reserved_coarse_indexes = NULL; HYPRE_Int i; - if (!mgr_data) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! 
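The hypre_MGRDestroy hunk above adds a generic teardown path for user-supplied level smoothers: the object is viewed as a hypre_Solver base and destroyed through the function pointer returned by hypre_SolverDestroy. The self-contained sketch below shows that base-object dispatch pattern with purely local names (solver_base, my_smoother); it is not hypre's actual definition of hypre_Solver.

#include <stdio.h>
#include <stdlib.h>

typedef int (*destroy_fn)(void *solver);

typedef struct { destroy_fn destroy; } solver_base;   /* must be the first member */

typedef struct
{
   solver_base base;
   double     *work;                                  /* concrete solver data */
} my_smoother;

static int my_smoother_destroy(void *solver)
{
   my_smoother *s = (my_smoother *) solver;
   free(s->work);
   free(s);
   return 0;
}

static my_smoother *my_smoother_create(int n)
{
   my_smoother *s = malloc(sizeof(*s));
   s->base.destroy = my_smoother_destroy;
   s->work = calloc(n, sizeof(double));
   return s;
}

int main(void)
{
   void *smoother = my_smoother_create(16);

   /* Generic teardown: view the object as its base and call through it,
      mirroring hypre_SolverDestroy(smoother_base)(smoother).            */
   solver_base *base = (solver_base *) smoother;
   base->destroy(smoother);

   printf("smoother destroyed generically\n");
   return 0;
}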
MGR object empty!\n"); - return hypre_error_flag; - } - - if (reserved_coarse_size < 0) - { - hypre_error_in_arg(2); - return hypre_error_flag; - } /* free data not previously destroyed */ if ((mgr_data -> reserved_coarse_indexes)) { @@ -1032,263 +1032,6 @@ hypre_ParCSRMatrixLeftScale(HYPRE_Real *vector, return hypre_error_flag; } -/*-------------------------------------------------------------------------- - * hypre_MGRComputeNonGalerkinCoarseGrid - * - * Computes the level (grid) operator A_H = RAP. - * - * Available methods: - * 1: inv(A_FF) approximated by its (block) diagonal inverse - * 2: CPR-like approx. with inv(A_FF) approx. by its diagonal inverse - * 3: CPR-like approx. with inv(A_FF) approx. by its block diagonal inverse - * 4: inv(A_FF) approximated by sparse approximate inverse - * 5: Uses classical restriction R = [-Wr I] from input parameters list. - * - * Methods 1-4 assume that restriction is the injection operator. - * Method 5 assumes that interpolation is the injection operator. - * - * TODO (VPM): Can we have a single function that works for host and device? - *--------------------------------------------------------------------------*/ - -HYPRE_Int -hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, - hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, - hypre_ParCSRMatrix *Wr, - HYPRE_Int bsize, - HYPRE_Int ordering, - HYPRE_Int method, - HYPRE_Int max_elmts, - hypre_ParCSRMatrix **A_H_ptr) -{ - HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A_FF); - - hypre_ParCSRMatrix *A_H = NULL; - hypre_ParCSRMatrix *A_Hc = NULL; - hypre_ParCSRMatrix *Wp_tmp = NULL; - hypre_ParCSRMatrix *Wr_tmp = NULL; - hypre_ParCSRMatrix *A_CF_truncated = NULL; - hypre_ParCSRMatrix *A_FF_inv = NULL; - hypre_ParCSRMatrix *minus_Wp = NULL; - - HYPRE_Int i, i1, jj; - HYPRE_Int blk_inv_size; - HYPRE_Real neg_one = -1.0; - HYPRE_Real one = 1.0; - - if (method == 1) - { - if (Wp != NULL) - { - A_Hc = hypre_ParCSRMatMat(A_CF, Wp); - } - else - { - // Build block diagonal inverse for A_FF - hypre_ParCSRMatrixBlockDiagMatrix(A_FF, 1, -1, NULL, 1, &A_FF_inv); - - // compute Wp = A_FF_inv * A_FC - // NOTE: Use hypre_ParMatmul here instead of hypre_ParCSRMatMat to avoid padding - // zero entries at diagonals for the latter routine. Use MatMat once this padding - // issue is resolved since it is more efficient. - // hypre_ParCSRMatrix *Wp_tmp = hypre_ParCSRMatMat(A_FF_inv, A_FC); - Wp_tmp = hypre_ParMatmul(A_FF_inv, A_FC); - - /* Compute correction A_Hc = A_CF * (A_FF_inv * A_FC); */ - A_Hc = hypre_ParCSRMatMat(A_CF, Wp_tmp); - hypre_ParCSRMatrixDestroy(Wp_tmp); - hypre_ParCSRMatrixDestroy(A_FF_inv); - } - } - else if (method == 2 || method == 3) - { - /* Extract the diagonal of A_CF */ - hypre_MGRTruncateAcfCPR(A_CF, &A_CF_truncated); - if (Wp != NULL) - { - A_Hc = hypre_ParCSRMatMat(A_CF_truncated, Wp); - } - else - { - blk_inv_size = method == 2 ? 
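The block removed above, hypre_MGRComputeNonGalerkinCoarseGrid, is superseded by hypre_MGRBuildCoarseOperator in the new par_mgr_rap.c (declared in the protos.h hunk earlier). Its simplest variant, method 1 with a diagonal approximation of inv(A_FF), computes A_H = A_CC - A_CF * diag(A_FF)^{-1} * A_FC. The dense, self-contained sketch below writes that formula out for small row-major blocks; it is only an illustration, not the replacement routine.

/* Illustrative sketch: non-Galerkin coarse grid with a diagonal inverse of
   A_FF (assumes a nonzero diagonal).                                        */
static void nongalerkin_coarse_dense(int nF, int nC,
                                     const double *A_FF,   /* nF x nF, row-major */
                                     const double *A_FC,   /* nF x nC            */
                                     const double *A_CF,   /* nC x nF            */
                                     const double *A_CC,   /* nC x nC            */
                                     double       *A_H)    /* nC x nC, output    */
{
   for (int i = 0; i < nC; i++)
   {
      for (int j = 0; j < nC; j++)
      {
         double corr = 0.0;
         for (int k = 0; k < nF; k++)
         {
            /* (A_CF)_{ik} * (A_FC)_{kj} / (A_FF)_{kk} */
            corr += A_CF[i * nF + k] * A_FC[k * nC + j] / A_FF[k * nF + k];
         }
         A_H[i * nC + j] = A_CC[i * nC + j] - corr;
      }
   }
}

Per the removed comment, the CPR-like variants (methods 2 and 3) first truncate A_CF and use a scalar or block-diagonal inverse of A_FF, method 4 uses a sparse approximate inverse, and method 5 forms A_CC - Wr * A_FC from a user-provided classical restriction.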
1 : bsize; - hypre_ParCSRMatrixBlockDiagMatrix(A_FF, blk_inv_size, -1, NULL, 1, &A_FF_inv); - - /* TODO (VPM): We shouldn't need to compute Wr_tmp since we are passing in Wr already */ - Wr_tmp = hypre_ParCSRMatMat(A_CF_truncated, A_FF_inv); - A_Hc = hypre_ParCSRMatMat(Wr_tmp, A_FC); - hypre_ParCSRMatrixDestroy(Wr_tmp); - hypre_ParCSRMatrixDestroy(A_FF_inv); - } - hypre_ParCSRMatrixDestroy(A_CF_truncated); - } - else if (method == 4) - { - /* Approximate inverse for ideal interploation */ - hypre_MGRApproximateInverse(A_FF, &A_FF_inv); - - minus_Wp = hypre_ParCSRMatMat(A_FF_inv, A_FC); - A_Hc = hypre_ParCSRMatMat(A_CF, minus_Wp); - - hypre_ParCSRMatrixDestroy(minus_Wp); - } - else if (method == 5) - { - if (!Wr) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Expected Wr matrix!"); - return hypre_error_flag; - } - - /* A_Hc = Wr * A_FC */ - A_Hc = hypre_ParCSRMatMat(Wr, A_FC); - } - - /* Drop small entries in the correction term A_Hc */ - if (max_elmts > 0) - { - // perform dropping for A_Hc - // specific to multiphase poromechanics - // we only keep the diagonal of each block - HYPRE_Int n_local_cpoints = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_Hc)); - - hypre_CSRMatrix *A_Hc_diag = hypre_ParCSRMatrixDiag(A_Hc); - HYPRE_Complex *A_Hc_diag_a = hypre_CSRMatrixData(A_Hc_diag); - HYPRE_Int *A_Hc_diag_i = hypre_CSRMatrixI(A_Hc_diag); - HYPRE_Int *A_Hc_diag_j = hypre_CSRMatrixJ(A_Hc_diag); - HYPRE_Int ncol_diag = hypre_CSRMatrixNumCols(A_Hc_diag); - - hypre_CSRMatrix *A_Hc_offd = hypre_ParCSRMatrixOffd(A_Hc); - HYPRE_Complex *A_Hc_offd_a = hypre_CSRMatrixData(A_Hc_offd); - HYPRE_Int *A_Hc_offd_i = hypre_CSRMatrixI(A_Hc_offd); - HYPRE_Int *A_Hc_offd_j = hypre_CSRMatrixJ(A_Hc_offd); - - if (ordering == 0) // interleaved ordering - { - HYPRE_Int *A_Hc_diag_i_new, *A_Hc_diag_j_new; - HYPRE_Complex *A_Hc_diag_a_new; - HYPRE_Int num_nonzeros_diag_new = 0; - - HYPRE_Int *A_Hc_offd_i_new, *A_Hc_offd_j_new; - HYPRE_Complex *A_Hc_offd_a_new; - HYPRE_Int num_nonzeros_offd_new = 0; - - /* Allocate new memory */ - A_Hc_diag_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints + 1, memory_location); - A_Hc_diag_j_new = hypre_CTAlloc(HYPRE_Int, (bsize + max_elmts) * n_local_cpoints, - memory_location); - A_Hc_diag_a_new = hypre_CTAlloc(HYPRE_Complex, (bsize + max_elmts) * n_local_cpoints, - memory_location); - A_Hc_offd_i_new = hypre_CTAlloc(HYPRE_Int, n_local_cpoints + 1, memory_location); - A_Hc_offd_j_new = hypre_CTAlloc(HYPRE_Int, max_elmts * n_local_cpoints, - memory_location); - A_Hc_offd_a_new = hypre_CTAlloc(HYPRE_Complex, max_elmts * n_local_cpoints, - memory_location); - - for (i = 0; i < n_local_cpoints; i++) - { - HYPRE_Int max_num_nonzeros = A_Hc_diag_i[i + 1] - A_Hc_diag_i[i] + - A_Hc_offd_i[i + 1] - A_Hc_offd_i[i]; - HYPRE_Int *aux_j = hypre_CTAlloc(HYPRE_Int, max_num_nonzeros, memory_location); - HYPRE_Real *aux_data = hypre_CTAlloc(HYPRE_Real, max_num_nonzeros, memory_location); - HYPRE_Int row_start = i - (i % bsize); - HYPRE_Int row_stop = row_start + bsize - 1; - HYPRE_Int cnt = 0; - - for (jj = A_Hc_offd_i[i]; jj < A_Hc_offd_i[i + 1]; jj++) - { - aux_j[cnt] = A_Hc_offd_j[jj] + ncol_diag; - aux_data[cnt] = A_Hc_offd_a[jj]; - cnt++; - } - - for (jj = A_Hc_diag_i[i]; jj < A_Hc_diag_i[i + 1]; jj++) - { - aux_j[cnt] = A_Hc_diag_j[jj]; - aux_data[cnt] = A_Hc_diag_a[jj]; - cnt++; - } - hypre_qsort2_abs(aux_j, aux_data, 0, cnt - 1); - - for (jj = A_Hc_diag_i[i]; jj < A_Hc_diag_i[i + 1]; jj++) - { - i1 = A_Hc_diag_j[jj]; - if (i1 >= row_start && i1 <= row_stop) - { - // copy data to new arrays - 
A_Hc_diag_j_new[num_nonzeros_diag_new] = i1; - A_Hc_diag_a_new[num_nonzeros_diag_new] = A_Hc_diag_a[jj]; - ++num_nonzeros_diag_new; - } - else - { - // Do nothing - } - } - - if (max_elmts > 0) - { - for (jj = 0; jj < hypre_min(max_elmts, cnt); jj++) - { - HYPRE_Int col_idx = aux_j[jj]; - HYPRE_Real col_value = aux_data[jj]; - if (col_idx < ncol_diag && (col_idx < row_start || col_idx > row_stop)) - { - A_Hc_diag_j_new[num_nonzeros_diag_new] = col_idx; - A_Hc_diag_a_new[num_nonzeros_diag_new] = col_value; - ++num_nonzeros_diag_new; - } - else if (col_idx >= ncol_diag) - { - A_Hc_offd_j_new[num_nonzeros_offd_new] = col_idx - ncol_diag; - A_Hc_offd_a_new[num_nonzeros_offd_new] = col_value; - ++num_nonzeros_offd_new; - } - } - } - A_Hc_diag_i_new[i + 1] = num_nonzeros_diag_new; - A_Hc_offd_i_new[i + 1] = num_nonzeros_offd_new; - - hypre_TFree(aux_j, memory_location); - hypre_TFree(aux_data, memory_location); - } - - hypre_TFree(A_Hc_diag_i, memory_location); - hypre_TFree(A_Hc_diag_j, memory_location); - hypre_TFree(A_Hc_diag_a, memory_location); - hypre_CSRMatrixI(A_Hc_diag) = A_Hc_diag_i_new; - hypre_CSRMatrixJ(A_Hc_diag) = A_Hc_diag_j_new; - hypre_CSRMatrixData(A_Hc_diag) = A_Hc_diag_a_new; - hypre_CSRMatrixNumNonzeros(A_Hc_diag) = num_nonzeros_diag_new; - - hypre_TFree(A_Hc_offd_i, memory_location); - hypre_TFree(A_Hc_offd_j, memory_location); - hypre_TFree(A_Hc_offd_a, memory_location); - hypre_CSRMatrixI(A_Hc_offd) = A_Hc_offd_i_new; - hypre_CSRMatrixJ(A_Hc_offd) = A_Hc_offd_j_new; - hypre_CSRMatrixData(A_Hc_offd) = A_Hc_offd_a_new; - hypre_CSRMatrixNumNonzeros(A_Hc_offd) = num_nonzeros_offd_new; - } - else - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Non-interleaved dropping not implemented!"); - return hypre_error_flag; - } - } - - /* Coarse grid / Schur complement */ - hypre_ParCSRMatrixAdd(one, A_CC, neg_one, A_Hc, &A_H); - - /* Free memory */ - hypre_ParCSRMatrixDestroy(A_Hc); - - /* Set output pointer */ - *A_H_ptr = A_H; - - return hypre_error_flag; -} - HYPRE_Int hypre_MGRComputeAlgebraicFixedStress(hypre_ParCSRMatrix *A, HYPRE_BigInt *mgr_idx_array, @@ -2908,14 +2651,8 @@ hypre_MGRSetFSolver( void *mgr_vdata, void *fsolver ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } - HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); - HYPRE_Solver **aff_solver = (mgr_data -> aff_solver); + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Solver **aff_solver = (mgr_data -> aff_solver); if (aff_solver == NULL) { @@ -2943,24 +2680,18 @@ hypre_MGRSetFSolver( void *mgr_vdata, *--------------------------------------------------------------------------*/ HYPRE_Int -hypre_MGRSetFSolverAtLevel( HYPRE_Int level, - void *mgr_vdata, - void *fsolver ) +hypre_MGRSetFSolverAtLevel( void *mgr_vdata, + void *fsolver, + HYPRE_Int level ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } - HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); - HYPRE_Solver **aff_solver = (mgr_data -> aff_solver); + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Solver **aff_solver = (mgr_data -> aff_solver); /* Check if the requested level makes sense */ if (level < 0 || level >= max_num_coarse_levels) { - hypre_error_in_arg(2); + hypre_error_in_arg(1); return hypre_error_flag; } @@ -2973,26 +2704,20 @@ hypre_MGRSetFSolverAtLevel( HYPRE_Int 
level, } aff_solver[level] = (HYPRE_Solver *) fsolver; - (mgr_data -> fsolver_mode) = 0; + (mgr_data -> fsolver_mode) = 1; return hypre_error_flag; } /* set coarse grid solver */ HYPRE_Int -hypre_MGRSetCoarseSolver( void *mgr_vdata, +hypre_MGRSetCoarseSolver( void *mgr_vdata, HYPRE_Int (*coarse_grid_solver_solve)(void*, void*, void*, void*), HYPRE_Int (*coarse_grid_solver_setup)(void*, void*, void*, void*), - void *coarse_grid_solver ) + void *coarse_grid_solver ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } - (mgr_data -> coarse_grid_solver_solve) = coarse_grid_solver_solve; (mgr_data -> coarse_grid_solver_setup) = coarse_grid_solver_setup; (mgr_data -> coarse_grid_solver) = (HYPRE_Solver) coarse_grid_solver; @@ -3010,6 +2735,7 @@ hypre_MGRSetMaxCoarseLevels( void *mgr_vdata, HYPRE_Int maxcoarselevs ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; (mgr_data -> max_num_coarse_levels) = maxcoarselevs; + return hypre_error_flag; } @@ -3019,6 +2745,7 @@ hypre_MGRSetBlockSize( void *mgr_vdata, HYPRE_Int bsize ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; (mgr_data -> block_size) = bsize; + return hypre_error_flag; } @@ -3183,24 +2910,80 @@ hypre_MGRSetCoarseGridMethod( void *mgr_vdata, HYPRE_Int *cg_method ) HYPRE_Int i; HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); - hypre_TFree(mgr_data -> mgr_coarse_grid_method, HYPRE_MEMORY_HOST); - HYPRE_Int *mgr_coarse_grid_method = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, - HYPRE_MEMORY_HOST); + hypre_TFree(mgr_data -> coarse_grid_method, HYPRE_MEMORY_HOST); + HYPRE_Int *coarse_grid_method = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, + HYPRE_MEMORY_HOST); if (cg_method != NULL) { for (i = 0; i < max_num_coarse_levels; i++) { - mgr_coarse_grid_method[i] = cg_method[i]; + coarse_grid_method[i] = cg_method[i]; } } else { for (i = 0; i < max_num_coarse_levels; i++) { - mgr_coarse_grid_method[i] = 0; + coarse_grid_method[i] = 0; } } - (mgr_data -> mgr_coarse_grid_method) = mgr_coarse_grid_method; + (mgr_data -> coarse_grid_method) = coarse_grid_method; + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRSetNonGalerkinMaxElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRSetNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int max_elmts ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int *nonglk_max_elmts = (mgr_data -> nonglk_max_elmts); + HYPRE_Int i; + + if (!nonglk_max_elmts) + { + nonglk_max_elmts = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + } + hypre_TFree(mgr_data -> nonglk_max_elmts, HYPRE_MEMORY_HOST); + + for (i = 0; i < max_num_coarse_levels; i++) + { + nonglk_max_elmts[i] = max_elmts; + } + + (mgr_data -> nonglk_max_elmts) = nonglk_max_elmts; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRSetLevelNonGalerkinMaxElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRSetLevelNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int *max_elmts ) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int *nonglk_max_elmts = (mgr_data -> 
nonglk_max_elmts); + HYPRE_Int i; + + if (!nonglk_max_elmts) + { + nonglk_max_elmts = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + } + hypre_TFree(mgr_data -> nonglk_max_elmts, HYPRE_MEMORY_HOST); + + for (i = 0; i < max_num_coarse_levels; i++) + { + nonglk_max_elmts[i] = max_elmts[i]; + } + + (mgr_data -> nonglk_max_elmts) = nonglk_max_elmts; + return hypre_error_flag; } @@ -3461,7 +3244,12 @@ hypre_MGRSetMaxGlobalSmoothIters( void *mgr_vdata, HYPRE_Int max_iter ) return hypre_error_flag; } -/* Set global smoothing type for mgr solver */ +/*-------------------------------------------------------------------------- + * hypre_MGRSetGlobalSmoothType + * + * Set global smoothing type at the first (finest) MGR level + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_MGRSetGlobalSmoothType( void *mgr_vdata, HYPRE_Int gsmooth_type ) { @@ -3482,43 +3270,74 @@ hypre_MGRSetGlobalSmoothType( void *mgr_vdata, HYPRE_Int gsmooth_type ) return hypre_error_flag; } -/* Set global smoothing type for mgr solver */ +/*-------------------------------------------------------------------------- + * hypre_MGRSetLevelSmoothType + * + * Set global smoothing type at each MGR level. + *--------------------------------------------------------------------------*/ + HYPRE_Int -hypre_MGRSetLevelSmoothType( void *mgr_vdata, HYPRE_Int *gsmooth_type ) +hypre_MGRSetLevelSmoothType( void *mgr_vdata, + HYPRE_Int *gsmooth_type ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - HYPRE_Int i; - HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); - hypre_TFree((mgr_data -> level_smooth_type), HYPRE_MEMORY_HOST); + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int *level_smooth_type, i; + char msg[1024]; - HYPRE_Int *level_smooth_type = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + /* Set level_smooth_type array */ + level_smooth_type = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + hypre_TFree((mgr_data -> level_smooth_type), HYPRE_MEMORY_HOST); if (gsmooth_type != NULL) { for (i = 0; i < max_num_coarse_levels; i++) { - level_smooth_type[i] = gsmooth_type[i]; + /* For meaningful values of global smoothing type, the option set via + hypre_MGRSetGlobalSmootherAtLevel has precedence over the option set + via this function. */ + if ((mgr_data -> level_smoother) && (mgr_data -> level_smoother)[i] && + (gsmooth_type[i] >= 0)) + { + hypre_sprintf(msg, "hypre_MGRSetLevelSmoothType does not take effect at level %d since\n\ + hypre_MGRSetGlobalSmootherAtLevel has been called at the same level", + i); + hypre_error_w_msg(HYPRE_ERROR_GENERIC, msg); + } + else + { + level_smooth_type[i] = gsmooth_type[i]; + } } } else { for (i = 0; i < max_num_coarse_levels; i++) { - level_smooth_type[i] = 0; + level_smooth_type[i] = 0; // Jacobi } } (mgr_data -> level_smooth_type) = level_smooth_type; + return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_MGRSetLevelSmoothIters + * + * Set the number of global smoothing iterations at each MGR level. 
+ *--------------------------------------------------------------------------*/ + HYPRE_Int -hypre_MGRSetLevelSmoothIters( void *mgr_vdata, HYPRE_Int *gsmooth_iters ) +hypre_MGRSetLevelSmoothIters( void *mgr_vdata, + HYPRE_Int *gsmooth_iters ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - HYPRE_Int i; - HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); - hypre_TFree((mgr_data -> level_smooth_iters), HYPRE_MEMORY_HOST); + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int *level_smooth_iters; + HYPRE_Int i; - HYPRE_Int *level_smooth_iters = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + level_smooth_iters = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + hypre_TFree((mgr_data -> level_smooth_iters), HYPRE_MEMORY_HOST); if (gsmooth_iters != NULL) { for (i = 0; i < max_num_coarse_levels; i++) @@ -3534,6 +3353,75 @@ hypre_MGRSetLevelSmoothIters( void *mgr_vdata, HYPRE_Int *gsmooth_iters ) } } (mgr_data -> level_smooth_iters) = level_smooth_iters; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRSetGlobalSmootherAtLevel + * + * Set global relaxation method for a given MGR level via a HYPRE solver object. + * + * Note this function asks for a level identifier and doesn't expect an array + * of function pointers for each level (as done by SetLevel functions). + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRSetGlobalSmootherAtLevel( void *mgr_vdata, + HYPRE_Solver smoother, + HYPRE_Int level ) +{ + hypre_Solver *base = (hypre_Solver*) smoother; + HYPRE_PtrToSolverFcn setup = hypre_SolverSetup(base); + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + HYPRE_Int max_num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int smoother_type; + char msg[1024]; + + /* Check if the requested level makes sense */ + if (level < 0 || level >= max_num_coarse_levels) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + /* Allocate level_smoother if needed */ + if (!(mgr_data -> level_smoother)) + { + (mgr_data -> level_smoother) = hypre_CTAlloc(HYPRE_Solver, + max_num_coarse_levels, + HYPRE_MEMORY_HOST); + } + + /* Allocate level_smooth_type if needed */ + if (!(mgr_data -> level_smooth_type)) + { + (mgr_data -> level_smooth_type) = hypre_CTAlloc(HYPRE_Int, + max_num_coarse_levels, + HYPRE_MEMORY_HOST); + } + + (mgr_data -> level_smoother)[level] = smoother; + + /* Obtain corresponding smoother type */ + if (setup == (HYPRE_PtrToSolverFcn) HYPRE_ILUSetup) + { + smoother_type = 16; + } + else + { + smoother_type = -1; /* Unknown smoother */ + } + + /* Check if level_smooth_type[level] corresponds to the right smoother type */ + if ((mgr_data -> level_smooth_type)[level] > 0 && + (mgr_data -> level_smooth_type)[level] != smoother_type) + { + hypre_sprintf(msg, "Reseting global relaxation type at level %d to user's smoother", level); + hypre_error_w_msg(HYPRE_ERROR_GENERIC, msg); + } + (mgr_data -> level_smooth_type)[level] = smoother_type; + return hypre_error_flag; } @@ -3589,11 +3477,6 @@ hypre_MGRGetNumIterations( void *mgr_vdata, HYPRE_Int *num_iterations ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } *num_iterations = mgr_data->num_iterations; return hypre_error_flag; @@ -3605,11 +3488,6 @@ hypre_MGRGetFinalRelativeResidualNorm( void 
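The setters above add per-level control of the coarse-grid method, the non-Galerkin dropping limit and the global smoother, and hypre_MGRSetFSolverAtLevel / hypre_MGRSetGlobalSmootherAtLevel now take the level as the last argument. The sketch below strings the calls together using the internal entry points whose signatures appear in these hunks; it assumes a build against hypre's internal headers, and an application would normally go through the corresponding HYPRE_MGR* wrappers where they exist.

#include "HYPRE_parcsr_ls.h"
#include "_hypre_parcsr_ls.h"   /* internal header declaring the hypre_MGR* entry points */

void configure_mgr_levels(void)
{
   void        *mgr          = hypre_MGRCreate();
   HYPRE_Solver ilu          = NULL;
   HYPRE_Int    cg_method[2] = {0, 1};   /* level 0: Galerkin RAP, level 1: non-Galerkin (method 1) */

   /* Set the number of coarse levels first: the array setters copy
      max_num_coarse_levels entries from their input arrays.         */
   hypre_MGRSetMaxCoarseLevels(mgr, 2);
   hypre_MGRSetCoarseGridMethod(mgr, cg_method);
   hypre_MGRSetNonGalerkinMaxElmts(mgr, 1);     /* same dropping limit on every level */

   /* Attach an ILU object as the level-0 global smoother; the type is
      inferred from the solver's setup pointer.                        */
   HYPRE_ILUCreate(&ilu);
   hypre_MGRSetGlobalSmootherAtLevel(mgr, ilu, 0);

   /* ... setup, solve and teardown as usual ... */
   hypre_MGRDestroy(mgr);
}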
*mgr_vdata, HYPRE_Real *res_norm ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } *res_norm = mgr_data->final_rel_residual_norm; return hypre_error_flag; @@ -3620,11 +3498,6 @@ hypre_MGRGetCoarseGridConvergenceFactor( void *mgr_vdata, HYPRE_Real *conv_facto { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } *conv_factor = (mgr_data -> cg_convergence_factor); return hypre_error_flag; @@ -4239,11 +4112,6 @@ hypre_MGRGetCoarseGridMatrix( void *mgr_vdata, hypre_ParCSRMatrix **RAP ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } if (mgr_data -> RAP == NULL) { hypre_error_w_msg(HYPRE_ERROR_GENERIC, @@ -4261,11 +4129,6 @@ hypre_MGRGetCoarseGridSolution( void *mgr_vdata, hypre_ParVector **sol ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } if (mgr_data -> U_array == NULL) { hypre_error_w_msg(HYPRE_ERROR_GENERIC, @@ -4283,11 +4146,6 @@ hypre_MGRGetCoarseGridRHS( void *mgr_vdata, hypre_ParVector **rhs ) { hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; - if (!mgr_data) - { - hypre_error_in_arg(1); - return hypre_error_flag; - } if (mgr_data -> F_array == NULL) { hypre_error_w_msg(HYPRE_ERROR_GENERIC, @@ -4548,6 +4406,11 @@ hypre_MGRDataPrint(void *mgr_vdata) ***************************************************************************/ #ifdef HYPRE_USING_DSUPERLU + +/*-------------------------------------------------------------------------- + * hypre_MGRDirectSolverCreate + *--------------------------------------------------------------------------*/ + void * hypre_MGRDirectSolverCreate() { @@ -4556,33 +4419,44 @@ hypre_MGRDirectSolverCreate() return NULL; } +/*-------------------------------------------------------------------------- + * hypre_MGRDirectSolverSetup + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_MGRDirectSolverSetup( void *solver, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ) { - HYPRE_Int ierr; - ierr = hypre_SLUDistSetup( solver, A, 0); + HYPRE_UNUSED_VAR(f); + HYPRE_UNUSED_VAR(u); - return ierr; + return hypre_SLUDistSetup(solver, A, 0); } + +/*-------------------------------------------------------------------------- + * hypre_MGRDirectSolverSolve + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_MGRDirectSolverSolve( void *solver, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ) { - hypre_SLUDistSolve(solver, f, u); + HYPRE_UNUSED_VAR(A); - return hypre_error_flag; + return hypre_SLUDistSolve(solver, f, u); } +/*-------------------------------------------------------------------------- + * hypre_MGRDirectSolverDestroy + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_MGRDirectSolverDestroy( void *solver ) { - hypre_SLUDistDestroy(solver); - - return hypre_error_flag; + return hypre_SLUDistDestroy(solver); } #endif diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr.h b/3rd_party/hypre/src/parcsr_ls/par_mgr.h index 517fc5b87..5e761be17 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr.h +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr.h @@ -8,24 +8,6 @@ #ifndef hypre_ParMGR_DATA_HEADER #define hypre_ParMGR_DATA_HEADER 
-/*-------------------------------------------------------------------------- - * MGR print level codes - *--------------------------------------------------------------------------*/ - -#define HYPRE_MGR_PRINT_INFO_SETUP 0x01 /* 1 (1st bit) */ -#define HYPRE_MGR_PRINT_INFO_SOLVE 0x02 /* 2 (2nd bit) */ -#define HYPRE_MGR_PRINT_INFO_PARAMS 0x04 /* 4 (3rd bit) */ -#define HYPRE_MGR_PRINT_MODE_ASCII 0x08 /* 8 (4th bit) */ -#define HYPRE_MGR_PRINT_FINE_MATRIX 0x10 /* 16 (5th bit) */ -#define HYPRE_MGR_PRINT_FINE_RHS 0x20 /* 32 (6th bit) */ -#define HYPRE_MGR_PRINT_CRSE_MATRIX 0x40 /* 64 (7th bit) */ -#define HYPRE_MGR_PRINT_LVLS_MATRIX 0x80 /* 128 (8th bit) */ -/* ... */ -/* Reserved codes */ -#define HYPRE_MGR_PRINT_RESERVED_C 0x10000000 /* 268435456 (29th bit) */ -#define HYPRE_MGR_PRINT_RESERVED_B 0x20000000 /* 536870912 (30th bit) */ -#define HYPRE_MGR_PRINT_RESERVED_A 0x40000000 /* 1073741824 (31th bit) */ - /*-------------------------------------------------------------------------- * hypre_ParMGRData *--------------------------------------------------------------------------*/ @@ -140,7 +122,8 @@ typedef struct HYPRE_Int *Frelax_num_functions; /* Non-Galerkin coarse grid */ - HYPRE_Int *mgr_coarse_grid_method; /* TODO (VPM): Change name? remove mgr_?*/ + HYPRE_Int *coarse_grid_method; + HYPRE_Int *nonglk_max_elmts; /* V-cycle F relaxation method */ hypre_ParAMGData **FrelaxVcycleData; @@ -199,11 +182,34 @@ typedef struct #define FPT(i, bsize) (((i) % (bsize)) == FMRK) #define CPT(i, bsize) (((i) % (bsize)) == CMRK) +/*-------------------------------------------------------------------------- + * MGR print level codes + *--------------------------------------------------------------------------*/ + +#define HYPRE_MGR_PRINT_INFO_SETUP 0x01 /* 1 (1st bit) */ +#define HYPRE_MGR_PRINT_INFO_SOLVE 0x02 /* 2 (2nd bit) */ +#define HYPRE_MGR_PRINT_INFO_PARAMS 0x04 /* 4 (3rd bit) */ +#define HYPRE_MGR_PRINT_MODE_ASCII 0x08 /* 8 (4th bit) */ +#define HYPRE_MGR_PRINT_FINE_MATRIX 0x10 /* 16 (5th bit) */ +#define HYPRE_MGR_PRINT_FINE_RHS 0x20 /* 32 (6th bit) */ +#define HYPRE_MGR_PRINT_CRSE_MATRIX 0x40 /* 64 (7th bit) */ +#define HYPRE_MGR_PRINT_LVLS_MATRIX 0x80 /* 128 (8th bit) */ +/* ... */ +/* Reserved codes */ +#define HYPRE_MGR_PRINT_RESERVED_C 0x10000000 /* 268435456 (29th bit) */ +#define HYPRE_MGR_PRINT_RESERVED_B 0x20000000 /* 536870912 (30th bit) */ +#define HYPRE_MGR_PRINT_RESERVED_A 0x40000000 /* 1073741824 (31th bit) */ + /*-------------------------------------------------------------------------- * Acessor macros *--------------------------------------------------------------------------*/ /* TODO (VPM): add remaining acessor macros */ +#define hypre_ParMGRDataBlockSize(data) ((data) -> block_size) /* TODO (VPM): block_dim? 3x3=9 is the block_size */ +#define hypre_ParMGRDataBlockNumCoarseIndexes(data) ((data) -> block_num_coarse_indexes) +#define hypre_ParMGRDataBlockCFMarker(data) ((data) -> block_cf_marker) +#define hypre_ParMGRDataPointMarker(data) ((data) -> point_marker_array) + #define hypre_ParMGRDataNumCoarseLevels(data) ((data) -> num_coarse_levels) /* TODO (VPM): change to num_levels ? */ #define hypre_ParMGRDataMaxCoarseLevels(data) ((data) -> max_num_coarse_levels) /* TODO (VPM): change to max_levels ? 
*/ @@ -213,6 +219,8 @@ typedef struct #define hypre_ParMGRDataB(data, i) ((data) -> B_array[i]) #define hypre_ParMGRDataPArray(data) ((data) -> P_array) #define hypre_ParMGRDataP(data, i) ((data) -> P_array[i]) +#define hypre_ParMGRDataRArray(data) ((data) -> R_array) +#define hypre_ParMGRDataR(data, i) ((data) -> R_array[i]) #define hypre_ParMGRDataRTArray(data) ((data) -> RT_array) #define hypre_ParMGRDataRT(data, i) ((data) -> RT_array[i]) #define hypre_ParMGRDataBFFArray(data) ((data) -> B_FF_array) @@ -235,8 +243,8 @@ typedef struct #define hypre_ParMGRDataAFFsolver(data) ((data) -> aff_solver) #define hypre_ParMGRDataAFFsolverI(data) ((data) -> aff_solver[i]) -#define hypre_ParMGRDataCoarseGridMethod(data) ((data) -> mgr_coarse_grid_method) -#define hypre_ParMGRDataCoarseGridMethodI(data, i) ((data) -> mgr_coarse_grid_method[i]) +#define hypre_ParMGRDataCoarseGridMethod(data) ((data) -> coarse_grid_method) +#define hypre_ParMGRDataCoarseGridMethodI(data, i) ((data) -> coarse_grid_method[i]) #define hypre_ParMGRDataCoarseGridSolver(data) ((data) -> coarse_grid_solver) #define hypre_ParMGRDataCoarseGridSolverSetup(data) ((data) -> coarse_grid_solver_setup) diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_device.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_device.c index 7c168978d..c9f37eb8c 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr_device.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_device.c @@ -1001,133 +1001,4 @@ hypre_ParCSRMatrixBlockDiagMatrixDevice( hypre_ParCSRMatrix *A, return hypre_error_flag; } -/*-------------------------------------------------------------------------- - * hypre_MGRComputeNonGalerkinCGDevice - * - * See hypre_MGRComputeNonGalerkinCoarseGrid for available methods. - * - * TODO (VPM): Can we have a single function that works for host and device? - * inv(A_FF)*A_FC might have been computed before. Reuse it! 
- *--------------------------------------------------------------------------*/ - -HYPRE_Int -hypre_MGRComputeNonGalerkinCGDevice(hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, - hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, - hypre_ParCSRMatrix *Wr, - HYPRE_Int blk_size, - HYPRE_Int method, - HYPRE_Complex threshold, - hypre_ParCSRMatrix **A_H_ptr) -{ - /* Local variables */ - hypre_ParCSRMatrix *A_H; - hypre_ParCSRMatrix *A_Hc; - hypre_ParCSRMatrix *A_CF_trunc; - hypre_ParCSRMatrix *Wp_tmp = Wp; - HYPRE_Complex alpha = -1.0; - - hypre_GpuProfilingPushRange("MGRComputeNonGalerkinCG"); - - /* Truncate A_CF according to the method */ - if (method == 2 || method == 3) - { - hypre_MGRTruncateAcfCPRDevice(A_CF, &A_CF_trunc); - } - else - { - A_CF_trunc = A_CF; - } - - /* Compute Wp/Wr if not passed in */ - if (!Wp && (method == 1 || method == 2)) - { - hypre_Vector *D_FF_inv; - HYPRE_Complex *data; - - /* Create vector to store A_FF's diagonal inverse */ - D_FF_inv = hypre_SeqVectorCreate(hypre_ParCSRMatrixNumRows(A_FF)); - hypre_SeqVectorInitialize_v2(D_FF_inv, HYPRE_MEMORY_DEVICE); - data = hypre_VectorData(D_FF_inv); - - /* Compute the inverse of A_FF and compute its inverse */ - hypre_CSRMatrixExtractDiagonalDevice(hypre_ParCSRMatrixDiag(A_FF), data, 2); - - /* Compute D_FF_inv*A_FC */ - Wp_tmp = hypre_ParCSRMatrixClone(A_FC, 1); - hypre_CSRMatrixDiagScaleDevice(hypre_ParCSRMatrixDiag(Wp_tmp), D_FF_inv, NULL); - hypre_CSRMatrixDiagScaleDevice(hypre_ParCSRMatrixOffd(Wp_tmp), D_FF_inv, NULL); - - /* Free memory */ - hypre_SeqVectorDestroy(D_FF_inv); - } - else if (!Wp && (method == 3)) - { - hypre_ParCSRMatrix *B_FF_inv; - - /* Compute the block diagonal inverse of A_FF */ - hypre_ParCSRMatrixBlockDiagMatrixDevice(A_FF, blk_size, -1, NULL, 1, &B_FF_inv); - - /* Compute Wp = A_FF_inv * A_FC */ - Wp_tmp = hypre_ParCSRMatMat(B_FF_inv, A_FC); - - /* Free memory */ - hypre_ParCSRMatrixDestroy(B_FF_inv); - } - else - { - if (method != 5) - { - /* Use approximate inverse for ideal interploation */ - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Error: feature not implemented yet!"); - hypre_GpuProfilingPopRange(); - - return hypre_error_flag; - } - } - - /* Compute A_Hc (the correction for A_H) */ - if (method != 5) - { - A_Hc = hypre_ParCSRMatMat(A_CF_trunc, Wp_tmp); - } - else if (Wr && (method == 5)) - { - A_Hc = hypre_ParCSRMatMat(Wr, A_FC); - } - else - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Wr matrix was not provided!"); - hypre_GpuProfilingPopRange(); - - return hypre_error_flag; - } - - /* Drop small entries from A_Hc */ - hypre_ParCSRMatrixDropSmallEntriesDevice(A_Hc, threshold, -1); - - /* Coarse grid (Schur complement) computation */ - hypre_ParCSRMatrixAdd(1.0, A_CC, alpha, A_Hc, &A_H); - - /* Free memory */ - hypre_ParCSRMatrixDestroy(A_Hc); - if (Wp_tmp != Wp) - { - hypre_ParCSRMatrixDestroy(Wp_tmp); - } - if (method == 2 || method == 3) - { - hypre_ParCSRMatrixDestroy(A_CF_trunc); - } - - /* Set output pointer to coarse grid matrix */ - *A_H_ptr = A_H; - - hypre_GpuProfilingPopRange(); - - return hypre_error_flag; -} - #endif diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_interp.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_interp.c index 635569352..68e7e6b9b 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr_interp.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_interp.c @@ -18,22 +18,41 @@ HYPRE_Int hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, - HYPRE_Int *CF_marker, - hypre_ParCSRMatrix 
*aux_mat, + hypre_ParCSRMatrix *S, + hypre_IntArray *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Real trunc_factor, HYPRE_Int max_elmts, HYPRE_Int blk_size, - hypre_ParCSRMatrix **P_ptr, HYPRE_Int interp_type, - HYPRE_Int num_sweeps_post) + HYPRE_Int num_sweeps_post, + hypre_ParCSRMatrix **Wp_ptr, + hypre_ParCSRMatrix **P_ptr) { + HYPRE_Int *CF_marker_data = hypre_IntArrayData(CF_marker); hypre_ParCSRMatrix *P = NULL; + hypre_ParCSRMatrix *Wp = NULL; + #if defined (HYPRE_USING_GPU) - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(memory_location); #endif + /* Sanity checks */ + if (!Wp_ptr) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Wp_ptr is not NULL!"); + return hypre_error_flag; + } + + if (!P_ptr) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "P_ptr is not NULL!"); + return hypre_error_flag; + } + HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("Interp"); /* Interpolation for each level */ if (interp_type < 3) @@ -41,12 +60,13 @@ hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, #if defined (HYPRE_USING_GPU) if (exec == HYPRE_EXEC_DEVICE) { - hypre_MGRBuildPDevice(A, CF_marker, num_cpts_global, interp_type, &P); + hypre_MGRBuildPDevice(A, CF_marker_data, num_cpts_global, interp_type, &P); } else #endif { - hypre_MGRBuildPHost(A, CF_marker, num_cpts_global, interp_type, &P); + hypre_MGRBuildPHost(A, A_FF, A_FC, CF_marker_data, num_cpts_global, + interp_type, &Wp, &P); /* TODO (VPM): Revisit Prolongation post-smoothing */ #if 0 @@ -58,7 +78,7 @@ hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, for (i = 0; i < num_sweeps_post; i++) { - hypre_BoomerAMGJacobiInterp(A, &P, S, 1, NULL, CF_marker, 0, + hypre_BoomerAMGJacobiInterp(A, &P, S, 1, NULL, CF_marker_data, 0, jac_trunc_threshold, jac_trunc_threshold_minus); } hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts); @@ -73,47 +93,54 @@ hypre_MGRBuildInterp(hypre_ParCSRMatrix *A, #if defined (HYPRE_USING_GPU) if (exec == HYPRE_EXEC_DEVICE) { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "No GPU support!"); - - HYPRE_ANNOTATE_FUNC_END; - return hypre_error_flag; + hypre_IntArrayMigrate(CF_marker, HYPRE_MEMORY_HOST); + hypre_ParCSRMatrixMigrate(A, HYPRE_MEMORY_HOST); + hypre_MGRBuildInterpApproximateInverse(A, CF_marker_data, num_cpts_global, &P); + hypre_ParCSRMatrixMigrate(A, memory_location); + hypre_IntArrayMigrate(CF_marker, memory_location); } else #endif { - hypre_MGRBuildInterpApproximateInverse(A, CF_marker, num_cpts_global, &P); - hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts); + hypre_MGRBuildInterpApproximateInverse(A, CF_marker_data, num_cpts_global, &P); } + + /* Perform truncation */ + hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts); } else if (interp_type == 5) { - hypre_BoomerAMGBuildModExtInterp(A, CF_marker, aux_mat, num_cpts_global, + hypre_BoomerAMGBuildModExtInterp(A, CF_marker_data, S, num_cpts_global, 1, NULL, 0, trunc_factor, max_elmts, &P); } else if (interp_type == 6) { - hypre_BoomerAMGBuildModExtPIInterp(A, CF_marker, aux_mat, num_cpts_global, + hypre_BoomerAMGBuildModExtPIInterp(A, CF_marker_data, S, num_cpts_global, 1, NULL, 0, trunc_factor, max_elmts, &P); } else if (interp_type == 7) { - hypre_BoomerAMGBuildModExtPEInterp(A, CF_marker, aux_mat, num_cpts_global, + hypre_BoomerAMGBuildModExtPEInterp(A, CF_marker_data, S, num_cpts_global, 1, NULL, 0, trunc_factor, max_elmts, &P); } else if (interp_type == 12) { - 
hypre_MGRBuildPBlockJacobi(A, A_FF, A_FC, aux_mat, blk_size, CF_marker, &P); + /* Block diagonal interpolation */ + hypre_MGRBuildBlockJacobiWp(A_FF, A_FC, blk_size, &Wp); + hypre_MGRBuildBlockJacobiP(A, A_FF, A_FC, Wp, blk_size, CF_marker_data, &P); } else { /* Classical modified interpolation */ - hypre_BoomerAMGBuildInterp(A, CF_marker, aux_mat, num_cpts_global, - 1, NULL, 0, trunc_factor, max_elmts, &P); + hypre_BoomerAMGBuildInterp(A, CF_marker_data, S, num_cpts_global, 1, NULL, 0, + trunc_factor, max_elmts, &P); } - /* set pointer to P */ + /* set pointer to Wp and P */ + *Wp_ptr = Wp; *P_ptr = P; + hypre_GpuProfilingPopRange(); HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; @@ -164,6 +191,7 @@ hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, #endif HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("Restrict"); /* Build AT (transpose A) */ if (restrict_type > 0 && restrict_type != 14) @@ -215,9 +243,36 @@ hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, hypre_MGRBuildInterpApproximateInverse(AT, CF_marker_data, num_cpts_global, &RT); hypre_BoomerAMGInterpTruncation(RT, trunc_factor, max_elmts); } + else if (restrict_type == 4 || restrict_type == 5) + { + /* Approximate Ideal Restriction (AIR) */ + HYPRE_Real filter_thresholdR = 0.0; + HYPRE_Int gmres_switch = 64; + HYPRE_Int is_triangular = 0; + HYPRE_Int *dofunc_buff_data = NULL; + HYPRE_Int air15_flag = 1; + HYPRE_Int debug = 0; + + hypre_BoomerAMGCreateSabs(A, strong_threshold, 1.0, 1, NULL, &ST); + + if (restrict_type == 4) + { + /* distance-1 AIR */ + hypre_BoomerAMGBuildRestrAIR(A, CF_marker_data, ST, num_cpts_global, 1, + dofunc_buff_data, filter_thresholdR, debug, &R, + is_triangular, gmres_switch); + } + else + { + /* distance-1.5 AIR - distance 2 locally and distance 1 across procs. 
*/ + hypre_BoomerAMGBuildRestrDist2AIR(A, CF_marker_data, ST, num_cpts_global, 1, + dofunc_buff_data, filter_thresholdR, debug, &R, + air15_flag, is_triangular, gmres_switch); + } + } else if (restrict_type == 12) { - hypre_MGRBuildPBlockJacobi(AT, A_FFT, A_FCT, NULL, blk_size, CF_marker_data, &RT); + hypre_MGRBuildBlockJacobiP(AT, A_FFT, A_FCT, NULL, blk_size, CF_marker_data, &RT); } else if (restrict_type == 13) // CPR-like restriction operator { @@ -232,6 +287,7 @@ hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, { hypre_error_w_msg(HYPRE_ERROR_GENERIC, "No GPU support!"); + hypre_GpuProfilingPopRange(); HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; } @@ -296,17 +352,12 @@ hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, *W_ptr = W; /* Free memory */ - if (restrict_type > 0) - { - hypre_ParCSRMatrixDestroy(AT); - hypre_ParCSRMatrixDestroy(A_FFT); - hypre_ParCSRMatrixDestroy(A_FCT); - } - if (restrict_type > 5) - { - hypre_ParCSRMatrixDestroy(ST); - } + hypre_ParCSRMatrixDestroy(AT); + hypre_ParCSRMatrixDestroy(A_FFT); + hypre_ParCSRMatrixDestroy(A_FCT); + hypre_ParCSRMatrixDestroy(ST); + hypre_GpuProfilingPopRange(); HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; @@ -483,10 +534,13 @@ hypre_MGRBuildPFromWpHost( hypre_ParCSRMatrix *A, hypre_CSRMatrixI(P_offd) = P_offd_i; hypre_CSRMatrixJ(P_offd) = P_offd_j; - hypre_ParCSRMatrixDeviceColMapOffd(P) = hypre_ParCSRMatrixDeviceColMapOffd(Wp); - hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(Wp); - //hypre_ParCSRMatrixDeviceColMapOffd(Wp) = NULL; - //hypre_ParCSRMatrixColMapOffd(Wp) = NULL; + hypre_ParCSRMatrixColMapOffd(P) = + hypre_TAlloc(HYPRE_BigInt, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(Wp)), + memory_location_P); + hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(P), hypre_ParCSRMatrixColMapOffd(Wp), + HYPRE_BigInt, hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(Wp)), + memory_location_P, memory_location_P); hypre_ParCSRMatrixNumNonzeros(P) = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(P)) + hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(P)); @@ -501,7 +555,7 @@ hypre_MGRBuildPFromWpHost( hypre_ParCSRMatrix *A, /*-------------------------------------------------------------------------- * hypre_MGRBuildBlockJacobiWp * - * TODO: Move this to hypre_MGRBuildPBlockJacobi? (VPM) + * TODO: Move this to hypre_MGRBuildBlockJacobiP? (VPM) *--------------------------------------------------------------------------*/ HYPRE_Int @@ -533,11 +587,11 @@ hypre_MGRBuildBlockJacobiWp( hypre_ParCSRMatrix *A_FF, } /*-------------------------------------------------------------------------- - * hypre_MGRBuildPBlockJacobi + * hypre_MGRBuildBlockJacobiP *--------------------------------------------------------------------------*/ HYPRE_Int -hypre_MGRBuildPBlockJacobi( hypre_ParCSRMatrix *A, +hypre_MGRBuildBlockJacobiP( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *Wp, @@ -585,7 +639,7 @@ hypre_ExtendWtoPHost(HYPRE_Int P_nr_of_rows, HYPRE_Int *P_diag_j, HYPRE_Complex *P_diag_data, HYPRE_Int *W_offd_i, - HYPRE_Int *P_offd_i ) + HYPRE_Int *P_offd_i) { HYPRE_Int jj_counter, jj_counter_offd; HYPRE_Int start_indexing = 0; /* start indexing for P_data at 0 */ @@ -599,7 +653,7 @@ hypre_ExtendWtoPHost(HYPRE_Int P_nr_of_rows, * Intialize counters and allocate mapping vector. 
*-----------------------------------------------------------------------*/ - fine_to_coarse = hypre_CTAlloc(HYPRE_Int, P_nr_of_rows, HYPRE_MEMORY_HOST); + fine_to_coarse = hypre_CTAlloc(HYPRE_Int, P_nr_of_rows, HYPRE_MEMORY_HOST); for (i = 0; i < P_nr_of_rows; i++) { fine_to_coarse[i] = -1; } @@ -624,7 +678,7 @@ hypre_ExtendWtoPHost(HYPRE_Int P_nr_of_rows, } /*----------------------------------------------------------------------- - * Intialize some stuff. + * Intialize counters *-----------------------------------------------------------------------*/ jj_counter = start_indexing; @@ -670,7 +724,8 @@ hypre_ExtendWtoPHost(HYPRE_Int P_nr_of_rows, P_diag_i[P_nr_of_rows] = jj_counter; hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST); - return 0; + + return hypre_error_flag; } /*-------------------------------------------------------------------------- @@ -681,66 +736,62 @@ hypre_ExtendWtoPHost(HYPRE_Int P_nr_of_rows, HYPRE_Int hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, + hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Int method, + hypre_ParCSRMatrix **Wp_ptr, hypre_ParCSRMatrix **P_ptr) { MPI_Comm comm = hypre_ParCSRMatrixComm(A); + HYPRE_Int num_rows_A = hypre_ParCSRMatrixNumRows(A); + HYPRE_Int num_rows_AFF = hypre_ParCSRMatrixNumRows(A_FF); HYPRE_Int num_procs, my_id; - HYPRE_Int A_nr_of_rows = hypre_ParCSRMatrixNumRows(A); - hypre_ParCSRMatrix *A_FF = NULL, *A_FC = NULL, *P = NULL; + HYPRE_Real zero = 0.0; + HYPRE_Real one = 1.0; + HYPRE_Complex scal = 1.0; + hypre_CSRMatrix *A_FF_diag = hypre_ParCSRMatrixDiag(A_FF); + hypre_CSRMatrix *A_FC_diag = hypre_ParCSRMatrixDiag(A_FC); + hypre_CSRMatrix *A_FF_offd = hypre_ParCSRMatrixOffd(A_FF); + hypre_CSRMatrix *A_FC_offd = hypre_ParCSRMatrixOffd(A_FC); + + hypre_ParVector *D_FF_inv; + hypre_ParCSRMatrix *P, *Wp; hypre_CSRMatrix *W_diag = NULL, *W_offd = NULL; - HYPRE_Int P_diag_nnz, nfpoints; + hypre_CSRMatrix *P_diag = NULL, *P_offd = NULL; + HYPRE_Int P_diag_nnz; HYPRE_Int *P_diag_i = NULL, *P_diag_j = NULL, *P_offd_i = NULL; - HYPRE_Complex *P_diag_data = NULL, *diag = NULL, *diag1 = NULL; - HYPRE_BigInt nC_global; + HYPRE_Complex *P_diag_a = NULL, *diag = NULL, *diag_FF = NULL; HYPRE_Int i; + HYPRE_Complex dsum; HYPRE_MemoryLocation memory_location_P = hypre_ParCSRMatrixMemoryLocation(A); hypre_MPI_Comm_size(comm, &num_procs); hypre_MPI_Comm_rank(comm, &my_id); - nfpoints = 0; - for (i = 0; i < A_nr_of_rows; i++) - { - if (CF_marker[i] == -1) - { - nfpoints++; - } - } - if (method > 0) { - hypre_ParCSRMatrixGenerateFFFCHost(A, CF_marker, num_cpts_global, NULL, &A_FC, &A_FF); - diag = hypre_CTAlloc(HYPRE_Complex, nfpoints, memory_location_P); + diag = hypre_CTAlloc(HYPRE_Complex, num_rows_AFF, memory_location_P); if (method == 1) { - // extract diag inverse sqrt - // hypre_CSRMatrixExtractDiagonalHost(hypre_ParCSRMatrixDiag(A_FF), diag, 3); - // L1-Jacobi-type interpolation - HYPRE_Complex scal = 1.0; - hypre_CSRMatrix *A_FF_diag = hypre_ParCSRMatrixDiag(A_FF); - hypre_CSRMatrix *A_FC_diag = hypre_ParCSRMatrixDiag(A_FC); - hypre_CSRMatrix *A_FF_offd = hypre_ParCSRMatrixOffd(A_FF); - hypre_CSRMatrix *A_FC_offd = hypre_ParCSRMatrixOffd(A_FC); - - diag1 = hypre_CTAlloc(HYPRE_Complex, nfpoints, memory_location_P); + diag_FF = hypre_CTAlloc(HYPRE_Complex, num_rows_AFF, memory_location_P); hypre_CSRMatrixExtractDiagonalHost(hypre_ParCSRMatrixDiag(A_FF), diag, 0); - hypre_CSRMatrixComputeRowSumHost(A_FF_diag, NULL, NULL, diag1, 1, 1.0, "set"); - 
hypre_CSRMatrixComputeRowSumHost(A_FC_diag, NULL, NULL, diag1, 1, 1.0, "add"); - hypre_CSRMatrixComputeRowSumHost(A_FF_offd, NULL, NULL, diag1, 1, 1.0, "add"); - hypre_CSRMatrixComputeRowSumHost(A_FC_offd, NULL, NULL, diag1, 1, 1.0, "add"); + hypre_CSRMatrixComputeRowSumHost(A_FF_diag, NULL, NULL, diag_FF, 1, 1.0, "set"); + hypre_CSRMatrixComputeRowSumHost(A_FC_diag, NULL, NULL, diag_FF, 1, 1.0, "add"); + hypre_CSRMatrixComputeRowSumHost(A_FF_offd, NULL, NULL, diag_FF, 1, 1.0, "add"); + hypre_CSRMatrixComputeRowSumHost(A_FC_offd, NULL, NULL, diag_FF, 1, 1.0, "add"); - for (i = 0; i < nfpoints; i++) + for (i = 0; i < num_rows_AFF; i++) { - HYPRE_Complex dsum = diag[i] + scal * (diag1[i] - hypre_cabs(diag[i])); - diag[i] = 1. / dsum; + dsum = diag[i] + scal * (diag_FF[i] - hypre_cabs(diag[i])); + diag[i] = (hypre_cabs(dsum) > zero || hypre_cabs(dsum) < zero) ? + (one / dsum) : one; } - hypre_TFree(diag1, memory_location_P); + hypre_TFree(diag_FF, memory_location_P); } else if (method == 2) { @@ -748,104 +799,95 @@ hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, hypre_CSRMatrixExtractDiagonalHost(hypre_ParCSRMatrixDiag(A_FF), diag, 2); } - for (i = 0; i < nfpoints; i++) - { - diag[i] = -diag[i]; - } - - hypre_Vector *D_FF_inv = hypre_SeqVectorCreate(nfpoints); - hypre_VectorData(D_FF_inv) = diag; - hypre_SeqVectorInitialize_v2(D_FF_inv, memory_location_P); - hypre_CSRMatrixDiagScale(hypre_ParCSRMatrixDiag(A_FC), D_FF_inv, NULL); - hypre_CSRMatrixDiagScale(hypre_ParCSRMatrixOffd(A_FC), D_FF_inv, NULL); - hypre_SeqVectorDestroy(D_FF_inv); - W_diag = hypre_ParCSRMatrixDiag(A_FC); - W_offd = hypre_ParCSRMatrixOffd(A_FC); - nC_global = hypre_ParCSRMatrixGlobalNumCols(A_FC); + Wp = hypre_ParCSRMatrixClone_v2(A_FC, 1, memory_location_P); + D_FF_inv = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_FF), + hypre_ParCSRMatrixGlobalNumRows(A_FF), + hypre_ParCSRMatrixRowStarts(A_FC)); + hypre_ParVectorLocalData(D_FF_inv) = diag; + hypre_ParVectorInitialize_v2(D_FF_inv, memory_location_P); + hypre_ParVectorScale(-1.0, D_FF_inv); + hypre_ParCSRMatrixDiagScale(Wp, D_FF_inv, NULL); + hypre_ParVectorDestroy(D_FF_inv); } else { - W_diag = hypre_CSRMatrixCreate(nfpoints, A_nr_of_rows - nfpoints, 0); - W_offd = hypre_CSRMatrixCreate(nfpoints, 0, 0); - hypre_CSRMatrixInitialize_v2(W_diag, 0, memory_location_P); - hypre_CSRMatrixInitialize_v2(W_offd, 0, memory_location_P); - - if (my_id == (num_procs - 1)) - { - nC_global = num_cpts_global[1]; - } - hypre_MPI_Bcast(&nC_global, 1, HYPRE_MPI_BIG_INT, num_procs - 1, comm); + Wp = hypre_ParCSRMatrixCreate(hypre_ParCSRMatrixComm(A_FF), + hypre_ParCSRMatrixGlobalNumRows(A_FF), + hypre_ParCSRMatrixGlobalNumCols(A_FC), + hypre_ParCSRMatrixRowStarts(A_FC), + hypre_ParCSRMatrixColStarts(A_FC), + 0, 0, 0); + hypre_ParCSRMatrixInitialize_v2(Wp, memory_location_P); } /* Construct P from matrix product W_diag */ + W_diag = hypre_ParCSRMatrixDiag(Wp); + W_offd = hypre_ParCSRMatrixOffd(Wp); P_diag_nnz = hypre_CSRMatrixNumNonzeros(W_diag) + hypre_CSRMatrixNumCols(W_diag); - P_diag_i = hypre_CTAlloc(HYPRE_Int, A_nr_of_rows + 1, memory_location_P); + P_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_A + 1, memory_location_P); P_diag_j = hypre_CTAlloc(HYPRE_Int, P_diag_nnz, memory_location_P); - P_diag_data = hypre_CTAlloc(HYPRE_Complex, P_diag_nnz, memory_location_P); - P_offd_i = hypre_CTAlloc(HYPRE_Int, A_nr_of_rows + 1, memory_location_P); + P_diag_a = hypre_CTAlloc(HYPRE_Complex, P_diag_nnz, memory_location_P); + P_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_A + 1, memory_location_P); /* 
Extend W data to P data */ - hypre_ExtendWtoPHost( A_nr_of_rows, - CF_marker, - hypre_CSRMatrixI(W_diag), - hypre_CSRMatrixJ(W_diag), - hypre_CSRMatrixData(W_diag), - P_diag_i, - P_diag_j, - P_diag_data, - hypre_CSRMatrixI(W_offd), - P_offd_i ); - - // finalize P + hypre_ExtendWtoPHost(num_rows_A, + CF_marker, + hypre_CSRMatrixI(W_diag), + hypre_CSRMatrixJ(W_diag), + hypre_CSRMatrixData(W_diag), + P_diag_i, + P_diag_j, + P_diag_a, + hypre_CSRMatrixI(W_offd), + P_offd_i); + + /* finalize P */ P = hypre_ParCSRMatrixCreate(hypre_ParCSRMatrixComm(A), hypre_ParCSRMatrixGlobalNumRows(A), - nC_global, + hypre_ParCSRMatrixGlobalNumCols(A_FC), hypre_ParCSRMatrixColStarts(A), num_cpts_global, hypre_CSRMatrixNumCols(W_offd), P_diag_nnz, - hypre_CSRMatrixNumNonzeros(W_offd) ); - - hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixDiag(P)) = memory_location_P; - hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixOffd(P)) = memory_location_P; - - hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(P)) = P_diag_i; - hypre_CSRMatrixJ(hypre_ParCSRMatrixDiag(P)) = P_diag_j; - hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(P)) = P_diag_data; - - hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(P)) = P_offd_i; - hypre_CSRMatrixJ(hypre_ParCSRMatrixOffd(P)) = hypre_CSRMatrixJ(W_offd); - hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(P)) = hypre_CSRMatrixData(W_offd); - hypre_CSRMatrixJ(W_offd) = NULL; - hypre_CSRMatrixData(W_offd) = NULL; + hypre_CSRMatrixNumNonzeros(W_offd)); + P_diag = hypre_ParCSRMatrixDiag(P); + P_offd = hypre_ParCSRMatrixOffd(P); - if (method > 0) - { - hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(A_FC); - hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(A_FC); - hypre_ParCSRMatrixColMapOffd(A_FC) = NULL; - hypre_ParCSRMatrixColMapOffd(A_FC) = NULL; - hypre_ParCSRMatrixNumNonzeros(P) = hypre_ParCSRMatrixNumNonzeros(A_FC) + - hypre_ParCSRMatrixGlobalNumCols(A_FC); - } - else - { - hypre_ParCSRMatrixNumNonzeros(P) = nC_global; - } + hypre_CSRMatrixMemoryLocation(P_diag) = memory_location_P; + hypre_CSRMatrixMemoryLocation(P_offd) = memory_location_P; + hypre_CSRMatrixI(P_diag) = P_diag_i; + hypre_CSRMatrixJ(P_diag) = P_diag_j; + hypre_CSRMatrixData(P_diag) = P_diag_a; + hypre_CSRMatrixI(P_offd) = P_offd_i; + hypre_CSRMatrixJ(P_offd) = hypre_TAlloc(HYPRE_Int, + hypre_CSRMatrixNumNonzeros(W_offd), + memory_location_P); + hypre_CSRMatrixData(P_offd) = hypre_TAlloc(HYPRE_Complex, + hypre_CSRMatrixNumNonzeros(W_offd), + memory_location_P); + hypre_ParCSRMatrixColMapOffd(P) = hypre_TAlloc(HYPRE_BigInt, + hypre_CSRMatrixNumCols(W_offd), + memory_location_P); + + hypre_TMemcpy(hypre_CSRMatrixJ(P_offd), hypre_CSRMatrixJ(W_offd), + HYPRE_Int, hypre_CSRMatrixNumNonzeros(W_offd), + memory_location_P, memory_location_P); + + hypre_TMemcpy(hypre_CSRMatrixData(P_offd), hypre_CSRMatrixData(W_offd), + HYPRE_Complex, hypre_CSRMatrixNumNonzeros(W_offd), + memory_location_P, memory_location_P); + + hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(P), hypre_ParCSRMatrixColMapOffd(Wp), + HYPRE_BigInt, hypre_CSRMatrixNumCols(W_offd), + memory_location_P, memory_location_P); + + hypre_ParCSRMatrixSetNumNonzeros(P); hypre_ParCSRMatrixDNumNonzeros(P) = (HYPRE_Real) hypre_ParCSRMatrixNumNonzeros(P); hypre_MatvecCommPkgCreate(P); /* Set output pointer */ - *P_ptr = P; - - /* Free memory */ - hypre_ParCSRMatrixDestroy(A_FF); - hypre_ParCSRMatrixDestroy(A_FC); - if (method <= 0) - { - hypre_CSRMatrixDestroy(W_diag); - hypre_CSRMatrixDestroy(W_offd); - } + *P_ptr = P; + *Wp_ptr = Wp; return hypre_error_flag; } @@ -2248,8 +2290,8 
@@ hypre_MGRTruncateAcfCPR(hypre_ParCSRMatrix *A_CF, *--------------------------------------------------------------------------*/ HYPRE_Int -hypre_MGRBuildRFromWHost(HYPRE_Int *C_map, - HYPRE_Int *F_map, +hypre_MGRBuildRFromWHost(hypre_IntArray *C_map, + hypre_IntArray *F_map, hypre_ParCSRMatrix *W, hypre_ParCSRMatrix *R) { @@ -2259,6 +2301,8 @@ hypre_MGRBuildRFromWHost(HYPRE_Int *C_map, HYPRE_Int *W_diag_j = hypre_CSRMatrixJ(W_diag); HYPRE_Complex *W_diag_a = hypre_CSRMatrixData(W_diag); HYPRE_Int W_diag_num_rows = hypre_CSRMatrixNumRows(W_diag); + HYPRE_Int *C_map_data = hypre_IntArrayData(C_map); + HYPRE_Int *F_map_data = hypre_IntArrayData(F_map); /* Output matrix */ hypre_CSRMatrix *R_diag = hypre_ParCSRMatrixDiag(R); @@ -2276,13 +2320,13 @@ hypre_MGRBuildRFromWHost(HYPRE_Int *C_map, /* Set CF connections */ for (j = W_diag_i[i]; j < W_diag_i[i + 1]; j++) { - R_diag_j[nnz_diag] = F_map[W_diag_j[j]]; + R_diag_j[nnz_diag] = F_map_data[W_diag_j[j]]; R_diag_a[nnz_diag] = - W_diag_a[j]; nnz_diag++; } /* Set CC connection */ - R_diag_j[nnz_diag] = C_map[i]; + R_diag_j[nnz_diag] = C_map_data[i]; R_diag_a[nnz_diag] = one; nnz_diag++; @@ -2300,8 +2344,8 @@ hypre_MGRBuildRFromWHost(HYPRE_Int *C_map, *--------------------------------------------------------------------------*/ HYPRE_Int -hypre_MGRBuildRFromW(HYPRE_Int *C_map, - HYPRE_Int *F_map, +hypre_MGRBuildRFromW(hypre_IntArray *C_map, + hypre_IntArray *F_map, HYPRE_BigInt global_num_rows_R, HYPRE_BigInt global_num_cols_R, HYPRE_BigInt *row_starts_R, @@ -2358,6 +2402,8 @@ hypre_MGRBuildRFromW(HYPRE_Int *C_map, /* TODO (VPM): Implement hypre_MGRBuildRFromWDevice */ hypre_ParCSRMatrixMigrate(W, HYPRE_MEMORY_HOST); hypre_ParCSRMatrixMigrate(R, HYPRE_MEMORY_HOST); + hypre_IntArrayMigrate(C_map, HYPRE_MEMORY_HOST); + hypre_IntArrayMigrate(F_map, HYPRE_MEMORY_HOST); hypre_MGRBuildRFromWHost(C_map, F_map, W, R); hypre_ParCSRMatrixMigrate(W, HYPRE_MEMORY_DEVICE); hypre_ParCSRMatrixMigrate(R, HYPRE_MEMORY_DEVICE); @@ -2446,8 +2492,8 @@ hypre_MGRColLumpedRestrict(hypre_ParCSRMatrix *A, hypre_IntArraySeparateByValue(num_points, points, sizes, CF_marker, &CF_maps); /* Build restriction from W (R = [-W I]) */ - hypre_MGRBuildRFromW(hypre_IntArrayArrayEntryIData(CF_maps, 0), - hypre_IntArrayArrayEntryIData(CF_maps, 1), + hypre_MGRBuildRFromW(hypre_IntArrayArrayEntryI(CF_maps, 0), + hypre_IntArrayArrayEntryI(CF_maps, 1), hypre_ParCSRMatrixGlobalNumRows(A_CF), hypre_ParCSRMatrixGlobalNumCols(A), hypre_ParCSRMatrixRowStarts(A_CF), @@ -2561,8 +2607,8 @@ hypre_MGRBlockColLumpedRestrict(hypre_ParCSRMatrix *A, hypre_IntArraySeparateByValue(num_points, points, sizes, CF_marker, &CF_maps); /* Build restriction from W (R = [-W I]) */ - hypre_MGRBuildRFromW(hypre_IntArrayArrayEntryIData(CF_maps, 0), - hypre_IntArrayArrayEntryIData(CF_maps, 1), + hypre_MGRBuildRFromW(hypre_IntArrayArrayEntryI(CF_maps, 0), + hypre_IntArrayArrayEntryI(CF_maps, 1), hypre_ParCSRMatrixGlobalNumRows(A_CF), hypre_ParCSRMatrixGlobalNumCols(A), hypre_ParCSRMatrixRowStarts(A_CF), diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_rap.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_rap.c new file mode 100644 index 000000000..e75eabda7 --- /dev/null +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_rap.c @@ -0,0 +1,665 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
+ * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "_hypre_parcsr_ls.h" +#include "par_mgr.h" + +/*-------------------------------------------------------------------------- + * hypre_MGRNonGalerkinTruncate + * + * Applies filtering in-place to the input matrix "A" based on the maximum + * number of nonzero entries per row. This algorithm is tailored to the needs + * of the Non-Galerkin approach in MGR. + * + * - max_elmts == 0: no filtering + * - max_elmts == 1 and blk_dim == 1: keep diagonal entries + * - max_elmts == 1 and blk_dim > 1: keep block diagonal entries + * - max_elmts > 1 and blk_dim == 1: keep diagonal entries and + * (max_elmts - 1) largest ones per row + * - max_elmts > blk_dim and blk_dim > 1: keep block diagonal entries and + * (max_elmts - blk_dim) largest ones + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRNonGalerkinTruncate(hypre_ParCSRMatrix *A, + HYPRE_Int ordering, + HYPRE_Int blk_dim, + HYPRE_Int max_elmts) +{ + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A)); + + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Complex *A_diag_a = hypre_CSRMatrixData(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Int ncol_diag = hypre_CSRMatrixNumCols(A_diag); + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Complex *A_offd_a = hypre_CSRMatrixData(A_offd); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + + HYPRE_Int i, i1, jj; + + HYPRE_Int *A_diag_i_new, *A_diag_j_new; + HYPRE_Complex *A_diag_a_new; + HYPRE_Int num_nonzeros_diag_new = 0; + + HYPRE_Int *A_offd_i_new, *A_offd_j_new; + HYPRE_Complex *A_offd_a_new; + HYPRE_Int num_nonzeros_offd_new = 0; + HYPRE_Int num_nonzeros_max = (blk_dim + max_elmts) * nrows; + HYPRE_Int num_nonzeros_offd_max = max_elmts * nrows; + + HYPRE_Int max_num_nonzeros; + HYPRE_Int *aux_j = NULL; + HYPRE_Real *aux_data = NULL; + HYPRE_Int row_start, row_stop, cnt; + HYPRE_Int col_idx; + HYPRE_Real col_value; + + /* Return if max_elmts is zero, i.e., no truncation */ + if (max_elmts == 0) + { + return hypre_error_flag; + } + + /* Allocate new memory */ + if (ordering == 0) + { +#if defined (HYPRE_USING_GPU) + if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixMigrate(A, HYPRE_MEMORY_HOST); + } +#endif + + A_diag_i_new = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); + A_diag_j_new = hypre_CTAlloc(HYPRE_Int, num_nonzeros_max, HYPRE_MEMORY_HOST); + A_diag_a_new = hypre_CTAlloc(HYPRE_Complex, num_nonzeros_max, HYPRE_MEMORY_HOST); + A_offd_i_new = hypre_CTAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_HOST); + A_offd_j_new = hypre_CTAlloc(HYPRE_Int, num_nonzeros_offd_max, HYPRE_MEMORY_HOST); + A_offd_a_new = hypre_CTAlloc(HYPRE_Complex, num_nonzeros_offd_max, HYPRE_MEMORY_HOST); + + if (max_elmts > 0) + { + max_num_nonzeros = 0; + for (i = 0; i < nrows; i++) + { + max_num_nonzeros = hypre_max(max_num_nonzeros, + (A_diag_i[i + 1] - A_diag_i[i]) + + (A_offd_i[i + 1] - A_offd_i[i])); + } + aux_j = hypre_CTAlloc(HYPRE_Int, max_num_nonzeros, HYPRE_MEMORY_HOST); + aux_data = hypre_CTAlloc(HYPRE_Real, max_num_nonzeros, HYPRE_MEMORY_HOST); + } + + for (i = 0; i < nrows; i++) + { + row_start = i - (i % blk_dim); + row_stop = 
row_start + blk_dim - 1; + + /* Copy (block) diagonal data to new arrays */ + for (jj = A_diag_i[i]; jj < A_diag_i[i + 1]; jj++) + { + i1 = A_diag_j[jj]; + if (i1 >= row_start && i1 <= row_stop) + { + A_diag_j_new[num_nonzeros_diag_new] = i1; + A_diag_a_new[num_nonzeros_diag_new] = A_diag_a[jj]; + ++num_nonzeros_diag_new; + } + } + + /* Add other connections? */ + if (max_elmts > 0) + { + cnt = 0; + for (jj = A_offd_i[i]; jj < A_offd_i[i + 1]; jj++) + { + aux_j[cnt] = A_offd_j[jj] + ncol_diag; + aux_data[cnt] = A_offd_a[jj]; + cnt++; + } + + for (jj = A_diag_i[i]; jj < A_diag_i[i + 1]; jj++) + { + aux_j[cnt] = A_diag_j[jj]; + aux_data[cnt] = A_diag_a[jj]; + cnt++; + } + hypre_qsort2_abs(aux_j, aux_data, 0, cnt - 1); + + for (jj = 0; jj < hypre_min(max_elmts, cnt); jj++) + { + col_idx = aux_j[jj]; + col_value = aux_data[jj]; + if (col_idx < ncol_diag && (col_idx < row_start || col_idx > row_stop)) + { + A_diag_j_new[num_nonzeros_diag_new] = col_idx; + A_diag_a_new[num_nonzeros_diag_new] = col_value; + ++num_nonzeros_diag_new; + } + else if (col_idx >= ncol_diag) + { + A_offd_j_new[num_nonzeros_offd_new] = col_idx - ncol_diag; + A_offd_a_new[num_nonzeros_offd_new] = col_value; + ++num_nonzeros_offd_new; + } + } + } + + A_diag_i_new[i + 1] = num_nonzeros_diag_new; + A_offd_i_new[i + 1] = num_nonzeros_offd_new; + } + + hypre_TFree(aux_j, HYPRE_MEMORY_HOST); + hypre_TFree(aux_data, HYPRE_MEMORY_HOST); + + /* Update input matrix */ + hypre_TFree(A_diag_i, HYPRE_MEMORY_HOST); + hypre_TFree(A_diag_j, HYPRE_MEMORY_HOST); + hypre_TFree(A_diag_a, HYPRE_MEMORY_HOST); + hypre_CSRMatrixI(A_diag) = A_diag_i_new; + hypre_CSRMatrixJ(A_diag) = A_diag_j_new; + hypre_CSRMatrixData(A_diag) = A_diag_a_new; + hypre_CSRMatrixNumNonzeros(A_diag) = num_nonzeros_diag_new; + + hypre_TFree(A_offd_i, HYPRE_MEMORY_HOST); + hypre_TFree(A_offd_j, HYPRE_MEMORY_HOST); + hypre_TFree(A_offd_a, HYPRE_MEMORY_HOST); + hypre_CSRMatrixI(A_offd) = A_offd_i_new; + hypre_CSRMatrixJ(A_offd) = A_offd_j_new; + hypre_CSRMatrixData(A_offd) = A_offd_a_new; + hypre_CSRMatrixNumNonzeros(A_offd) = num_nonzeros_offd_new; + +#if defined (HYPRE_USING_GPU) + if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixMigrate(A, memory_location); + } +#endif + } + else + { + /* Keep only the diagonal portion of A + TODO (VPM): consider other combinations of max_elmts and blk_dim */ + hypre_CSRMatrixNumCols(A_offd) = 0; + hypre_CSRMatrixNumNonzeros(A_offd) = 0; + hypre_CSRMatrixNumRownnz(A_offd) = 0; + hypre_TFree(hypre_CSRMatrixRownnz(A_offd), memory_location); + hypre_TFree(hypre_CSRMatrixI(A_offd), memory_location); + hypre_TFree(hypre_CSRMatrixJ(A_offd), memory_location); + hypre_TFree(hypre_CSRMatrixData(A_offd), memory_location); + hypre_TFree(hypre_ParCSRMatrixColMapOffd(A), HYPRE_MEMORY_HOST); + hypre_TFree(hypre_ParCSRMatrixDeviceColMapOffd(A), memory_location); + hypre_CSRMatrixI(A_offd) = hypre_CTAlloc(HYPRE_Int, nrows + 1, memory_location); + + hypre_CSRMatrixTruncateDiag(A_diag); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRBuildNonGalerkinCoarseOperatorHost + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRBuildNonGalerkinCoarseOperatorHost(hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, + hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix *A_CC, + hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, + HYPRE_Int fine_blk_dim, + HYPRE_Int coarse_blk_dim, + HYPRE_Int 
ordering, + HYPRE_Int method, + HYPRE_Int max_elmts, + hypre_ParCSRMatrix **A_H_ptr) +{ + hypre_ParCSRMatrix *A_H = NULL; + hypre_ParCSRMatrix *A_Hc = NULL; + hypre_ParCSRMatrix *Wp_tmp = NULL; + hypre_ParCSRMatrix *Wr_tmp = NULL; + hypre_ParCSRMatrix *A_CF_truncated = NULL; + hypre_ParCSRMatrix *A_FF_inv = NULL; + hypre_ParCSRMatrix *minus_Wp = NULL; + + HYPRE_Int blk_inv_size; + HYPRE_Real neg_one = -1.0; + HYPRE_Real one = 1.0; + HYPRE_Real beta = neg_one; + + if (Wp != NULL && max_elmts > 0) + { + /* A_Hc = diag(A_CF * Wp) */ + hypre_ParCSRMatMatDiag(A_CF, Wp, &A_Hc); + + /* Coarse grid / Schur complement + Note that beta is one since A_Hc has positive sign */ + hypre_ParCSRMatrixAdd(one, A_CC, one, A_Hc, &A_H); + + /* Free memory */ + hypre_ParCSRMatrixDestroy(A_Hc); + + /* Set output pointer */ + *A_H_ptr = A_H; + + return hypre_error_flag; + } + + if (method == 1) + { + if (Wp != NULL) + { + A_Hc = hypre_ParCSRMatMat(A_CF, Wp); + beta = one; + } + else + { + // Build block diagonal inverse for A_FF + hypre_ParCSRMatrixBlockDiagMatrix(A_FF, fine_blk_dim, -1, NULL, 1, &A_FF_inv); + + // compute Wp = A_FF_inv * A_FC + // NOTE: Use hypre_ParMatmul here instead of hypre_ParCSRMatMat to avoid padding + // zero entries at diagonals for the latter routine. Use MatMat once this padding + // issue is resolved since it is more efficient. + // hypre_ParCSRMatrix *Wp_tmp = hypre_ParCSRMatMat(A_FF_inv, A_FC); + Wp_tmp = hypre_ParMatmul(A_FF_inv, A_FC); + + /* Compute correction A_Hc = A_CF * (A_FF_inv * A_FC); */ + A_Hc = hypre_ParCSRMatMat(A_CF, Wp_tmp); + hypre_ParCSRMatrixDestroy(Wp_tmp); + hypre_ParCSRMatrixDestroy(A_FF_inv); + } + } + else if (method == 2 || method == 3) + { + /* Extract the diagonal of A_CF */ + hypre_MGRTruncateAcfCPR(A_CF, &A_CF_truncated); + if (Wp != NULL) + { + A_Hc = hypre_ParCSRMatMat(A_CF_truncated, Wp); + } + else + { + blk_inv_size = method == 2 ? 
1 : fine_blk_dim; + hypre_ParCSRMatrixBlockDiagMatrix(A_FF, blk_inv_size, -1, NULL, 1, &A_FF_inv); + + /* TODO (VPM): We shouldn't need to compute Wr_tmp since we are passing in Wr already */ + HYPRE_UNUSED_VAR(Wr); + Wr_tmp = hypre_ParCSRMatMat(A_CF_truncated, A_FF_inv); + A_Hc = hypre_ParCSRMatMat(Wr_tmp, A_FC); + hypre_ParCSRMatrixDestroy(Wr_tmp); + hypre_ParCSRMatrixDestroy(A_FF_inv); + } + hypre_ParCSRMatrixDestroy(A_CF_truncated); + } + else if (method == 4) + { + /* Approximate inverse for ideal interploation */ + hypre_MGRApproximateInverse(A_FF, &A_FF_inv); + + minus_Wp = hypre_ParCSRMatMat(A_FF_inv, A_FC); + A_Hc = hypre_ParCSRMatMat(A_CF, minus_Wp); + + hypre_ParCSRMatrixDestroy(minus_Wp); + } + + /* Drop small entries in the correction term A_Hc */ + hypre_MGRNonGalerkinTruncate(A_Hc, ordering, coarse_blk_dim, max_elmts); + + /* Coarse grid / Schur complement */ + hypre_ParCSRMatrixAdd(one, A_CC, beta, A_Hc, &A_H); + + /* Free memory */ + hypre_ParCSRMatrixDestroy(A_Hc); + + /* Set output pointer */ + *A_H_ptr = A_H; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRBuildNonGalerkinCoarseOperatorDevice + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRBuildNonGalerkinCoarseOperatorDevice(hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, + hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix *A_CC, + hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, + HYPRE_Int fine_blk_dim, + HYPRE_Int coarse_blk_dim, + HYPRE_Int ordering, + HYPRE_Int method, + HYPRE_Int max_elmts, + hypre_ParCSRMatrix **A_H_ptr) +{ + /* Local variables */ + hypre_ParCSRMatrix *A_H; + hypre_ParCSRMatrix *A_Hc; + hypre_ParCSRMatrix *A_CF_trunc; + hypre_ParCSRMatrix *Wp_tmp = Wp; + HYPRE_Complex alpha = -1.0; + + hypre_GpuProfilingPushRange("MGRComputeNonGalerkinCG"); + + /* Truncate A_CF according to the method */ + if (method == 2 || method == 3) + { + hypre_MGRTruncateAcfCPRDevice(A_CF, &A_CF_trunc); + } + else + { + A_CF_trunc = A_CF; + } + + /* Compute Wp/Wr if not passed in */ + if (!Wp && (method == 1 || method == 2)) + { + hypre_ParVector *D_FF_inv; + HYPRE_Complex *data; + + /* Create vector to store A_FF's diagonal inverse */ + D_FF_inv = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_FF), + hypre_ParCSRMatrixGlobalNumRows(A_FF), + hypre_ParCSRMatrixRowStarts(A_FF)); + hypre_ParVectorInitialize_v2(D_FF_inv, HYPRE_MEMORY_DEVICE); + data = hypre_ParVectorLocalData(D_FF_inv); + + /* Compute the inverse of A_FF and compute its inverse */ + hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A_FF), data, 2); + hypre_ParVectorScale(-1.0, D_FF_inv); + + /* Compute D_FF_inv*A_FC */ + Wp_tmp = hypre_ParCSRMatrixClone(A_FC, 1); + hypre_ParCSRMatrixDiagScale(Wp_tmp, D_FF_inv, NULL); + + /* Free memory */ + hypre_ParVectorDestroy(D_FF_inv); + } + else if (!Wp && (method == 3)) + { + hypre_ParCSRMatrix *B_FF_inv; + + /* Compute the block diagonal inverse of A_FF */ + hypre_ParCSRMatrixBlockDiagMatrix(A_FF, fine_blk_dim, -1, NULL, 1, &B_FF_inv); + + /* Compute Wp = A_FF_inv * A_FC */ + Wp_tmp = hypre_ParCSRMatMat(B_FF_inv, A_FC); + hypre_ParCSRMatrixScale(Wp_tmp, -1.0); + + /* Free memory */ + hypre_ParCSRMatrixDestroy(B_FF_inv); + } + + /* Compute A_Hc (the correction for A_H) */ + if (Wp_tmp) + { + if (max_elmts > 0) + { + /* A_Hc = diag(A_CF * Wp) */ + hypre_ParCSRMatMatDiag(A_CF_trunc, Wp_tmp, &A_Hc); + + /* Coarse grid / Schur complement */ + hypre_ParCSRMatrixAdd(1.0, A_CC, 1.0, A_Hc, 
&A_H); + + /* Free memory */ + hypre_ParCSRMatrixDestroy(A_Hc); + if (method == 2 || method == 3) + { + hypre_ParCSRMatrixDestroy(A_CF_trunc); + } + if (Wp_tmp != Wp) + { + hypre_ParCSRMatrixDestroy(Wp_tmp); + } + + /* Set output pointer */ + *A_H_ptr = A_H; + + hypre_GpuProfilingPopRange(); + + return hypre_error_flag; + } + + A_Hc = hypre_ParCSRMatMat(A_CF_trunc, Wp_tmp); + } + else if (Wr) + { + A_Hc = hypre_ParCSRMatMat(Wr, A_FC); + } + else + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Wp/Wr matrices were not provided!"); + hypre_GpuProfilingPopRange(); + + return hypre_error_flag; + } + + /* Filter A_Hc */ + hypre_MGRNonGalerkinTruncate(A_Hc, ordering, coarse_blk_dim, max_elmts); + + /* Coarse grid (Schur complement) computation */ + hypre_ParCSRMatrixAdd(1.0, A_CC, alpha, A_Hc, &A_H); + + /* Free memory */ + hypre_ParCSRMatrixDestroy(A_Hc); + if (Wp_tmp != Wp) + { + hypre_ParCSRMatrixDestroy(Wp_tmp); + } + if (method == 2 || method == 3) + { + hypre_ParCSRMatrixDestroy(A_CF_trunc); + } + + /* Set output pointer to coarse grid matrix */ + *A_H_ptr = A_H; + + hypre_GpuProfilingPopRange(); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRBuildNonGalerkinCoarseOperator + * + * Computes the coarse level operator A_H = RAP via a Non-Galerkin approach. + * + * Available methods: + * 1: inv(A_FF) approximated by its (block) diagonal inverse + * 2: CPR-like approx. with inv(A_FF) approx. by its diagonal inverse + * 3: CPR-like approx. with inv(A_FF) approx. by its block diagonal inverse + * 4: inv(A_FF) approximated by sparse approximate inverse + * + * Methods 1-4 assume that restriction is the injection operator. + * + * TODO (VPM): inv(A_FF)*A_FC might have been computed before. Reuse it! 
+ *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRBuildNonGalerkinCoarseOperator(hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, + hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix *A_CC, + hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, + HYPRE_Int fine_blk_dim, + HYPRE_Int coarse_blk_dim, + HYPRE_Int ordering, + HYPRE_Int method, + HYPRE_Int max_elmts, + hypre_ParCSRMatrix **A_H_ptr) +{ + hypre_ParCSRMatrix *matrices[6] = {A_FF, A_FC, A_CF, A_CC, Wp, Wr}; + HYPRE_Int i; + + /* Check that the memory locations of the input matrices match */ + for (i = 0; i < 5; i++) + { + if (matrices[i] && matrices[i + 1] && + hypre_ParCSRMatrixMemoryLocation(matrices[i]) != + hypre_ParCSRMatrixMemoryLocation(matrices[i + 1])) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Memory locations do not match!"); + return hypre_error_flag; + } + } + +#if defined (HYPRE_USING_GPU) + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A_FF); + + if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_DEVICE) + { + hypre_MGRBuildNonGalerkinCoarseOperatorDevice(A_FF, A_FC, A_CF, A_CC, Wp, Wr, + fine_blk_dim, coarse_blk_dim, + ordering, method, max_elmts, A_H_ptr); + } + else +#endif + { + hypre_MGRBuildNonGalerkinCoarseOperatorHost(A_FF, A_FC, A_CF, A_CC, Wp, Wr, + fine_blk_dim, coarse_blk_dim, + ordering, method, max_elmts, A_H_ptr); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MGRBuildCoarseOperator + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MGRBuildCoarseOperator(void *mgr_vdata, + hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, + hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix **A_CC_ptr, + hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, + HYPRE_Int level) +{ + hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; + hypre_ParCSRMatrix *A = (mgr_data -> A_array)[level]; + hypre_ParCSRMatrix *P = (mgr_data -> P_array)[level]; + hypre_ParCSRMatrix *R = (mgr_data -> R_array)[level]; + hypre_ParCSRMatrix *RT = (mgr_data -> RT_array)[level]; + hypre_ParCSRMatrix *A_CC = *A_CC_ptr; + + HYPRE_Int *blk_dims = (mgr_data -> block_num_coarse_indexes); + HYPRE_Int block_size = (mgr_data -> block_size); + HYPRE_Int method = (mgr_data -> coarse_grid_method)[level]; + HYPRE_Int num_coarse_levels = (mgr_data -> max_num_coarse_levels); + HYPRE_Int ordering = (mgr_data -> set_c_points_method); + HYPRE_Int max_elmts = (mgr_data -> nonglk_max_elmts)[level]; + HYPRE_Real threshold = (mgr_data -> truncate_coarse_grid_threshold); + + hypre_ParCSRMatrix *AP, *RAP, *RAP_c; + HYPRE_Int fine_blk_dim = (level) ? 
blk_dims[level - 1] - blk_dims[level] : + block_size - blk_dims[level]; + HYPRE_Int coarse_blk_dim = blk_dims[level]; + HYPRE_Int rebuild_commpkg = 0; + + HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("RAP"); + + if (!method) + { + /* Galerkin path */ + if (Wr && !Wp) + { + /* Prolongation is the injection operator (Wp == NULL) and + Restriction is not the injection operator (Wr != NULL) */ + RAP_c = hypre_ParCSRMatMat(Wr, A_FC); + hypre_ParCSRMatrixAdd(1.0, A_CC, -1.0, RAP_c, &RAP); + hypre_ParCSRMatrixDestroy(RAP_c); + } + else if (RT) + { + RAP = hypre_ParCSRMatrixRAPKT(RT, A, P, 1); + } + else if (R) + { + AP = hypre_ParCSRMatMat(A, P); + RAP = hypre_ParCSRMatMat(R, AP); + hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(RAP)); + hypre_ParCSRMatrixDestroy(AP); + } + else + { + hypre_GpuProfilingPopRange(); + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Expected either R or RT!"); + return hypre_error_flag; + } + } + else if (method == 5) + { + /* Approximate the coarse level matrix as A_CC */ + RAP = *A_CC_ptr; + *A_CC_ptr = NULL; + } + else + { + /* Non-Galerkin path */ + hypre_MGRBuildNonGalerkinCoarseOperator(A_FF, A_FC, A_CF, A_CC, Wp, Wr, + fine_blk_dim, coarse_blk_dim, + ordering, method, max_elmts, &RAP); + } + + /* Truncate coarse level matrix based on input threshold */ + if (threshold > 0.0) + { +#if defined (HYPRE_USING_GPU) + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(RAP); + + if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixDropSmallEntriesDevice(RAP, threshold, -1); + rebuild_commpkg = 1; + } + else +#endif + { + hypre_ParCSRMatrixTruncate(RAP, threshold, 0, 0, 0); + } + } + + /* Compute/rebuild communication package */ + if (rebuild_commpkg) + { + if (hypre_ParCSRMatrixCommPkg(RAP)) + { + hypre_MatvecCommPkgDestroy(hypre_ParCSRMatrixCommPkg(RAP)); + } + hypre_MatvecCommPkgCreate(RAP); + } + if (!hypre_ParCSRMatrixCommPkg(RAP)) + { + hypre_MatvecCommPkgCreate(RAP); + } + + /* Set coarse grid matrix */ + (mgr_data -> A_array)[level + 1] = RAP; + if ((level + 1) == num_coarse_levels) + { + (mgr_data -> RAP) = RAP; + } + + hypre_GpuProfilingPopRange(); + HYPRE_ANNOTATE_FUNC_END; + + return hypre_error_flag; +} diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_setup.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_setup.c index 619c53a25..ef484ea0a 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr_setup.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_setup.c @@ -34,13 +34,9 @@ hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix *R = NULL; hypre_ParCSRMatrix *P = NULL; hypre_ParCSRMatrix *S = NULL; - hypre_ParCSRMatrix *ST = NULL; - hypre_ParCSRMatrix *AT = NULL; hypre_ParCSRMatrix *Wp = NULL; hypre_ParCSRMatrix *Wr = NULL; - hypre_ParCSRMatrix *AP = NULL; - HYPRE_Int *dof_func_buff_data = NULL; HYPRE_BigInt coarse_pnts_global[2]; // TODO: Change to row_starts_cpts HYPRE_BigInt row_starts_fpts[2]; hypre_Vector **l1_norms = NULL; @@ -81,7 +77,6 @@ hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix **P_array = (mgr_data -> P_array); hypre_ParCSRMatrix **R_array = (mgr_data -> RT_array); hypre_ParCSRMatrix **RT_array = (mgr_data -> RT_array); - hypre_ParCSRMatrix *RAP_ptr = NULL; hypre_ParCSRMatrix *A_FF = NULL; hypre_ParCSRMatrix *A_FC = NULL; @@ -134,12 +129,12 @@ hypre_MGRSetup( void *mgr_vdata, HYPRE_Int *Frelax_type = (mgr_data -> Frelax_type); - HYPRE_Int *mgr_coarse_grid_method = (mgr_data -> mgr_coarse_grid_method); + HYPRE_Int *coarse_grid_method = (mgr_data -> coarse_grid_method); - HYPRE_Int use_air = 0; 
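/*
 * Editor's illustrative aside (not part of the imported patch): a small worked
 * example of the per-level block dimensions computed in
 * hypre_MGRBuildCoarseOperator() above. The numbers are assumptions chosen only
 * to make the arithmetic concrete.
 *
 *   block_size               = 3        (unknowns per grid point)
 *   block_num_coarse_indexes = {2, 1}   (coarse unknowns kept per point at each level)
 *
 *   level 0: fine_blk_dim   = block_size  - blk_dims[0] = 3 - 2 = 1
 *            coarse_blk_dim = blk_dims[0]               = 2
 *   level 1: fine_blk_dim   = blk_dims[0] - blk_dims[1] = 2 - 1 = 1
 *            coarse_blk_dim = blk_dims[1]               = 1
 *
 * These are the dimensions the non-Galerkin path hands to
 * hypre_MGRBuildNonGalerkinCoarseOperator() for the (block-)diagonal
 * approximation of inv(A_FF) and for the block-aware truncation in
 * hypre_MGRNonGalerkinTruncate().
 */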
HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); +#if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1(memory_location); - HYPRE_Real truncate_cg_threshold = (mgr_data -> truncate_coarse_grid_threshold); +#endif char region_name[1024]; char msg[2048]; @@ -147,6 +142,7 @@ hypre_MGRSetup( void *mgr_vdata, HYPRE_ANNOTATE_FUNC_BEGIN; hypre_GpuProfilingPushRange("MGRSetup"); hypre_GpuProfilingPushRange("MGRSetup-Init"); + hypre_MemoryPrintUsage(comm, hypre_HandleLogLevel(hypre_handle()), "MGR setup begin", 0); block_size = (mgr_data -> block_size); block_jacobi_bsize = (mgr_data -> block_jacobi_bsize); @@ -225,6 +221,7 @@ hypre_MGRSetup( void *mgr_vdata, HYPRE_ANNOTATE_FUNC_END; hypre_GpuProfilingPopRange(); + hypre_GpuProfilingPopRange(); return hypre_error_flag; } @@ -617,7 +614,7 @@ hypre_MGRSetup( void *mgr_vdata, /* Allocate memory for level structure */ if (A_array == NULL) { - A_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels, HYPRE_MEMORY_HOST); + A_array = hypre_CTAlloc(hypre_ParCSRMatrix*, max_num_coarse_levels + 1, HYPRE_MEMORY_HOST); } if (B_array == NULL) { @@ -715,29 +712,24 @@ hypre_MGRSetup( void *mgr_vdata, } #endif - /* Set default for using non-Galerkin coarse grid */ - if (mgr_coarse_grid_method == NULL) + /* Set default for using Non-Galerkin coarse grid */ + if (coarse_grid_method == NULL) + { + coarse_grid_method = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + (mgr_data -> coarse_grid_method) = coarse_grid_method; + } + + /* Set default for Non-Galerkin correction truncation */ + if ((mgr_data -> nonglk_max_elmts) == NULL) { - mgr_coarse_grid_method = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); + (mgr_data -> nonglk_max_elmts) = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); for (i = 0; i < max_num_coarse_levels; i++) { - mgr_coarse_grid_method[i] = 0; + (mgr_data -> nonglk_max_elmts)[i] = 1; } - (mgr_data -> mgr_coarse_grid_method) = mgr_coarse_grid_method; } - /* - if (Frelax_num_functions== NULL) - { - Frelax_num_functions = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); - for (i = 0; i < max_num_coarse_levels; i++) - { - Frelax_num_functions[i] = 1; - } - (mgr_data -> Frelax_num_functions) = Frelax_num_functions; - } - */ - /* Set default for interp_type and restrict_type if not set already */ + /* Set default options for the interpolation type at each level if not already set */ if (interp_type == NULL) { interp_type = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); @@ -747,6 +739,8 @@ hypre_MGRSetup( void *mgr_vdata, } (mgr_data -> interp_type) = interp_type; } + + /* Set default options for restriction type at each level if not already set */ if (restrict_type == NULL) { restrict_type = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); @@ -756,6 +750,8 @@ hypre_MGRSetup( void *mgr_vdata, } (mgr_data -> restrict_type) = restrict_type; } + + /* Set default number of sweeps at each level if not already set */ if (num_relax_sweeps == NULL) { num_relax_sweeps = hypre_CTAlloc(HYPRE_Int, max_num_coarse_levels, HYPRE_MEMORY_HOST); @@ -990,12 +986,12 @@ hypre_MGRSetup( void *mgr_vdata, /* begin coarsening loop */ num_coarsening_levs = max_num_coarse_levels; - /* initialize level data matrix here */ - RAP_ptr = A; - /* Close MGRSetup-Init region */ hypre_GpuProfilingPopRange(); + /* Initialize first entry in A_array to the input matrix */ + A_array[0] = A; + /* loop over levels of 
coarsening */ for (lev = 0; lev < num_coarsening_levs; lev++) { @@ -1009,8 +1005,7 @@ hypre_MGRSetup( void *mgr_vdata, /* Set level's block size */ level_blk_size = (lev == 0) ? block_size : block_num_coarse_indexes[lev - 1]; - /* Initialize A_array */ - A_array[lev] = RAP_ptr; + /* Get number of local unknowns */ nloc = hypre_ParCSRMatrixNumRows(A_array[lev]); /* Reset pointers */ @@ -1020,11 +1015,21 @@ hypre_MGRSetup( void *mgr_vdata, hypre_sprintf(region_name, "Global-Relax"); hypre_GpuProfilingPushRange(region_name); HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); + + /* TODO (VPM): Change option types for block-Jacobi and block-GS to 30 and 31 and + make them accessible through hypre_BoomerAMGRelax? */ if (level_smooth_iters[lev] > 0) { - /* TODO (VPM): Change option types for block-Jacobi and block-GS to 30 and 31 and - make them accessible through hypre_BoomerAMGRelax? */ - if (level_smooth_type[lev] == 0 || level_smooth_type[lev] == 1) + if (level_smoother[lev]) + { + hypre_Solver *smoother_base = (hypre_Solver*) level_smoother[lev]; + + /* Call setup function */ + hypre_SolverSetup(smoother_base)((HYPRE_Solver) level_smoother[lev], + (HYPRE_Matrix) A_array[lev], + NULL, NULL); + } + else if (level_smooth_type[lev] == 0 || level_smooth_type[lev] == 1) { /* TODO (VPM): move this to hypre_MGRBlockRelaxSetup and change its declaration */ #if defined (HYPRE_USING_GPU) @@ -1128,32 +1133,24 @@ hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrixGenerateFFFC(A_array[lev], hypre_IntArrayData(FC_marker), row_starts_fpts, NULL, &A_CF, &A_CC); - /* Build MGR interpolation */ - hypre_sprintf(region_name, "Interp"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); + /* Build interpolation operator */ + hypre_MGRBuildInterp(A_array[lev], A_FF, A_FC, S, CF_marker_array[lev], + coarse_pnts_global, trunc_factor, P_max_elmts[lev], + block_jacobi_bsize, interp_type[lev], num_interp_sweeps, + &Wp, &P); + P_array[lev] = P; - if (interp_type[lev] == 12) + /* Build Restriction operator */ + if (block_jacobi_bsize == 1 && restrict_type[lev] == 12) { - if (mgr_coarse_grid_method[lev] != 0) - { - hypre_MGRBuildBlockJacobiWp(A_FF, A_FC, block_jacobi_bsize, &Wp); - } - hypre_MGRBuildInterp(A_array[lev], A_FF, A_FC, CF_marker, Wp, - coarse_pnts_global, trunc_factor, P_max_elmts[lev], - block_jacobi_bsize, &P, interp_type[lev], - num_interp_sweeps); - } - else - { - hypre_MGRBuildInterp(A_array[lev], A_FF, A_FC, CF_marker, S, - coarse_pnts_global, trunc_factor, P_max_elmts[lev], - block_jacobi_bsize, &P, interp_type[lev], - num_interp_sweeps); + restrict_type[lev] = 2; } - - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); + hypre_MGRBuildRestrict(A_array[lev], A_FF, A_FC, A_CF, CF_marker_array[lev], + coarse_pnts_global, trunc_factor, P_max_elmts[lev], + strong_threshold, max_row_sum, block_jacobi_bsize, + restrict_type[lev], &Wr, &R, &RT); + R_array[lev] = R; + RT_array[lev] = RT; /* Use block Jacobi F-relaxation with block Jacobi interpolation */ hypre_sprintf(region_name, "F-Relax"); @@ -1192,7 +1189,10 @@ hypre_MGRSetup( void *mgr_vdata, -1, CF_marker, inv_size, 1, diag_inv); frelax_diaginv[lev] = diag_inv; blk_size[lev] = block_jacobi_bsize; - hypre_MGRBuildAff(A_array[lev], CF_marker, debug_flag, &A_FF); + if (!A_FF) + { + hypre_MGRBuildAff(A_array[lev], CF_marker, debug_flag, &A_FF); + } } /* Set A_ff pointer */ @@ -1213,196 +1213,18 @@ hypre_MGRSetup( void *mgr_vdata, hypre_GpuProfilingPopRange(); HYPRE_ANNOTATE_REGION_END("%s", 
region_name); - P_array[lev] = P; - - if (restrict_type[lev] == 4) - { - use_air = 1; - } - else if (restrict_type[lev] == 5) - { - use_air = 2; - } - else - { - use_air = 0; - } - - if (use_air) - { - HYPRE_Real filter_thresholdR = 0.0; - HYPRE_Int gmres_switch = 64; - HYPRE_Int is_triangular = 0; - - hypre_sprintf(region_name, "Restrict"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); - - /* for AIR, need absolute value SOC */ - hypre_BoomerAMGCreateSabs(A_array[lev], strong_threshold, 1.0, 1, NULL, &ST); - - /* !!! Ensure that CF_marker contains -1 or 1 !!! */ - /* - for (i = 0; i < hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array[level])); i++) - { - CF_marker[i] = CF_marker[i] > 0 ? 1 : -1; - } - */ - if (use_air == 1) /* distance-1 AIR */ - { - hypre_BoomerAMGBuildRestrAIR(A_array[lev], CF_marker, - ST, coarse_pnts_global, 1, - dof_func_buff_data, filter_thresholdR, - debug_flag, &R, is_triangular, gmres_switch); - } - else /* distance-1.5 AIR - distance 2 locally and distance 1 across procs. */ - { - hypre_BoomerAMGBuildRestrDist2AIR(A_array[lev], CF_marker, - ST, coarse_pnts_global, 1, - dof_func_buff_data, filter_thresholdR, - debug_flag, &R, 1, is_triangular, gmres_switch); - } - R_array[lev] = R; - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - - /* Use two matrix products to generate A_H */ - hypre_ParCSRMatrix *AP = NULL; - - hypre_sprintf(region_name, "RAP"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); - AP = hypre_ParMatmul(A_array[lev], P_array[lev]); - RAP_ptr = hypre_ParMatmul(R, AP); - if (num_procs > 1) - { - hypre_MatvecCommPkgCreate(RAP_ptr); - } - - /* Delete AP */ - hypre_ParCSRMatrixDestroy(AP); - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - } - else - { - if (mgr_coarse_grid_method[lev] != 0) - { - HYPRE_Int block_num_f_points = level_blk_size - block_num_coarse_indexes[lev]; - - hypre_sprintf(region_name, "Restrict"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); - if (block_num_f_points == 1 && restrict_type[lev] == 12) - { - restrict_type[lev] = 2; - } - - hypre_MGRBuildRestrict(A_array[lev], A_FF, A_FC, A_CF, CF_marker_array[lev], - coarse_pnts_global, trunc_factor, P_max_elmts[lev], - strong_threshold, max_row_sum, block_num_f_points, - restrict_type[lev], &Wr, &R, &RT); - R_array[lev] = R; - RT_array[lev] = RT; - - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - - hypre_sprintf(region_name, "RAP"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); - -#if defined (HYPRE_USING_GPU) - if (exec == HYPRE_EXEC_DEVICE) - { - hypre_MGRComputeNonGalerkinCGDevice(A_FF, A_FC, A_CF, A_CC, - Wp, Wr, block_num_f_points, - mgr_coarse_grid_method[lev], - truncate_cg_threshold, - &RAP_ptr); - } - else -#endif - { - hypre_MGRComputeNonGalerkinCoarseGrid(A_FF, A_FC, A_CF, A_CC, Wp, Wr, - block_num_f_points, set_c_points_method, - mgr_coarse_grid_method[lev], - P_max_elmts[lev], &RAP_ptr); - } - - if (interp_type[lev] == 12) - { - hypre_ParCSRMatrixDeviceColMapOffd(Wp) = NULL; - hypre_ParCSRMatrixColMapOffd(Wp) = NULL; - hypre_ParCSRMatrixDestroy(Wp); - Wp = NULL; - } - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - } - else - { - hypre_sprintf(region_name, "Restrict"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", 
region_name); - if (block_jacobi_bsize == 1 && restrict_type[lev] == 12) - { - restrict_type[lev] = 2; - } - hypre_MGRBuildRestrict(A_array[lev], A_FF, A_FC, A_CF, CF_marker_array[lev], - coarse_pnts_global, trunc_factor, P_max_elmts[lev], - strong_threshold, max_row_sum, block_jacobi_bsize, - restrict_type[lev], &Wr, &R, &RT); - R_array[lev] = R; - RT_array[lev] = RT; - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - - hypre_sprintf(region_name, "RAP"); - hypre_GpuProfilingPushRange(region_name); - HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); - if (RT) - { - RAP_ptr = hypre_ParCSRMatrixRAPKT(RT, A_array[lev], P, 1); - } - else if (R) - { - AP = hypre_ParCSRMatMat(A_array[lev], P); - RAP_ptr = hypre_ParCSRMatMat(R, AP); - hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag(RAP_ptr)); - hypre_ParCSRMatrixDestroy(AP); - } - else - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Expected either R or RT!"); - return hypre_error_flag; - } - hypre_GpuProfilingPopRange(); - HYPRE_ANNOTATE_REGION_END("%s", region_name); - } - } - - /* TODO (VPM): truncation is also performed in hypre_MGRComputeNonGalerkinCoarseGrid */ - if (truncate_cg_threshold > 0.0) - { - /* Truncate the coarse grid */ - if (exec == HYPRE_EXEC_HOST) - { - hypre_ParCSRMatrixTruncate(RAP_ptr, truncate_cg_threshold, 0, 0, 0); - } -#if defined (HYPRE_USING_GPU) - else - { - hypre_ParCSRMatrixDropSmallEntriesDevice(RAP_ptr, truncate_cg_threshold, -1); - } -#endif - } + /* Compute coarse level matrix */ + hypre_MGRBuildCoarseOperator(mgr_vdata, A_FF, A_FC, A_CF, &A_CC, Wp, Wr, lev); /* Destroy temporary variables */ hypre_ParCSRMatrixDestroy(A_FC), A_FC = NULL; hypre_ParCSRMatrixDestroy(A_CF), A_CF = NULL; - hypre_ParCSRMatrixDestroy(A_CC), A_CF = NULL; + hypre_ParCSRMatrixDestroy(A_CC), A_CC = NULL; hypre_ParCSRMatrixDestroy(Wr); Wr = NULL; + if (Wp) + { + hypre_ParCSRMatrixDestroy(Wp); Wp = NULL; + } /* User-prescribed F-solver */ if (Frelax_type[lev] == 2 || @@ -1427,6 +1249,7 @@ hypre_MGRSetup( void *mgr_vdata, "F-relaxation solver has not been setup\n"); HYPRE_ANNOTATE_FUNC_END; hypre_GpuProfilingPopRange(); + hypre_GpuProfilingPopRange(); return hypre_error_flag; } @@ -1499,17 +1322,23 @@ hypre_MGRSetup( void *mgr_vdata, } /* TODO: Check use of A_ff_array[lev], vectors at (lev + 1) are correct? 
(VPM) */ - F_fine_array[lev + 1] = - hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_ff_array[lev]), - hypre_ParCSRMatrixGlobalNumRows(A_ff_array[lev]), - hypre_ParCSRMatrixRowStarts(A_ff_array[lev])); - hypre_ParVectorInitialize(F_fine_array[lev + 1]); + if (!F_fine_array[lev + 1]) + { + F_fine_array[lev + 1] = + hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_ff_array[lev]), + hypre_ParCSRMatrixGlobalNumRows(A_ff_array[lev]), + hypre_ParCSRMatrixRowStarts(A_ff_array[lev])); + hypre_ParVectorInitialize(F_fine_array[lev + 1]); + } - U_fine_array[lev + 1] = - hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_ff_array[lev]), - hypre_ParCSRMatrixGlobalNumRows(A_ff_array[lev]), - hypre_ParCSRMatrixRowStarts(A_ff_array[lev])); - hypre_ParVectorInitialize(U_fine_array[lev + 1]); + if (!U_fine_array[lev + 1]) + { + U_fine_array[lev + 1] = + hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_ff_array[lev]), + hypre_ParCSRMatrixGlobalNumRows(A_ff_array[lev]), + hypre_ParCSRMatrixRowStarts(A_ff_array[lev])); + hypre_ParVectorInitialize(U_fine_array[lev + 1]); + } } /* TODO: refactor this block (VPM) */ @@ -1621,28 +1450,19 @@ hypre_MGRSetup( void *mgr_vdata, /* allocate space for solution and rhs arrays */ F_array[lev + 1] = - hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_ptr), - hypre_ParCSRMatrixGlobalNumRows(RAP_ptr), - hypre_ParCSRMatrixRowStarts(RAP_ptr)); + hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_array[lev + 1]), + hypre_ParCSRMatrixGlobalNumRows(A_array[lev + 1]), + hypre_ParCSRMatrixRowStarts(A_array[lev + 1])); hypre_ParVectorInitialize(F_array[lev + 1]); U_array[lev + 1] = - hypre_ParVectorCreate(hypre_ParCSRMatrixComm(RAP_ptr), - hypre_ParCSRMatrixGlobalNumRows(RAP_ptr), - hypre_ParCSRMatrixRowStarts(RAP_ptr)); + hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A_array[lev + 1]), + hypre_ParCSRMatrixGlobalNumRows(A_array[lev + 1]), + hypre_ParCSRMatrixRowStarts(A_array[lev + 1])); hypre_ParVectorInitialize(U_array[lev + 1]); /* free memory before starting next level */ - hypre_ParCSRMatrixDestroy(S); - S = NULL; - - if (!use_air) - { - hypre_ParCSRMatrixDestroy(AT); - AT = NULL; - } - hypre_ParCSRMatrixDestroy(ST); - ST = NULL; + hypre_ParCSRMatrixDestroy(S); S = NULL; /* check if Vcycle smoother setup required */ if ((mgr_data -> max_local_lvls) > 1) @@ -1683,8 +1503,7 @@ hypre_MGRSetup( void *mgr_vdata, } /* set pointer to last level matrix */ - (mgr_data->num_coarse_levels) = num_c_levels; - (mgr_data->RAP) = RAP_ptr; + (mgr_data -> num_coarse_levels) = num_c_levels; /* setup default coarsest grid solver (BoomerAMG) */ if (use_default_cgrid_solver) @@ -1712,7 +1531,7 @@ hypre_MGRSetup( void *mgr_vdata, /* keep reserved coarse indexes to coarsest grid */ if (reserved_coarse_size > 0 && lvl_to_keep_cpoints == 0) { - ilower = hypre_ParCSRMatrixFirstRowIndex(RAP_ptr); + ilower = hypre_ParCSRMatrixFirstRowIndex(A_array[num_c_levels]); for (i = 0; i < reserved_coarse_size; i++) { reserved_coarse_indexes[i] = (HYPRE_BigInt) (reserved_Cpoint_local_indexes[i] + ilower); @@ -1722,13 +1541,14 @@ hypre_MGRSetup( void *mgr_vdata, reserved_coarse_indexes); } - /* setup coarse grid solver */ + /* Setup coarse grid solver */ hypre_sprintf(region_name, "%s-%d", "MGR_Level", num_c_levels); hypre_GpuProfilingPushRange(region_name); HYPRE_ANNOTATE_REGION_BEGIN("%s", region_name); cgrid_solver_setup((mgr_data -> coarse_grid_solver), - RAP_ptr, F_array[num_c_levels], + A_array[num_c_levels], + F_array[num_c_levels], U_array[num_c_levels]); hypre_GpuProfilingPopRange(); @@ -1889,8 +1709,9 @@ hypre_MGRSetup( 
void *mgr_vdata, /* Print MGR and linear system info according to print level */ hypre_MGRDataPrint(mgr_vdata); - HYPRE_ANNOTATE_FUNC_END; + hypre_MemoryPrintUsage(comm, hypre_HandleLogLevel(hypre_handle()), "MGR setup end", 0); hypre_GpuProfilingPopRange(); + HYPRE_ANNOTATE_FUNC_END; return hypre_error_flag; } @@ -1911,6 +1732,7 @@ hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, MPI_Comm comm = hypre_ParCSRMatrixComm(A); hypre_ParMGRData *mgr_data = (hypre_ParMGRData*) mgr_vdata; hypre_ParAMGData **FrelaxVcycleData = mgr_data -> FrelaxVcycleData; + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); HYPRE_Int i, j, num_procs, my_id; @@ -2084,7 +1906,7 @@ hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, while (not_finished) { - local_size = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A_array_local[lev_local])); + local_size = hypre_ParCSRMatrixNumRows(A_array_local[lev_local]); dof_func_data = NULL; if (dof_func_array[lev_local]) { @@ -2093,12 +1915,14 @@ hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, if (lev_local == 0) { - /* use the CF_marker from the outer MGR cycle to create the strength connection matrix */ - hypre_BoomerAMGCreateSFromCFMarker(A_array_local[lev_local], strong_threshold, + /* use the CF_marker from the outer MGR cycle + to create the strength connection matrix */ + hypre_BoomerAMGCreateSFromCFMarker(A_array_local[lev_local], + strong_threshold, max_row_sum, hypre_IntArrayData(CF_marker_array[lev]), - num_functions, dof_func_data, smrk_local, &S_local); - //hypre_ParCSRMatrixPrintIJ(S_local, 0, 0, "S_mat"); + num_functions, dof_func_data, + smrk_local, &S_local); } else if (lev_local > 0) { @@ -2108,7 +1932,7 @@ hypre_MGRSetupFrelaxVcycleData( void *mgr_vdata, } CF_marker_array_local[lev_local] = hypre_IntArrayCreate(local_size); - hypre_IntArrayInitialize(CF_marker_array_local[lev_local]); + hypre_IntArrayInitialize_v2(CF_marker_array_local[lev_local], memory_location); CF_marker_local = hypre_IntArrayData(CF_marker_array_local[lev_local]); hypre_BoomerAMGCoarsenHMIS(S_local, A_array_local[lev_local], measure_type, diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_solve.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_solve.c index 244ff1472..3d75cb343 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr_solve.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_solve.c @@ -78,6 +78,7 @@ hypre_MGRSolve( void *mgr_vdata, return hypre_error_flag; } + A_array[0] = A; U_array[0] = u; F_array[0] = f; @@ -705,7 +706,6 @@ hypre_MGRCycle( void *mgr_vdata, level_diaginv, Vtemp); } } - hypre_ParVectorAllZeros(U_array[fine_grid]) = 0; } else if ((level_smooth_type[fine_grid] > 1) && (level_smooth_type[fine_grid] < 7)) @@ -733,16 +733,17 @@ hypre_MGRCycle( void *mgr_vdata, /* Update solution */ hypre_ParVectorAxpy(fp_one, Utemp, U_array[fine_grid]); - hypre_ParVectorAllZeros(U_array[fine_grid]) = 0; } } - else if (level_smooth_type[fine_grid] == 16) + else if ((mgr_data -> level_smoother)[fine_grid]) { - /* hypre_ILU smoother */ - HYPRE_ILUSolve((mgr_data -> level_smoother)[fine_grid], - A_array[fine_grid], F_array[fine_grid], - U_array[fine_grid]); - hypre_ParVectorAllZeros(U_array[fine_grid]) = 0; + /* Generic smoother object */ + hypre_Solver *base = (hypre_Solver*) (mgr_data -> level_smoother)[fine_grid]; + + hypre_SolverSolve(base)((mgr_data -> level_smoother)[fine_grid], + (HYPRE_Matrix) A_array[fine_grid], + (HYPRE_Vector) F_array[fine_grid], + (HYPRE_Vector) U_array[fine_grid]); } else { @@ -755,6 +756,7 @@ hypre_MGRCycle( void *mgr_vdata, 
U_array[fine_grid], Vtemp, Ztemp); } } + hypre_ParVectorAllZeros(U_array[fine_grid]) = 0; /* Error checking */ if (HYPRE_GetError()) @@ -813,8 +815,8 @@ hypre_MGRCycle( void *mgr_vdata, { hypre_MGRBlockRelaxSolveDevice(B_FF_array[fine_grid], A_ff_array[fine_grid], - F_fine_array[fine_grid], - U_fine_array[fine_grid], + F_fine_array[coarse_grid], + U_fine_array[coarse_grid], Vtemp, fp_one); } else @@ -848,20 +850,25 @@ hypre_MGRCycle( void *mgr_vdata, if (relax_type == 18) { #if defined(HYPRE_USING_GPU) - for (i = 0; i < nsweeps[fine_grid]; i++) + if (exec == HYPRE_EXEC_DEVICE) { - hypre_MGRRelaxL1JacobiDevice(A_array[fine_grid], F_array[fine_grid], - CF_marker_data, relax_points, relax_weight, - l1_norms, U_array[fine_grid], Vtemp); + for (i = 0; i < nsweeps[fine_grid]; i++) + { + hypre_MGRRelaxL1JacobiDevice(A_array[fine_grid], F_array[fine_grid], + CF_marker_data, relax_points, relax_weight, + l1_norms, U_array[fine_grid], Vtemp); + } } -#else - for (i = 0; i < nsweeps[fine_grid]; i++) + else +#endif { - hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid], F_array[fine_grid], - CF_marker_data, relax_points, relax_weight, - l1_norms, U_array[fine_grid], Vtemp); + for (i = 0; i < nsweeps[fine_grid]; i++) + { + hypre_ParCSRRelax_L1_Jacobi(A_array[fine_grid], F_array[fine_grid], + CF_marker_data, relax_points, relax_weight, + l1_norms, U_array[fine_grid], Vtemp); + } } -#endif } else { @@ -885,8 +892,8 @@ hypre_MGRCycle( void *mgr_vdata, // F_array[fine_grid], Vtemp); // convergence_factor_frelax = hypre_ParVectorInnerProd(Vtemp, Vtemp); - HYPRE_Real resnorm, init_resnorm; - HYPRE_Real rhs_norm, old_resnorm; + HYPRE_Real resnorm = 0.0, init_resnorm; + HYPRE_Real rhs_norm = 0.0, old_resnorm; HYPRE_Real rel_resnorm = fp_one; HYPRE_Real conv_factor = fp_one; if (frelax_print_level > 1) @@ -971,13 +978,18 @@ hypre_MGRCycle( void *mgr_vdata, F_array[fine_grid], Vtemp); /* Restrict to F points */ -#if defined (HYPRE_USING_GPU) - hypre_ParCSRMatrixMatvecT(fp_one, P_FF_array[fine_grid], Vtemp, - fp_zero, F_fine_array[coarse_grid]); -#else - hypre_MGRAddVectorR(CF_marker[fine_grid], FMRK, fp_one, Vtemp, - fp_zero, &(F_fine_array[coarse_grid])); +#if defined(HYPRE_USING_GPU) + if (exec == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixMatvecT(fp_one, P_FF_array[fine_grid], Vtemp, + fp_zero, F_fine_array[coarse_grid]); + } + else #endif + { + hypre_MGRAddVectorR(CF_marker[fine_grid], FMRK, fp_one, Vtemp, + fp_zero, &(F_fine_array[coarse_grid])); + } /* Set initial guess to zeros */ hypre_ParVectorSetZeros(U_fine_array[coarse_grid]); @@ -1011,15 +1023,20 @@ hypre_MGRCycle( void *mgr_vdata, } /* Interpolate the solution back to the fine grid level */ -#if defined (HYPRE_USING_GPU) - hypre_ParCSRMatrixMatvec(fp_one, P_FF_array[fine_grid], - U_fine_array[coarse_grid], fp_one, - U_array[fine_grid]); -#else - hypre_MGRAddVectorP(CF_marker[fine_grid], FMRK, fp_one, - U_fine_array[coarse_grid], fp_one, - &(U_array[fine_grid])); +#if defined(HYPRE_USING_GPU) + if (exec == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixMatvec(fp_one, P_FF_array[fine_grid], + U_fine_array[coarse_grid], fp_one, + U_array[fine_grid]); + } + else #endif + { + hypre_MGRAddVectorP(CF_marker[fine_grid], FMRK, fp_one, + U_fine_array[coarse_grid], fp_one, + &(U_array[fine_grid])); + } } else { @@ -1214,6 +1231,16 @@ hypre_MGRCycle( void *mgr_vdata, A_array[fine_grid], F_array[fine_grid], U_array[fine_grid]); } + else if ((mgr_data -> level_smoother)[fine_grid]) + { + /* User smoother */ + hypre_Solver *base = (hypre_Solver*) (mgr_data -> 
level_smoother)[fine_grid]; + + hypre_SolverSolve(base)((mgr_data -> level_smoother)[fine_grid], + (HYPRE_Matrix) A_array[fine_grid], + (HYPRE_Vector) F_array[fine_grid], + (HYPRE_Vector) U_array[fine_grid]); + } else { /* Generic relaxation interface */ diff --git a/3rd_party/hypre/src/parcsr_ls/par_mgr_stats.c b/3rd_party/hypre/src/parcsr_ls/par_mgr_stats.c index b58bc4e7d..3f3314692 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_mgr_stats.c +++ b/3rd_party/hypre/src/parcsr_ls/par_mgr_stats.c @@ -26,9 +26,6 @@ hypre_MGRGetGlobalRelaxName(hypre_ParMGRData *mgr_data, switch (smoother_type) { - case -1: - return "--"; - case 0: return "Blk-Jacobi"; @@ -290,7 +287,7 @@ hypre_MGRGetCoarseGridName(hypre_ParMGRData *mgr_data, return "NG-ApproxInv"; case 5: - return "Glk-RAI"; + return "NG-A_CC"; default: return "Unknown"; @@ -317,6 +314,9 @@ hypre_MGRSetupStats(void *mgr_vdata) HYPRE_Solver coarse_solver = hypre_ParMGRDataCoarseGridSolver(mgr_data); HYPRE_Solver **A_FF_solver = hypre_ParMGRDataAFFsolver(mgr_data); HYPRE_Int *Frelax_type = hypre_ParMGRDataFRelaxType(mgr_data); + HYPRE_Int block_size = hypre_ParMGRDataBlockSize(mgr_data); + HYPRE_Int *block_num_Cpts = hypre_ParMGRDataBlockNumCoarseIndexes(mgr_data); + HYPRE_Int **block_CF_marker = hypre_ParMGRDataBlockCFMarker(mgr_data); /* Finest matrix variables */ MPI_Comm comm = hypre_ParCSRMatrixComm(A_finest); @@ -328,6 +328,7 @@ hypre_MGRSetupStats(void *mgr_vdata) hypre_ParAMGData *coarse_amg_solver = NULL; hypre_ParCSRMatrix **A_array; hypre_ParCSRMatrix **P_array; + hypre_ParCSRMatrix **R_array; hypre_ParCSRMatrix **RT_array; hypre_MatrixStatsArray *stats_array; @@ -335,6 +336,9 @@ hypre_MGRSetupStats(void *mgr_vdata) HYPRE_Real *opcomp; HYPRE_Real *memcomp; + HYPRE_Int max_length; + char *coarse_dofs_str = NULL; + HYPRE_Int coarsest_mgr_level; HYPRE_Int num_levels_total; HYPRE_Int num_levels[2]; @@ -406,6 +410,18 @@ hypre_MGRSetupStats(void *mgr_vdata) if (!myid) { + /* Determine max_length for printing coarse DOFs */ + if (block_num_Cpts) + { + coarse_dofs_str = hypre_ConvertIndicesToString(block_num_Cpts[0], block_CF_marker[0]); + max_length = hypre_min(strlen(coarse_dofs_str) + 2, 50); + hypre_TFree(coarse_dofs_str, HYPRE_MEMORY_HOST); + } + else + { + max_length = 10; + } + hypre_printf("\n\n"); hypre_printf(" Num MPI tasks = %d\n", num_procs); hypre_printf(" Num OpenMP threads = %d\n", num_threads); @@ -416,22 +432,34 @@ hypre_MGRSetupStats(void *mgr_vdata) hypre_printf(" coarse AMG num levels = %d\n", num_sublevels_amg[coarsest_mgr_level]); hypre_printf(" Total num levels = %d\n\n", num_levels_total); - divisors[0] = 84; + divisors[0] = 83 + max_length; //hypre_printf("\nMGR level options:\n\n"); - hypre_printf("%18s %14s %16s\n", "Global", "Fine", "Coarse"); - hypre_printf("%3s %14s %14s %16s %16s %16s\n", "lev", - "relaxation", "relaxation", "grid method", - "Prolongation", "Restriction"); + hypre_printf("%10s %*s %11s %14s %14s\n", + "Total", max_length, "Coarse", "Global", "Fine", "Coarse"); + hypre_printf("%3s %6s %*s %11s %14s %14s %14s %14s\n", "lev", + "DOFs", max_length, "DOFs", "relaxation", "relaxation", + "grid method", "Prolongation", "Restriction"); HYPRE_PRINT_TOP_DIVISOR(1, divisors); for (i = 0; i < num_levels_mgr; i++) { - hypre_printf("%3d %14s %14s %16s %16s %16s\n", + if (block_num_Cpts) + { + coarse_dofs_str = hypre_ConvertIndicesToString(block_num_Cpts[i], + block_CF_marker[i]); + } + + hypre_printf("%3d %6d %*s %11s %14s %14s %14s %14s\n", i, + (i > 0 && block_num_Cpts) ? 
block_num_Cpts[i - 1] : block_size, + max_length, + (block_num_Cpts) ? coarse_dofs_str : "--", hypre_MGRGetGlobalRelaxName(mgr_data, i), hypre_MGRGetFRelaxName(mgr_data, i), hypre_MGRGetCoarseGridName(mgr_data, i), hypre_MGRGetProlongationName(mgr_data, i), hypre_MGRGetRestrictionName(mgr_data, i)); + + hypre_TFree(coarse_dofs_str, HYPRE_MEMORY_HOST); } hypre_printf("\n\n"); } @@ -475,18 +503,33 @@ hypre_MGRSetupStats(void *mgr_vdata) /* Set pointer to level matrices */ P_array = hypre_TAlloc(hypre_ParCSRMatrix *, max_levels, HYPRE_MEMORY_HOST); + R_array = hypre_TAlloc(hypre_ParCSRMatrix *, max_levels, HYPRE_MEMORY_HOST); RT_array = hypre_TAlloc(hypre_ParCSRMatrix *, max_levels, HYPRE_MEMORY_HOST); for (i = 0; i < num_levels_mgr; i++) { P_array[i] = hypre_ParMGRDataP(mgr_data, i); + + if (hypre_ParMGRDataR(mgr_data, i)) + { + R_array[i] = hypre_ParMGRDataR(mgr_data, i); + } + else if (hypre_ParMGRDataRT(mgr_data, i)) + { + hypre_ParCSRMatrixTranspose(hypre_ParMGRDataRT(mgr_data, i), &R_array[i], 1); + } + else + { + R_array[i] = NULL; + } } for (i = 0; i < num_sublevels_amg[coarsest_mgr_level]; i++) { P_array[num_levels_mgr + i] = hypre_ParAMGDataPArray(coarse_amg_solver)[i]; + R_array[num_levels_mgr + i] = NULL; } - /* Compute statistics data structure */ + /* Compute statistics data structure for Prolongation operator */ hypre_ParCSRMatrixStatsArrayCompute(num_levels_total - 1, P_array, stats_array); if (!myid) @@ -502,6 +545,17 @@ hypre_MGRSetupStats(void *mgr_vdata) hypre_MatrixStatsArrayPrint(2, num_levels, 1, 0, msg, stats_array); } + /* Compute statistics data structure for Restriction operator (only for MGR) */ + hypre_ParCSRMatrixStatsArrayCompute(num_levels_mgr, R_array, stats_array); + + if (!myid) + { + const char *msg[] = { "MGR Restriction Matrix Hierarchy Information:\n\n" }; + + num_levels[0] = num_levels_mgr; + hypre_MatrixStatsArrayPrint(1, num_levels, 1, 0, msg, stats_array); + } + /*------------------------------------------------- * Print MGR F-relaxation info *-------------------------------------------------*/ @@ -669,9 +723,18 @@ hypre_MGRSetupStats(void *mgr_vdata) * Free memory *-------------------------------------------------*/ + for (i = 0; i < num_levels_mgr; i++) + { + if (hypre_ParMGRDataRT(mgr_data, i)) + { + hypre_ParCSRMatrixDestroy(R_array[i]); + } + } + hypre_MatrixStatsArrayDestroy(stats_array); hypre_TFree(A_array, HYPRE_MEMORY_HOST); hypre_TFree(P_array, HYPRE_MEMORY_HOST); + hypre_TFree(R_array, HYPRE_MEMORY_HOST); hypre_TFree(RT_array, HYPRE_MEMORY_HOST); hypre_TFree(num_sublevels_amg, HYPRE_MEMORY_HOST); hypre_TFree(gridcomp, HYPRE_MEMORY_HOST); diff --git a/3rd_party/hypre/src/parcsr_ls/par_multi_interp.c b/3rd_party/hypre/src/parcsr_ls/par_multi_interp.c index eba0f98da..2bb118d55 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_multi_interp.c +++ b/3rd_party/hypre/src/parcsr_ls/par_multi_interp.c @@ -79,7 +79,7 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, HYPRE_Int *int_buf_data = NULL; HYPRE_BigInt *big_buf_data = NULL; HYPRE_Int *send_map_start = NULL; - HYPRE_Int *send_map_elmt; + HYPRE_Int *send_map_elmt = NULL; HYPRE_Int *send_procs = NULL; HYPRE_Int num_recvs = 0; HYPRE_Int *recv_vec_start = NULL; @@ -888,6 +888,7 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, * P_marker, are initialized and de-allocated internally to the * parallel region. 
*/ + P_marker_offd = NULL; my_thread_num = hypre_GetThreadNum(); num_threads = hypre_NumActiveThreads(); thread_start = (pass_length / num_threads) * my_thread_num; @@ -1186,6 +1187,8 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, * weights only over each thread's range of rows. Rows are divided * up evenly amongst the threads. */ + alfa = beta = 1.0; + P_marker_offd = C_array_offd = NULL; P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); for (i = 0; i < n_fine; i++) { P_marker[i] = -1; } @@ -1393,6 +1396,8 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, * weights only over each thread's range of rows. Rows are divided * up evenly amongst the threads. */ + alfa = beta = 1.0; + P_marker_offd = NULL; P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); for (i = 0; i < n_fine; i++) { P_marker[i] = -1; } @@ -1623,6 +1628,7 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, * up evenly amongst the threads. */ /* Initialize thread-wise variables */ + alfa = 1.0; tmp_marker = NULL; if (n_fine) { tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); } @@ -1790,6 +1796,7 @@ hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix *A, * up evenly amongst the threads. */ /* Initialize thread-wise variables */ + alfa = beta = 1.0; tmp_marker = NULL; if (n_fine) { tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); } diff --git a/3rd_party/hypre/src/parcsr_ls/par_nongalerkin.c b/3rd_party/hypre/src/parcsr_ls/par_nongalerkin.c index 760e4ef57..d052dc128 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_nongalerkin.c +++ b/3rd_party/hypre/src/parcsr_ls/par_nongalerkin.c @@ -1019,6 +1019,8 @@ hypre_NonGalerkinSparsityPattern(hypre_ParCSRMatrix *R_IAP, HYPRE_Int i, j, Cpt, row_start, row_end; HYPRE_BigInt global_row, global_col; + HYPRE_ANNOTATE_FUNC_BEGIN; + /* Other Setup */ if (num_cols_RAP_offd) { @@ -1233,6 +1235,8 @@ hypre_NonGalerkinSparsityPattern(hypre_ParCSRMatrix *R_IAP, hypre_TFree(ijbuf_sym_numcols, memory_location_RAP); } + HYPRE_ANNOTATE_FUNC_END; + return Pattern_CSR; } @@ -1368,6 +1372,8 @@ hypre_BoomerAMGBuildNonGalerkinCoarseOperator( hypre_ParCSRMatrix **RAP_ptr, HYPRE_BigInt *ijbuf_sym_cols, *ijbuf_sym_rownums; HYPRE_Int *ijbuf_sym_numcols; + HYPRE_ANNOTATE_FUNC_BEGIN; + /* Further Initializations */ if (num_cols_RAP_offd) { RAP_offd_data = hypre_CSRMatrixData(RAP_offd); } @@ -2352,5 +2358,7 @@ hypre_BoomerAMGBuildNonGalerkinCoarseOperator( hypre_ParCSRMatrix **RAP_ptr, HYPRE_IJMatrixSetObjectType(ijmatrix, -1); HYPRE_IJMatrixDestroy(ijmatrix); + HYPRE_ANNOTATE_FUNC_END; + return hypre_error_flag; } diff --git a/3rd_party/hypre/src/parcsr_ls/par_rap.c b/3rd_party/hypre/src/parcsr_ls/par_rap.c index cd5d65297..c432b3694 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_rap.c +++ b/3rd_party/hypre/src/parcsr_ls/par_rap.c @@ -51,7 +51,7 @@ hypre_BoomerAMGBuildCoarseOperatorKT( hypre_ParCSRMatrix *RT, HYPRE_Int num_recvs_RT = 0; HYPRE_Int num_sends_RT = 0; HYPRE_Int *send_map_starts_RT = NULL; - HYPRE_Int *send_map_elmts_RT; + HYPRE_Int *send_map_elmts_RT = NULL; hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); HYPRE_Complex *A_diag_data = hypre_CSRMatrixData(A_diag); @@ -93,7 +93,7 @@ hypre_BoomerAMGBuildCoarseOperatorKT( hypre_ParCSRMatrix *RT, HYPRE_Int *RAP_int_i; HYPRE_BigInt *RAP_int_j; - hypre_CSRMatrix *RAP_ext; + hypre_CSRMatrix *RAP_ext = NULL; HYPRE_Complex *RAP_ext_data = NULL; HYPRE_Int *RAP_ext_i = NULL; HYPRE_BigInt *RAP_ext_j = NULL; @@ -123,10 +123,10 @@ hypre_BoomerAMGBuildCoarseOperatorKT( 
hypre_ParCSRMatrix *RT, HYPRE_Int *R_diag_i; HYPRE_Int *R_diag_j; - hypre_CSRMatrix *R_offd; - HYPRE_Complex *R_offd_data; - HYPRE_Int *R_offd_i; - HYPRE_Int *R_offd_j; + hypre_CSRMatrix *R_offd = NULL; + HYPRE_Complex *R_offd_data = NULL; + HYPRE_Int *R_offd_i = NULL; + HYPRE_Int *R_offd_j = NULL; HYPRE_Real *RA_diag_data_array = NULL; HYPRE_Int *RA_diag_j_array = NULL; @@ -573,6 +573,10 @@ hypre_BoomerAMGBuildCoarseOperatorKT( hypre_ParCSRMatrix *RT, HYPRE_MEMORY_HOST); P_marker = P_mark_array[ii]; } + else + { + P_marker = NULL; + } A_mark_array[ii] = hypre_CTAlloc(HYPRE_Int, num_nz_cols_A, HYPRE_MEMORY_HOST); A_marker = A_mark_array[ii]; /*----------------------------------------------------------------------- @@ -761,6 +765,7 @@ hypre_BoomerAMGBuildCoarseOperatorKT( hypre_ParCSRMatrix *RT, #endif for (ii = 0; ii < num_threads; ii++) { + P_marker = NULL; size = num_cols_offd_RT / num_threads; rest = num_cols_offd_RT - size * num_threads; if (ii < rest) diff --git a/3rd_party/hypre/src/parcsr_ls/par_stats.c b/3rd_party/hypre/src/parcsr_ls/par_stats.c index c31fdd76d..16ef79c62 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_stats.c +++ b/3rd_party/hypre/src/parcsr_ls/par_stats.c @@ -65,6 +65,8 @@ hypre_BoomerAMGSetupStats( void *amg_vdata, HYPRE_Int num_levels; + HYPRE_Int num_functions; + HYPRE_Int filter_functions; HYPRE_Int coarsen_type; HYPRE_Int interp_type; HYPRE_Int restri_type; @@ -160,6 +162,8 @@ hypre_BoomerAMGSetupStats( void *amg_vdata, A_array = hypre_ParAMGDataAArray(amg_data); P_array = hypre_ParAMGDataPArray(amg_data); num_levels = hypre_ParAMGDataNumLevels(amg_data); + num_functions = hypre_ParAMGDataNumFunctions(amg_data); + filter_functions = hypre_ParAMGDataFilterFunctions(amg_data); coarsen_type = hypre_ParAMGDataCoarsenType(amg_data); interp_type = hypre_ParAMGDataInterpType(amg_data); restri_type = hypre_ParAMGDataRestriction(amg_data); /* RL */ @@ -403,6 +407,12 @@ hypre_BoomerAMGSetupStats( void *amg_vdata, restri_type - 3); } + if (num_functions > 1) + { + hypre_printf(" Number of functions = %d\n", num_functions); + hypre_printf(" Functions filtering is %s\n", (filter_functions > 0) ? "on" : "off"); + } + if (block_mode) { hypre_printf( "\nBlock Operator Matrix Information:\n"); @@ -1764,6 +1774,10 @@ hypre_BoomerAMGPrintGeneralInfo(hypre_ParAMGData *amg_data, "Number of functions = %d\n", hypre_ParAMGDataNumFunctions(amg_data)); + HYPRE_PRINT_SHIFTED_PARAM(shift, + "Functions filtering is %s\n", + (hypre_ParAMGDataFilterFunctions(amg_data) > 0) ? 
"on" : "off"); + HYPRE_PRINT_SHIFTED_PARAM(shift, "Coarsening type = %s\n", hypre_BoomerAMGGetCoarseningName(amg_data)); diff --git a/3rd_party/hypre/src/parcsr_ls/par_strength.c b/3rd_party/hypre/src/parcsr_ls/par_strength.c index dc667cdfe..f514737e0 100644 --- a/3rd_party/hypre/src/parcsr_ls/par_strength.c +++ b/3rd_party/hypre/src/parcsr_ls/par_strength.c @@ -584,48 +584,48 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, hypre_profile_times[HYPRE_TIMER_ID_CREATES] -= hypre_MPI_Wtime(); #endif - MPI_Comm comm = hypre_ParCSRMatrixComm(A); - hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); - hypre_ParCSRCommHandle *comm_handle; - hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); - HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); - HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); - - - hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); - HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); - HYPRE_Real *A_offd_data = NULL; - HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); - HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); - - HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A); - HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag); - HYPRE_BigInt global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A); - HYPRE_Int num_nonzeros_diag; - HYPRE_Int num_nonzeros_offd = 0; - HYPRE_Int num_cols_offd = 0; + HYPRE_MemoryLocation memloc = hypre_ParCSRMatrixMemoryLocation(A); + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag); + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Real *A_offd_data = NULL; + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + + HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A); + HYPRE_Int num_variables = hypre_CSRMatrixNumRows(A_diag); + HYPRE_BigInt global_num_vars = hypre_ParCSRMatrixGlobalNumRows(A); + HYPRE_Int num_nonzeros_diag; + HYPRE_Int num_nonzeros_offd = 0; + HYPRE_Int num_cols_offd = 0; - hypre_ParCSRMatrix *S; - hypre_CSRMatrix *S_diag; - HYPRE_Int *S_diag_i; - HYPRE_Int *S_diag_j; + hypre_ParCSRCommHandle *comm_handle; + hypre_ParCSRMatrix *S; + hypre_CSRMatrix *S_diag; + HYPRE_Int *S_diag_i; + HYPRE_Int *S_diag_j; /* HYPRE_Real *S_diag_data; */ - hypre_CSRMatrix *S_offd; - HYPRE_Int *S_offd_i = NULL; - HYPRE_Int *S_offd_j = NULL; + hypre_CSRMatrix *S_offd; + HYPRE_Int *S_offd_i = NULL; + HYPRE_Int *S_offd_j = NULL; /* HYPRE_Real *S_offd_data; */ - HYPRE_Int *dof_func_offd = NULL; + HYPRE_Int *dof_func_offd = NULL; - HYPRE_Real diag, row_scale, row_sum; - HYPRE_Int i, jj, jA, jS; - HYPRE_Int num_sends, start, j, index; - HYPRE_Int *int_buf_data; + HYPRE_Real diag, row_scale, row_sum; + HYPRE_Int i, jj, jA, jS; + HYPRE_Int num_sends, start, j, index; + HYPRE_Int *int_buf_data; - HYPRE_Int ierr = 0; - HYPRE_Int *CF_marker_offd = NULL; + HYPRE_Int ierr = 0; + HYPRE_Int *CF_marker_offd = NULL; - HYPRE_Int *prefix_sum_workspace; - HYPRE_Int my_id; + HYPRE_Int *prefix_sum_workspace; + HYPRE_Int my_id; /*-------------------------------------------------------------- * Compute a ParCSR strength matrix, S. 
@@ -703,8 +703,9 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); if (num_functions > 1) { - int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, - num_sends), HYPRE_MEMORY_HOST); + int_buf_data = hypre_CTAlloc(HYPRE_Int, + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), + HYPRE_MEMORY_HOST); index = 0; for (i = 0; i < num_sends; i++) { @@ -715,7 +716,7 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, } } - comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd); + comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, dof_func_offd); hypre_ParCSRCommHandleDestroy(comm_handle); hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST); @@ -724,15 +725,16 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, /*------------------------------------------------------------------- * Get the CF_marker data for the off-processor columns *-------------------------------------------------------------------*/ - if (num_cols_offd) { CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); } + CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); if (!comm_pkg) { hypre_MatvecCommPkgCreate(A); comm_pkg = hypre_ParCSRMatrixCommPkg(A); } num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); - int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, - num_sends), HYPRE_MEMORY_HOST); + int_buf_data = hypre_CTAlloc(HYPRE_Int, + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), + HYPRE_MEMORY_HOST); index = 0; for (i = 0; i < num_sends; i++) @@ -743,16 +745,17 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)]; } - comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, - CF_marker_offd); + comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, + CF_marker_offd); hypre_ParCSRCommHandleDestroy(comm_handle); hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST); /*HYPRE_Int prefix_sum_workspace[2*(hypre_NumThreads() + 1)];*/ - prefix_sum_workspace = hypre_TAlloc(HYPRE_Int, 2 * (hypre_NumThreads() + 1), HYPRE_MEMORY_HOST); + prefix_sum_workspace = hypre_TAlloc(HYPRE_Int, + 2 * (hypre_NumThreads() + 1), + HYPRE_MEMORY_HOST); /* give S same nonzero structure as A */ - #ifdef HYPRE_USING_OPENMP #pragma omp parallel private(i,diag,row_scale,row_sum,jA,jS) #endif @@ -792,7 +795,8 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, for (jA = A_offd_i[i]; jA < A_offd_i[i + 1]; jA++) { jj = A_offd_j[jA]; - if ((CF_marker_offd[jj] == SMRK) && (dof_func[i] == dof_func_offd[jj])) + if ((CF_marker_offd[jj] == SMRK) && + (dof_func[i] == dof_func_offd[jj])) { row_scale = hypre_max(row_scale, A_offd_data[jA]); row_sum += A_offd_data[jA]; @@ -813,7 +817,8 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, for (jA = A_offd_i[i]; jA < A_offd_i[i + 1]; jA++) { jj = A_offd_j[jA]; - if ((CF_marker_offd[jj] == SMRK) && (dof_func[i] == dof_func_offd[A_offd_j[jA]])) + if ((CF_marker_offd[jj] == SMRK) && + (dof_func[i] == dof_func_offd[A_offd_j[jA]])) { row_scale = hypre_min(row_scale, A_offd_data[jA]); row_sum += A_offd_data[jA]; @@ -870,7 +875,8 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, /* compute row entries of S */ S_temp_diag_j[A_diag_i[i]] = -1; - if ((hypre_abs(row_sum) > hypre_abs(diag)*max_row_sum) && (max_row_sum < 1.0)) + if ((hypre_abs(row_sum) > hypre_abs(diag)*max_row_sum) && + 
(max_row_sum < 1.0)) { /* make all dependencies weak */ for (jA = A_diag_i[i] + 1; jA < A_diag_i[i + 1]; jA++) @@ -1089,8 +1095,8 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, } /* CF_marker != SMRK */ } /* for each variable */ - hypre_prefix_sum_pair(&jS_diag, S_diag_i + num_variables, &jS_offd, S_offd_i + num_variables, - prefix_sum_workspace); + hypre_prefix_sum_pair(&jS_diag, S_diag_i + num_variables, &jS_offd, + S_offd_i + num_variables, prefix_sum_workspace); /*-------------------------------------------------------------- * "Compress" the strength matrix. @@ -1140,7 +1146,9 @@ hypre_BoomerAMGCreateSFromCFMarker(hypre_ParCSRMatrix *A, hypre_ParCSRMatrixCommPkg(S) = NULL; - *S_ptr = S; + hypre_ParCSRMatrixMigrate(S, memloc); + + *S_ptr = S; hypre_TFree(prefix_sum_workspace, HYPRE_MEMORY_HOST); hypre_TFree(S_temp_diag_j, HYPRE_MEMORY_HOST); diff --git a/3rd_party/hypre/src/parcsr_ls/partial.c b/3rd_party/hypre/src/parcsr_ls/partial.c index 3bd452322..ec763ed05 100644 --- a/3rd_party/hypre/src/parcsr_ls/partial.c +++ b/3rd_party/hypre/src/parcsr_ls/partial.c @@ -994,11 +994,10 @@ hypre_BoomerAMGBuildPartialStdInterp(hypre_ParCSRMatrix *A, /* Definitions */ HYPRE_Real zero = 0.0; HYPRE_Real one = 1.0; - HYPRE_Real wall_time; - HYPRE_Real wall_1 = 0; - HYPRE_Real wall_2 = 0; - HYPRE_Real wall_3 = 0; - + HYPRE_Real wall_time = 0.0; + HYPRE_Real wall_1 = 0.0; + HYPRE_Real wall_2 = 0.0; + HYPRE_Real wall_3 = 0.0; hypre_ParCSRCommPkg *extend_comm_pkg = NULL; @@ -2003,8 +2002,7 @@ hypre_BoomerAMGBuildPartialExtInterp(hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker /* Definitions */ HYPRE_Real zero = 0.0; HYPRE_Real one = 1.0; - HYPRE_Real wall_time; - + HYPRE_Real wall_time = 0.0; hypre_ParCSRCommPkg *extend_comm_pkg = NULL; diff --git a/3rd_party/hypre/src/parcsr_ls/protos.h b/3rd_party/hypre/src/parcsr_ls/protos.h index dcfd4b563..354f2b461 100644 --- a/3rd_party/hypre/src/parcsr_ls/protos.h +++ b/3rd_party/hypre/src/parcsr_ls/protos.h @@ -527,6 +527,8 @@ HYPRE_Int HYPRE_BoomerAMGSetFSAIThreshold ( HYPRE_Solver solver, HYPRE_Real thre HYPRE_Int HYPRE_BoomerAMGSetFSAIKapTolerance ( HYPRE_Solver solver, HYPRE_Real kap_tolerance ); HYPRE_Int HYPRE_BoomerAMGSetNumFunctions ( HYPRE_Solver solver, HYPRE_Int num_functions ); HYPRE_Int HYPRE_BoomerAMGGetNumFunctions ( HYPRE_Solver solver, HYPRE_Int *num_functions ); +HYPRE_Int HYPRE_BoomerAMGSetFilterFunctions ( HYPRE_Solver solver, HYPRE_Int filter_functions ); +HYPRE_Int HYPRE_BoomerAMGGetFilterFunctions ( HYPRE_Solver solver, HYPRE_Int *filter_functions ); HYPRE_Int HYPRE_BoomerAMGSetNodal ( HYPRE_Solver solver, HYPRE_Int nodal ); HYPRE_Int HYPRE_BoomerAMGSetNodalLevels ( HYPRE_Solver solver, HYPRE_Int nodal_levels ); HYPRE_Int HYPRE_BoomerAMGSetNodalDiag ( HYPRE_Solver solver, HYPRE_Int nodal ); @@ -1115,6 +1117,8 @@ HYPRE_Int hypre_BoomerAMGSetCoordinates ( void *data, float *coordinates ); HYPRE_Int hypre_BoomerAMGGetGridHierarchy(void *data, HYPRE_Int *cgrid ); HYPRE_Int hypre_BoomerAMGSetNumFunctions ( void *data, HYPRE_Int num_functions ); HYPRE_Int hypre_BoomerAMGGetNumFunctions ( void *data, HYPRE_Int *num_functions ); +HYPRE_Int hypre_BoomerAMGSetFilterFunctions ( void *data, HYPRE_Int filter_functions ); +HYPRE_Int hypre_BoomerAMGGetFilterFunctions ( void *data, HYPRE_Int *filter_functions ); HYPRE_Int hypre_BoomerAMGSetNodal ( void *data, HYPRE_Int nodal ); HYPRE_Int hypre_BoomerAMGSetNodalLevels ( void *data, HYPRE_Int nodal_levels ); HYPRE_Int hypre_BoomerAMGSetNodalDiag ( void *data, HYPRE_Int nodal ); @@ -2153,7 +2157,7 
@@ HYPRE_Int hypre_MGRSetFSolver( void *mgr_vdata, HYPRE_Int (*fine_grid_solver_solve)(void*, void*, void*, void*), HYPRE_Int (*fine_grid_solver_setup)(void*, void*, void*, void*), void *fsolver ); -HYPRE_Int hypre_MGRSetFSolverAtLevel( HYPRE_Int level, void *mgr_vdata, void *fsolver ); +HYPRE_Int hypre_MGRSetFSolverAtLevel( void *mgr_vdata, void *fsolver, HYPRE_Int level ); HYPRE_Int hypre_MGRSetup( void *mgr_vdata, hypre_ParCSRMatrix *A, hypre_ParVector *f, hypre_ParVector *u ); HYPRE_Int hypre_MGRSolve( void *mgr_vdata, hypre_ParCSRMatrix *A, @@ -2195,14 +2199,6 @@ HYPRE_Int hypre_MGRAddVectorR( hypre_IntArray *CF_marker, HYPRE_Int point_type, hypre_ParVector **toVector ); HYPRE_Int hypre_MGRTruncateAcfCPRDevice( hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix **A_CF_new_ptr ); -HYPRE_Int hypre_MGRComputeNonGalerkinCoarseGrid(hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, - hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, hypre_ParCSRMatrix *Wr, - HYPRE_Int bsize, HYPRE_Int ordering, - HYPRE_Int method, HYPRE_Int max_elmts, - hypre_ParCSRMatrix **A_H_ptr); HYPRE_Int hypre_MGRSetAffSolverType( void *systg_vdata, HYPRE_Int *aff_solver_type ); HYPRE_Int hypre_MGRSetCoarseSolverType( void *systg_vdata, HYPRE_Int coarse_solver_type ); HYPRE_Int hypre_MGRSetCoarseSolverIter( void *systg_vdata, HYPRE_Int coarse_solver_iter ); @@ -2216,6 +2212,8 @@ HYPRE_Int hypre_MGRSetLevelFRelaxMethod( void *mgr_vdata, HYPRE_Int *relax_metho HYPRE_Int hypre_MGRSetLevelFRelaxType( void *mgr_vdata, HYPRE_Int *relax_type ); HYPRE_Int hypre_MGRSetLevelFRelaxNumFunctions( void *mgr_vdata, HYPRE_Int *num_functions ); HYPRE_Int hypre_MGRSetCoarseGridMethod( void *mgr_vdata, HYPRE_Int *cg_method ); +HYPRE_Int hypre_MGRSetNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int max_elmts ); +HYPRE_Int hypre_MGRSetLevelNonGalerkinMaxElmts( void *mgr_vdata, HYPRE_Int *max_elmts ); HYPRE_Int hypre_MGRSetRestrictType( void *mgr_vdata, HYPRE_Int restrictType ); HYPRE_Int hypre_MGRSetLevelRestrictType( void *mgr_vdata, HYPRE_Int *restrictType ); HYPRE_Int hypre_MGRSetInterpType( void *mgr_vdata, HYPRE_Int interpType ); @@ -2227,6 +2225,8 @@ HYPRE_Int hypre_MGRSetNumRestrictSweeps( void *mgr_vdata, HYPRE_Int nsweeps ); HYPRE_Int hypre_MGRSetLevelSmoothType( void *mgr_vdata, HYPRE_Int *level_smooth_type ); HYPRE_Int hypre_MGRSetLevelSmoothIters( void *mgr_vdata, HYPRE_Int *level_smooth_iters ); HYPRE_Int hypre_MGRSetGlobalSmoothCycle( void *mgr_vdata, HYPRE_Int global_smooth_cycle ); +HYPRE_Int hypre_MGRSetGlobalSmootherAtLevel( void *mgr_vdata, HYPRE_Solver smoother, + HYPRE_Int level ); HYPRE_Int hypre_MGRSetPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); HYPRE_Int hypre_MGRSetFrelaxPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); HYPRE_Int hypre_MGRSetCoarseGridPrintLevel( void *mgr_vdata, HYPRE_Int print_level ); @@ -2253,12 +2253,12 @@ HYPRE_Int hypre_MGRGetCoarseGridConvergenceFactor( void *mgr_data, HYPRE_Real *c /* par_mgr_interp.c */ HYPRE_Int hypre_MGRBuildInterp( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, - hypre_ParCSRMatrix *A_FC, HYPRE_Int *CF_marker, - hypre_ParCSRMatrix *S, HYPRE_BigInt *num_cpts_global, + hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *S, + hypre_IntArray *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Real trunc_factor, HYPRE_Int max_elmts, - HYPRE_Int block_jacobi_bsize, - hypre_ParCSRMatrix **P_tr, HYPRE_Int method, - HYPRE_Int num_sweeps_post ); + HYPRE_Int block_jacobi_bsize, HYPRE_Int method, + HYPRE_Int num_sweeps_post, hypre_ParCSRMatrix 
**Wp_ptr, + hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildRestrict( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *A_CF, hypre_IntArray *CF_marker, HYPRE_BigInt *num_cpts_global, @@ -2273,32 +2273,39 @@ HYPRE_Int hypre_MGRBuildPFromWpHost( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix * HYPRE_Int *CF_marker, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildBlockJacobiWp( hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, HYPRE_Int blk_size, hypre_ParCSRMatrix **Wp_ptr ); -HYPRE_Int hypre_MGRBuildPBlockJacobi( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, +HYPRE_Int hypre_MGRBuildBlockJacobiP( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *Wp, HYPRE_Int blk_size, HYPRE_Int *CF_marker, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildP( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Int method, HYPRE_Int debug_flag, hypre_ParCSRMatrix **P_ptr ); -HYPRE_Int hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, +HYPRE_Int hypre_MGRBuildPHost( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, HYPRE_Int method, - hypre_ParCSRMatrix **P_ptr ); + hypre_ParCSRMatrix **Wp_ptr, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRBuildInterpApproximateInverse( hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, HYPRE_BigInt *num_cpts_global, hypre_ParCSRMatrix **P_ptr ); HYPRE_Int hypre_MGRTruncateAcfCPR( hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix **A_CF_new_ptr ); -HYPRE_Int hypre_MGRBuildRFromW( HYPRE_Int *C_map, HYPRE_Int *F_map, +HYPRE_Int hypre_MGRBuildRFromW( hypre_IntArray *C_map, hypre_IntArray *F_map, HYPRE_BigInt global_num_rows_R, HYPRE_BigInt global_num_cols_R, HYPRE_BigInt *row_starts_R, HYPRE_BigInt *col_starts_R, hypre_ParCSRMatrix *W, hypre_ParCSRMatrix **R_ptr ); HYPRE_Int hypre_MGRBlockColLumpedRestrict( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_CF, hypre_IntArray *CF_marker, - HYPRE_Int block_dim, hypre_ParCSRMatrix **W_ptr, + HYPRE_Int blk_dim, hypre_ParCSRMatrix **W_ptr, hypre_ParCSRMatrix **R_ptr); HYPRE_Int hypre_MGRColLumpedRestrict(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_CF, hypre_IntArray *CF_marker, hypre_ParCSRMatrix **W_ptr, hypre_ParCSRMatrix **R_ptr); +/* par_mgr_rap.c */ +HYPRE_Int hypre_MGRBuildCoarseOperator(void *mgr_data, hypre_ParCSRMatrix *A_FF, + hypre_ParCSRMatrix *A_FC, hypre_ParCSRMatrix *A_CF, + hypre_ParCSRMatrix **A_CC_ptr, hypre_ParCSRMatrix *Wp, + hypre_ParCSRMatrix *Wr, HYPRE_Int level); + /* par_mgr_coarsen.c */ HYPRE_Int hypre_MGRCoarseParms( MPI_Comm comm, HYPRE_Int num_rows, hypre_IntArray *CF_marker, HYPRE_BigInt *row_starts_cpts, HYPRE_BigInt *row_starts_fpts ); @@ -2323,12 +2330,6 @@ HYPRE_Int hypre_ParCSRMatrixBlockDiagMatrixDevice( hypre_ParCSRMatrix *A, HYPRE_ HYPRE_Int point_type, HYPRE_Int *CF_marker, HYPRE_Int diag_type, hypre_ParCSRMatrix **B_ptr ); -HYPRE_Int hypre_MGRComputeNonGalerkinCGDevice( hypre_ParCSRMatrix *A_FF, hypre_ParCSRMatrix *A_FC, - hypre_ParCSRMatrix *A_CF, hypre_ParCSRMatrix *A_CC, - hypre_ParCSRMatrix *Wp, hypre_ParCSRMatrix *Wr, - HYPRE_Int blk_size, HYPRE_Int method, - HYPRE_Complex threshold, - hypre_ParCSRMatrix **A_H_ptr ); /* par_mgr_stats.c */ HYPRE_Int hypre_MGRSetupStats( void *mgr_vdata ); diff --git a/3rd_party/hypre/src/parcsr_ls/schwarz.c b/3rd_party/hypre/src/parcsr_ls/schwarz.c index cd9a0ea3f..c3645ea7f 100644 --- 
a/3rd_party/hypre/src/parcsr_ls/schwarz.c +++ b/3rd_party/hypre/src/parcsr_ls/schwarz.c @@ -398,7 +398,7 @@ hypre_ParMPSchwarzSolve(hypre_ParCSRMatrix *par_A, HYPRE_Real *rhs_ext = NULL; HYPRE_Real *vtemp_data; HYPRE_Real *aux; - HYPRE_Real *buf_data; + HYPRE_Real *buf_data = NULL; /*hypre_Vector *x_vector;*/ MPI_Comm comm = hypre_ParCSRMatrixComm(par_A); HYPRE_Int num_domains = hypre_CSRMatrixNumRows(domain_structure); @@ -742,7 +742,7 @@ hypre_ParMPSchwarzSolve(hypre_ParCSRMatrix *par_A, if (comm_pkg) { - comm_handle = hypre_ParCSRCommHandleCreate (2, comm_pkg, x_ext, buf_data); + comm_handle = hypre_ParCSRCommHandleCreate(2, comm_pkg, x_ext, buf_data); hypre_ParCSRCommHandleDestroy(comm_handle); comm_handle = NULL; @@ -3064,7 +3064,7 @@ hypre_ParAdSchwarzSolve(hypre_ParCSRMatrix *A, HYPRE_Int num_variables; HYPRE_Int num_cols_offd; HYPRE_Real *scale_ext = NULL; - HYPRE_Real *buf_data; + HYPRE_Real *buf_data = NULL; HYPRE_Int index; HYPRE_Int piv_counter = 0; @@ -3195,7 +3195,7 @@ hypre_ParAdSchwarzSolve(hypre_ParCSRMatrix *A, if (comm_pkg) { - comm_handle = hypre_ParCSRCommHandleCreate (2, comm_pkg, x_ext_data, buf_data); + comm_handle = hypre_ParCSRCommHandleCreate(2, comm_pkg, x_ext_data, buf_data); hypre_ParCSRCommHandleDestroy(comm_handle); comm_handle = NULL; @@ -3825,8 +3825,8 @@ hypre_ParGenerateScale(hypre_ParCSRMatrix *A, hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A); HYPRE_Int num_sends = 0; - HYPRE_Int *send_map_starts; - HYPRE_Int *send_map_elmts; + HYPRE_Int *send_map_starts = NULL; + HYPRE_Int *send_map_elmts = NULL; HYPRE_Int num_variables = hypre_ParCSRMatrixNumRows(A); HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A)); @@ -3862,7 +3862,7 @@ hypre_ParGenerateScale(hypre_ParCSRMatrix *A, if (comm_pkg) { - scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends], HYPRE_MEMORY_HOST); + scale_int = hypre_CTAlloc(HYPRE_Real, send_map_starts[num_sends], HYPRE_MEMORY_HOST); comm_handle = hypre_ParCSRCommHandleCreate (2, comm_pkg, scale_ext, scale_int); hypre_ParCSRCommHandleDestroy(comm_handle); diff --git a/3rd_party/hypre/src/parcsr_mv/CMakeLists.txt b/3rd_party/hypre/src/parcsr_mv/CMakeLists.txt index 00ac6a5bf..d4ba5e433 100644 --- a/3rd_party/hypre/src/parcsr_mv/CMakeLists.txt +++ b/3rd_party/hypre/src/parcsr_mv/CMakeLists.txt @@ -24,9 +24,13 @@ set(SRCS par_csr_bool_matop.c par_csr_bool_matrix.c par_csr_communication.c + par_csr_filter.c + par_csr_filter_device.c par_csr_matop.c par_csr_matrix.c par_csr_matrix_stats.c + par_csr_matmat.c + par_csr_matmat_device.c par_csr_matop_marked.c par_csr_matvec.c par_csr_matvec_device.c @@ -47,8 +51,10 @@ target_sources(${PROJECT_NAME} if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) set(GPU_SRCS + par_csr_matmat_device.c par_csr_matvec_device.c par_csr_fffc_device.c + par_csr_filter_device.c par_csr_matop_device.c par_csr_triplemat_device.c par_vector_device.c diff --git a/3rd_party/hypre/src/parcsr_mv/Makefile b/3rd_party/hypre/src/parcsr_mv/Makefile index 9643afc64..20d218519 100644 --- a/3rd_party/hypre/src/parcsr_mv/Makefile +++ b/3rd_party/hypre/src/parcsr_mv/Makefile @@ -44,9 +44,11 @@ FILES =\ par_csr_bool_matop.c\ par_csr_bool_matrix.c\ par_csr_communication.c\ + par_csr_filter.c\ par_csr_matop.c\ par_csr_matrix.c\ par_csr_matrix_stats.c\ + par_csr_matmat.c\ par_csr_matvec.c\ par_csr_matop_marked.c\ par_csr_triplemat.c\ @@ -56,7 +58,9 @@ FILES =\ CUFILES =\ par_csr_fffc_device.c\ + par_csr_filter_device.c\ par_csr_matop_device.c\ + par_csr_matmat_device.c\ par_csr_matvec_device.c\ 
par_csr_triplemat_device.c\ par_vector_device.c diff --git a/3rd_party/hypre/src/parcsr_mv/_hypre_parcsr_mv.h b/3rd_party/hypre/src/parcsr_mv/_hypre_parcsr_mv.h index 10e2ce4e1..0cacdd5a9 100644 --- a/3rd_party/hypre/src/parcsr_mv/_hypre_parcsr_mv.h +++ b/3rd_party/hypre/src/parcsr_mv/_hypre_parcsr_mv.h @@ -934,6 +934,10 @@ HYPRE_Int hypre_ParCSRFindExtendCommPkg(MPI_Comm comm, HYPRE_BigInt global_num_c hypre_IJAssumedPart *apart, HYPRE_Int indices_len, HYPRE_BigInt *indices, hypre_ParCSRCommPkg **extend_comm_pkg); +/* par_csr_filter.c */ +HYPRE_Int hypre_ParCSRMatrixBlkFilter(hypre_ParCSRMatrix *A, + HYPRE_Int block_size, hypre_ParCSRMatrix **B_ptr); + /* par_csr_matop.c */ HYPRE_Int hypre_ParCSRMatrixScale(hypre_ParCSRMatrix *A, HYPRE_Complex scalar); void hypre_ParMatmul_RowSizes ( HYPRE_MemoryLocation memory_location, HYPRE_Int **C_diag_i, @@ -1074,6 +1078,10 @@ HYPRE_Int hypre_ParCSRMatrixBlockColSum( hypre_ParCSRMatrix *A, HYPRE_Int row_ma hypre_DenseBlockMatrix **B_ptr ); HYPRE_Int hypre_ParCSRMatrixColSum( hypre_ParCSRMatrix *A, hypre_ParVector **B_ptr ); +/* par_csr_filter_device.c */ +HYPRE_Int hypre_ParCSRMatrixBlkFilterDevice(hypre_ParCSRMatrix *A, HYPRE_Int block_size, + hypre_ParCSRMatrix **B_ptr); + /* par_csr_matop_device.c */ HYPRE_Int hypre_ParCSRMatrixDiagScaleDevice ( hypre_ParCSRMatrix *par_A, hypre_ParVector *par_ld, hypre_ParVector *par_rd ); @@ -1182,6 +1190,14 @@ HYPRE_Int hypre_ParCSRMatrixStatsArrayCompute( HYPRE_Int num_matrices, hypre_ParCSRMatrix **matrices, hypre_MatrixStatsArray *stats_array ); +/* par_csr_matmat_device.c */ +HYPRE_Int hypre_ParCSRMatMatDiagDevice( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *BT, + hypre_ParCSRMatrix *C ); + +/* par_csr_matmat.c */ +HYPRE_Int hypre_ParCSRMatMatDiag( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *B, + hypre_ParCSRMatrix **C_ptr ); + /* par_csr_matvec.c */ // y = alpha*A*x + beta*b HYPRE_Int hypre_ParCSRMatrixMatvecOutOfPlace ( HYPRE_Complex alpha, hypre_ParCSRMatrix *A, diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_filter.c b/3rd_party/hypre/src/parcsr_mv/par_csr_filter.c new file mode 100644 index 000000000..ae433d639 --- /dev/null +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_filter.c @@ -0,0 +1,218 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Member functions for hypre_ParCSRMatrix class. 
+ * + *****************************************************************************/ + +#include "_hypre_parcsr_mv.h" + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatrixBlkFilterHost + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatrixBlkFilterHost( hypre_ParCSRMatrix *A, + HYPRE_Int block_size, + hypre_ParCSRMatrix **B_ptr ) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A); + HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A); + HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A); + HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A); + HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A); + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); + + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Complex *A_diag_a = hypre_CSRMatrixData(A_diag); + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + HYPRE_Complex *A_offd_a = hypre_CSRMatrixData(A_offd); + HYPRE_Int num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd); + + /* Output matrix variables */ + hypre_ParCSRMatrix *B; + hypre_CSRMatrix *B_diag, *B_offd; + HYPRE_Int *B_diag_i, *B_offd_i; + HYPRE_Int *B_diag_j, *B_offd_j; + HYPRE_Complex *B_diag_a, *B_offd_a; + HYPRE_BigInt *col_map_offd_B; + HYPRE_Int num_cols_offd_B; + HYPRE_Int B_diag_nnz, B_offd_nnz; + + /* Local variables */ + HYPRE_BigInt big_block_size = (HYPRE_BigInt) block_size; + HYPRE_Int i, j, c; + HYPRE_Int *marker; + + /*----------------------------------------------------------------------- + * Sanity checks + *-----------------------------------------------------------------------*/ + + if (block_size < 1) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "block size must be greater than one!\n"); + return hypre_error_flag; + } + + if (global_num_rows % big_block_size) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "block size is not a divisor of the number of rows!\n"); + return hypre_error_flag; + } + + if (row_starts[0] % big_block_size) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "block size is not a divisor of the first global row!\n"); + return hypre_error_flag; + } + + if (global_num_rows != global_num_cols) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Function not implemented for rectangular matrices!\n"); + return hypre_error_flag; + } + + /*----------------------------------------------------------------------- + * First pass: compute nonzero counts of B + *-----------------------------------------------------------------------*/ + + B_diag_nnz = B_offd_nnz = 0; + for (i = 0; i < num_rows; i++) + { + c = i % block_size; + + for (j = A_diag_i[i]; j < A_diag_i[i + 1]; j++) + { + if (c == (A_diag_j[j] % block_size)) + { + B_diag_nnz++; + } + } + + for (j = A_offd_i[i]; j < A_offd_i[i + 1]; j++) + { + if (c == (HYPRE_Int) (col_map_offd_A[A_offd_j[j]] % big_block_size)) + { + B_offd_nnz++; + } + } + } + + /*----------------------------------------------------------------------- + * Create and initialize output matrix + *-----------------------------------------------------------------------*/ + + B = hypre_ParCSRMatrixCreate(comm, global_num_rows, global_num_cols, + 
row_starts, col_starts, num_cols_offd_A, + B_diag_nnz, B_offd_nnz); + + hypre_ParCSRMatrixInitialize_v2(B, memory_location); + + B_diag = hypre_ParCSRMatrixDiag(B); + B_diag_i = hypre_CSRMatrixI(B_diag); + B_diag_j = hypre_CSRMatrixJ(B_diag); + B_diag_a = hypre_CSRMatrixData(B_diag); + + B_offd = hypre_ParCSRMatrixOffd(B); + B_offd_i = hypre_CSRMatrixI(B_offd); + B_offd_j = hypre_CSRMatrixJ(B_offd); + B_offd_a = hypre_CSRMatrixData(B_offd); + + col_map_offd_B = hypre_ParCSRMatrixColMapOffd(B); + + /*----------------------------------------------------------------------- + * Second pass: Fill entries of B + *-----------------------------------------------------------------------*/ + + marker = hypre_CTAlloc(HYPRE_Int, num_cols_offd_A, HYPRE_MEMORY_HOST); + + for (i = 0; i < num_rows; i++) + { + c = i % block_size; + + B_diag_i[i + 1] = B_diag_i[i]; + for (j = A_diag_i[i]; j < A_diag_i[i + 1]; j++) + { + if (c == (A_diag_j[j] % block_size)) + { + B_diag_j[B_diag_i[i + 1]] = A_diag_j[j]; + B_diag_a[B_diag_i[i + 1]] = A_diag_a[j]; + B_diag_i[i + 1]++; + } + } + + B_offd_i[i + 1] = B_offd_i[i]; + for (j = A_offd_i[i]; j < A_offd_i[i + 1]; j++) + { + if (c == (HYPRE_Int) (col_map_offd_A[A_offd_j[j]] % big_block_size)) + { + B_offd_j[B_offd_i[i + 1]] = A_offd_j[j]; + B_offd_a[B_offd_i[i + 1]] = A_offd_a[j]; + B_offd_i[i + 1]++; + marker[A_offd_j[j]] = 1; + } + } + } + + /* Update col_map array */ + num_cols_offd_B = 0; + for (i = 0; i < num_cols_offd_A; i++) + { + if (marker[i]) + { + col_map_offd_B[num_cols_offd_B++] = col_map_offd_A[i]; + } + } + hypre_CSRMatrixNumCols(B_offd) = num_cols_offd_B; + hypre_TFree(marker, HYPRE_MEMORY_HOST); + + /* Update global nonzeros */ + hypre_ParCSRMatrixSetDNumNonzeros(B); + hypre_ParCSRMatrixNumNonzeros(B) = (HYPRE_BigInt) hypre_ParCSRMatrixDNumNonzeros(B); + hypre_MatvecCommPkgCreate(B); + + /* Set output pointer */ + *B_ptr = B; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatrixBlkFilter + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatrixBlkFilter( hypre_ParCSRMatrix *A, + HYPRE_Int block_size, + hypre_ParCSRMatrix **B_ptr ) +{ + HYPRE_ANNOTATE_FUNC_BEGIN; + +#if defined(HYPRE_USING_GPU) + if (hypre_GetExecPolicy1(hypre_ParCSRMatrixMemoryLocation(A)) == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatrixBlkFilterDevice(A, block_size, B_ptr); + } + else +#endif + { + hypre_ParCSRMatrixBlkFilterHost(A, block_size, B_ptr); + } + + HYPRE_ANNOTATE_FUNC_END; + + return hypre_error_flag; +} diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_filter_device.c b/3rd_party/hypre/src/parcsr_mv/par_csr_filter_device.c new file mode 100644 index 000000000..f1571f758 --- /dev/null +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_filter_device.c @@ -0,0 +1,356 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
+ * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "_hypre_parcsr_mv.h" +#include "_hypre_utilities.hpp" + +#if defined(HYPRE_USING_GPU) + +/*-------------------------------------------------------------------------- + * hypreGPUKernel_ParCSRMatrixBlkFilterCount + *--------------------------------------------------------------------------*/ + +__global__ void +hypreGPUKernel_ParCSRMatrixBlkFilterCount(hypre_DeviceItem &item, + HYPRE_Int num_rows, + HYPRE_Int block_size, + HYPRE_Int *A_diag_i, + HYPRE_Int *A_diag_j, + HYPRE_Int *A_offd_i, + HYPRE_Int *A_offd_j, + HYPRE_BigInt *A_col_map_offd, + HYPRE_Int *B_diag_i, + HYPRE_Int *B_offd_i) +{ + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1, 1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_BigInt big_block_size = (HYPRE_BigInt) block_size; + + if (row < num_rows) + { + HYPRE_Int p = 0, q = 0, pA, qA; + + if (lane < 2) + { + p = read_only_load(A_diag_i + row + lane); + q = read_only_load(A_offd_i + row + lane); + } + pA = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 1); + p = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 0); + qA = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, q, 1); + q = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, q, 0); + + HYPRE_Int diag_count = 0; + HYPRE_Int offd_count = 0; + + for (HYPRE_Int j = p + lane; + warp_any_sync(item, HYPRE_WARP_FULL_MASK, j < pA); + j += HYPRE_WARP_SIZE) + { + if (j < pA) + { + const HYPRE_Int col = read_only_load(A_diag_j + j); + if ((col % block_size) == (row % block_size)) + { + diag_count++; + } + } + } + + for (HYPRE_Int j = q + lane; + warp_any_sync(item, HYPRE_WARP_FULL_MASK, j < qA); + j += HYPRE_WARP_SIZE) + { + if (j < qA) + { + const HYPRE_Int col = read_only_load(A_offd_j + j); + const HYPRE_BigInt global_col = read_only_load(A_col_map_offd + col); + if ((HYPRE_Int) (global_col % big_block_size) == (row % block_size)) + { + offd_count++; + } + } + } + + diag_count = warp_reduce_sum(item, diag_count); + offd_count = warp_reduce_sum(item, offd_count); + + if (lane == 0) + { + B_diag_i[row] = diag_count; + B_offd_i[row] = offd_count; + } + } +} + +/*-------------------------------------------------------------------------- + * hypreGPUKernel_ParCSRMatrixBlkFilterFill + *--------------------------------------------------------------------------*/ + +__global__ void +hypreGPUKernel_ParCSRMatrixBlkFilterFill(hypre_DeviceItem &item, + HYPRE_Int num_rows, + HYPRE_Int block_size, + HYPRE_Int A_num_cols_offd, + HYPRE_Int *A_diag_i, + HYPRE_Int *A_diag_j, + HYPRE_Complex *A_diag_a, + HYPRE_Int *A_offd_i, + HYPRE_Int *A_offd_j, + HYPRE_Complex *A_offd_a, + HYPRE_BigInt *A_col_map_offd, + HYPRE_Int *B_diag_i, + HYPRE_Int *B_diag_j, + HYPRE_Complex *B_diag_a, + HYPRE_Int *B_offd_i, + HYPRE_Int *B_offd_j, + HYPRE_Complex *B_offd_a, + HYPRE_Int *col_map_marker) +{ + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1, 1>(item); + const HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0, pA, qA; + HYPRE_BigInt big_block_size = (HYPRE_BigInt) block_size; + + if (row >= num_rows) + { + return; + } + + if (lane < 2) + { + p = read_only_load(A_diag_i + row + lane); + q = read_only_load(A_offd_i + row + lane); + } + pA = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 1); + p = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, p, 0); + qA = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, q, 1); + q = warp_shuffle_sync(item, HYPRE_WARP_FULL_MASK, q, 0); + + HYPRE_Int 
diag_offset = B_diag_i[row]; + for (HYPRE_Int j = p + lane; + warp_any_sync(item, HYPRE_WARP_FULL_MASK, j < pA); + j += HYPRE_WARP_SIZE) + { + const HYPRE_Int col = (j < pA) ? read_only_load(A_diag_j + j) : 0; + HYPRE_Int write = (j < pA && (col % block_size) == (row % block_size)); + hypre_mask ballot = hypre_ballot_sync(HYPRE_WARP_FULL_MASK, write); + HYPRE_Int laneoff = hypre_popc(ballot & ((hypre_mask_one << lane) - 1)); + + if (write) + { + HYPRE_Int idx = diag_offset + laneoff; + B_diag_j[idx] = col; + B_diag_a[idx] = A_diag_a[j]; + } + + diag_offset += hypre_popc(ballot); + } + + if (col_map_marker) + { + HYPRE_Int offd_offset = B_offd_i[row]; + for (HYPRE_Int j = q + lane; + warp_any_sync(item, HYPRE_WARP_FULL_MASK, j < qA); + j += HYPRE_WARP_SIZE) + { + const HYPRE_Int col = (j < qA) ? read_only_load(A_offd_j + j) : 0; + const HYPRE_BigInt global_col = (j < qA) ? read_only_load(A_col_map_offd + col) : 0; + HYPRE_Int write = (j < qA) && + (HYPRE_Int) (global_col % big_block_size) == (row % block_size); + hypre_mask ballot = hypre_ballot_sync(HYPRE_WARP_FULL_MASK, write); + HYPRE_Int laneoff = hypre_popc(ballot & ((hypre_mask_one << lane) - 1)); + + if (write) + { + HYPRE_Int idx = offd_offset + laneoff; + B_offd_j[idx] = col; + B_offd_a[idx] = A_offd_a[j]; + +#ifndef HYPRE_USING_SYCL + if (col < A_num_cols_offd) + { + atomicOr(col_map_marker + col, 1); + } +#endif + } + + offd_offset += hypre_popc(ballot); + } + } +} + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatrixBlkFilterDevice + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatrixBlkFilterDevice(hypre_ParCSRMatrix *A, + HYPRE_Int block_size, + hypre_ParCSRMatrix **B_ptr) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A); + HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(A); + HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A); + HYPRE_BigInt *col_starts = hypre_ParCSRMatrixColStarts(A); + HYPRE_BigInt *A_col_map_offd = hypre_ParCSRMatrixDeviceColMapOffd(A); + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); + + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A_diag); + HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag); + HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag); + HYPRE_Complex *A_diag_a = hypre_CSRMatrixData(A_diag); + + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd); + HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd); + HYPRE_Complex *A_offd_a = hypre_CSRMatrixData(A_offd); + HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd); + + hypre_ParCSRMatrix *B; + hypre_CSRMatrix *B_diag; + hypre_CSRMatrix *B_offd; + HYPRE_Int *B_diag_i; + HYPRE_Int *B_diag_j; + HYPRE_Complex *B_diag_a; + HYPRE_Int *B_offd_i; + HYPRE_Int *B_offd_j; + HYPRE_Complex *B_offd_a; + + HYPRE_Int B_diag_nnz, B_offd_nnz; + HYPRE_BigInt *B_col_map_offd; + HYPRE_Int *col_map_marker; + HYPRE_BigInt *col_map_end; + + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, + "w", bDim); + + hypre_GpuProfilingPushRange("ParCSRMatrixBlkFilter"); + + /* Create A's device column map */ + if (!hypre_ParCSRMatrixDeviceColMapOffd(A) && + hypre_ParCSRMatrixColMapOffd(A)) + { + hypre_ParCSRMatrixCopyColMapOffdToDevice(A); + A_col_map_offd = 
hypre_ParCSRMatrixDeviceColMapOffd(A); + } + + /* Create and initialize output matrix B */ + B = hypre_ParCSRMatrixCreate(comm, global_num_rows, global_num_cols, + row_starts, col_starts, num_cols_offd, + 0, 0); + hypre_ParCSRMatrixInitialize_v2(B, memory_location); + + B_diag = hypre_ParCSRMatrixDiag(B); + B_offd = hypre_ParCSRMatrixOffd(B); + + B_diag_i = hypre_CSRMatrixI(B_diag); + B_offd_i = hypre_CSRMatrixI(B_offd); + + /* First pass: count nonzeros */ + HYPRE_GPU_LAUNCH( hypreGPUKernel_ParCSRMatrixBlkFilterCount, gDim, bDim, + num_rows, block_size, + A_diag_i, A_diag_j, + A_offd_i, A_offd_j, + A_col_map_offd, + B_diag_i, B_offd_i ); + + /* Compute row pointers and get total number of nonzeros */ + hypreDevice_IntegerExclusiveScan(num_rows + 1, B_diag_i); + hypreDevice_IntegerExclusiveScan(num_rows + 1, B_offd_i); + hypre_TMemcpy(&B_diag_nnz, B_diag_i + num_rows, HYPRE_Int, 1, + HYPRE_MEMORY_HOST, memory_location); + hypre_TMemcpy(&B_offd_nnz, B_offd_i + num_rows, HYPRE_Int, 1, + HYPRE_MEMORY_HOST, memory_location); + + /* Allocate memory for B */ + B_diag_j = hypre_TAlloc(HYPRE_Int, B_diag_nnz, memory_location); + B_offd_j = hypre_TAlloc(HYPRE_Int, B_offd_nnz, memory_location); + B_diag_a = hypre_TAlloc(HYPRE_Complex, B_diag_nnz, memory_location); + B_offd_a = hypre_TAlloc(HYPRE_Complex, B_offd_nnz, memory_location); + + /* Create a marker for used columns */ + if (num_cols_offd > 0) + { + col_map_marker = hypre_CTAlloc(HYPRE_Int, num_cols_offd, memory_location); + } + else + { + col_map_marker = NULL; + } + + /* Second pass: fill B */ + HYPRE_GPU_LAUNCH( hypreGPUKernel_ParCSRMatrixBlkFilterFill, gDim, bDim, + num_rows, block_size, num_cols_offd, + A_diag_i, A_diag_j, A_diag_a, + A_offd_i, A_offd_j, A_offd_a, + A_col_map_offd, + B_diag_i, B_diag_j, B_diag_a, + B_offd_i, B_offd_j, B_offd_a, + col_map_marker ); + + /* Update CSR matrix structures */ + hypre_CSRMatrixJ(B_diag) = B_diag_j; + hypre_CSRMatrixData(B_diag) = B_diag_a; + hypre_CSRMatrixNumNonzeros(B_diag) = B_diag_nnz; + hypre_CSRMatrixJ(B_offd) = B_offd_j; + hypre_CSRMatrixData(B_offd) = B_offd_a; + hypre_CSRMatrixNumNonzeros(B_offd) = B_offd_nnz; + + /* Set up B's col_map_offd */ + if (B_offd_nnz > 0) + { + /* Create B's device column map */ + hypre_ParCSRMatrixDeviceColMapOffd(B) = hypre_CTAlloc(HYPRE_BigInt, + num_cols_offd, + HYPRE_MEMORY_DEVICE); + B_col_map_offd = hypre_ParCSRMatrixDeviceColMapOffd(B); + +#ifndef HYPRE_USING_SYCL + /* Copy used columns to B's col_map_offd */ + col_map_end = HYPRE_THRUST_CALL(copy_if, + A_col_map_offd, + A_col_map_offd + num_cols_offd, + col_map_marker, + B_col_map_offd, + thrust::identity()); + + hypre_CSRMatrixNumCols(B_offd) = (HYPRE_Int) (col_map_end - B_col_map_offd); + + /* Copy B's column map to host */ + hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(B), + hypre_ParCSRMatrixDeviceColMapOffd(B), + HYPRE_BigInt, + hypre_CSRMatrixNumCols(B_offd), + HYPRE_MEMORY_HOST, + HYPRE_MEMORY_DEVICE); +#else + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "SYCL path not implemented!"); + hypre_GpuProfilingPopRange(); + return hypre_error_flag; +#endif + } + + /* Update global nonzeros */ + hypre_ParCSRMatrixSetDNumNonzeros(B); + hypre_ParCSRMatrixNumNonzeros(B) = (HYPRE_BigInt) hypre_ParCSRMatrixDNumNonzeros(B); + + /* TODO (VPM): compute B's commpkg directly from A's commpkg */ + hypre_MatvecCommPkgCreate(B); + + /* Set output pointer */ + *B_ptr = B; + + hypre_TFree(col_map_marker, memory_location); + hypre_GpuProfilingPopRange(); + return hypre_error_flag; +} + +#endif /* if defined(HYPRE_USING_GPU) 
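On the device, the row pointers of B come from writing one per-row count and then running an exclusive scan over the count array, so that B_diag_i[i] becomes the write offset of row i and B_diag_i[num_rows] the total nonzero count read back to the host. A serial sketch of that count-to-row-pointer step (plain C; the function name and data are illustrative, not hypre API):

#include <stdio.h>

/* Turn per-row nonzero counts into CSR row pointers (exclusive prefix sum).
 * v has n + 1 slots; v[0..n-1] hold the per-row counts on entry. */
static void exclusive_scan(int n, int *v)
{
   int i, sum = 0;
   for (i = 0; i < n; i++)
   {
      int c = v[i];
      v[i] = sum;      /* offset where row i starts writing */
      sum += c;
   }
   v[n] = sum;         /* total number of nonzeros */
}

int main(void)
{
   int counts[5] = {2, 0, 3, 1, 0};  /* counts for 4 rows + one spare slot */
   exclusive_scan(4, counts);
   for (int i = 0; i <= 4; i++) { printf("row_ptr[%d] = %d\n", i, counts[i]); }
   /* prints 0 2 2 5 6 */
   return 0;
}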
*/ diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_matmat.c b/3rd_party/hypre/src/parcsr_mv/par_csr_matmat.c new file mode 100644 index 000000000..4fa407cfd --- /dev/null +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_matmat.c @@ -0,0 +1,159 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "_hypre_parcsr_mv.h" + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatMatDiagHost + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatMatDiagHost(hypre_ParCSRMatrix *A, + hypre_ParCSRMatrix *BT, + hypre_ParCSRMatrix *C) +{ + HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + + hypre_CSRMatrix *BT_diag, *BT_offd, *C_diag; + HYPRE_Int *A_diag_i, *BT_diag_i, *C_diag_i; + HYPRE_Int *A_offd_i, *BT_offd_i; + HYPRE_Int *A_diag_j, *BT_diag_j, *C_diag_j; + HYPRE_Int *A_offd_j, *BT_offd_j; + HYPRE_Complex *A_diag_a, *BT_diag_a, *C_diag_a; + HYPRE_Complex *A_offd_a, *BT_offd_a; + HYPRE_BigInt *A_col_map_offd; + HYPRE_BigInt *BT_col_map_offd; + + HYPRE_Int i, kA, kB; + HYPRE_Complex diag; + + /* Load pointers */ + BT_diag = hypre_ParCSRMatrixDiag(BT); + BT_offd = hypre_ParCSRMatrixOffd(BT); + BT_diag_i = hypre_CSRMatrixI(BT_diag); + BT_offd_i = hypre_CSRMatrixI(BT_offd); + BT_diag_j = hypre_CSRMatrixJ(BT_diag); + BT_offd_j = hypre_CSRMatrixJ(BT_offd); + BT_diag_a = hypre_CSRMatrixData(BT_diag); + BT_offd_a = hypre_CSRMatrixData(BT_offd); + A_diag_i = hypre_CSRMatrixI(A_diag); + A_offd_i = hypre_CSRMatrixI(A_offd); + A_diag_j = hypre_CSRMatrixJ(A_diag); + A_offd_j = hypre_CSRMatrixJ(A_offd); + A_diag_a = hypre_CSRMatrixData(A_diag); + A_offd_a = hypre_CSRMatrixData(A_offd); + C_diag = hypre_ParCSRMatrixDiag(C); + C_diag_i = hypre_CSRMatrixI(C_diag); + C_diag_j = hypre_CSRMatrixJ(C_diag); + C_diag_a = hypre_CSRMatrixData(C_diag); + + BT_col_map_offd = hypre_ParCSRMatrixColMapOffd(BT); + A_col_map_offd = hypre_ParCSRMatrixColMapOffd(A); + + /* Compute C = diag(A .* BT) */ +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for private(i, kA, kB, diag) +#endif + for (i = 0; i < num_rows; i++) + { + /* Compute diagonal matrix contributions */ + diag = 0.0; + for (kA = A_diag_i[i], kB = BT_diag_i[i]; + kA < A_diag_i[i + 1] && kB < BT_diag_i[i + 1];) + { + if (A_diag_j[kA] < BT_diag_j[kB]) + { + kA++; + } + else if (A_diag_j[kA] > BT_diag_j[kB]) + { + kB++; + } + else + { + diag += A_diag_a[kA] * BT_diag_a[kB]; + kA++; kB++; + } + } + + /* Compute off-diagonal matrix contributions */ + for (kA = A_offd_i[i], kB = BT_offd_i[i]; + kA < A_offd_i[i + 1] && kB < BT_offd_i[i + 1];) + { + if (A_col_map_offd[A_offd_j[kA]] < BT_col_map_offd[BT_offd_j[kB]]) + { + kA++; + } + else if (A_col_map_offd[A_offd_j[kA]] > BT_col_map_offd[BT_offd_j[kB]]) + { + kB++; + } + else + { + diag += A_offd_a[kA] * BT_offd_a[kB]; + kA++; kB++; + } + } + + C_diag_a[i] = diag; + C_diag_j[i] = i; + C_diag_i[i + 1] = i + 1; + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatMatDiag + * + * Computes C = diag(A * B) + 
*--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatMatDiag(hypre_ParCSRMatrix *A, + hypre_ParCSRMatrix *B, + hypre_ParCSRMatrix **C_ptr) +{ + MPI_Comm comm = hypre_ParCSRMatrixComm(A); + HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A); + HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(B); + HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A); + HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A); + HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A); + hypre_ParCSRMatrix *C, *BT; + + /* Create and initialize output matrix C */ + C = hypre_ParCSRMatrixCreate(comm, global_num_rows, global_num_cols, + row_starts, row_starts, 0, num_rows, 0); + hypre_ParCSRMatrixInitialize_v2(C, memory_location); + + /* Transpose B for easier multiplication with A */ + hypre_ParCSRMatrixTranspose(B, &BT, 1); + +#if defined(HYPRE_USING_GPU) + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2(hypre_ParCSRMatrixMemoryLocation(A), + hypre_ParCSRMatrixMemoryLocation(BT)); + if (exec == HYPRE_EXEC_DEVICE) + { + hypre_ParCSRMatMatDiagDevice(A, BT, C); + } + else +#endif + { + hypre_ParCSRMatMatDiagHost(A, BT, C); + } + + /* Output pointer */ + *C_ptr = C; + + /* Free memory */ + hypre_ParCSRMatrixDestroy(BT); + + return hypre_error_flag; +} diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_matmat_device.c b/3rd_party/hypre/src/parcsr_mv/par_csr_matmat_device.c new file mode 100644 index 000000000..9731bfec0 --- /dev/null +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_matmat_device.c @@ -0,0 +1,146 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
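Because C = diag(A*B) only needs C(i,i) = sum_k A(i,k)*B(k,i), the routine transposes B once and reduces each diagonal entry to a dot product of row i of A with row i of B^T; with sorted column indices that is a two-pointer merge. A minimal sketch of the merge on plain index/value arrays (toy data, not the hypre structures):

#include <stdio.h>

/* Dot product of two sparse rows given as sorted (index, value) pairs. */
static double sparse_dot(int na, const int *ja, const double *va,
                         int nb, const int *jb, const double *vb)
{
   int ka = 0, kb = 0;
   double sum = 0.0;
   while (ka < na && kb < nb)
   {
      if      (ja[ka] < jb[kb]) { ka++; }
      else if (ja[ka] > jb[kb]) { kb++; }
      else                      { sum += va[ka++] * vb[kb++]; }
   }
   return sum;
}

int main(void)
{
   int    ja[] = {0, 2, 5},    jb[] = {2, 3, 5};
   double va[] = {1., 2., 3.}, vb[] = {4., 5., 6.};
   printf("dot = %g\n", sparse_dot(3, ja, va, 3, jb, vb)); /* 2*4 + 3*6 = 26 */
   return 0;
}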
+ * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "_hypre_parcsr_mv.h" +#include "_hypre_utilities.hpp" + +#if defined(HYPRE_USING_GPU) + +/*-------------------------------------------------------------------------- + * hypreGPUKernel_ParCSRMatMatDiag + *--------------------------------------------------------------------------*/ + +__global__ void +hypreGPUKernel_ParCSRMatMatDiag(hypre_DeviceItem &item, + HYPRE_Int num_rows, + HYPRE_Int *A_diag_i, + HYPRE_Int *A_diag_j, + HYPRE_Complex *A_diag_data, + HYPRE_Int *A_offd_i, + HYPRE_Int *A_offd_j, + HYPRE_Complex *A_offd_data, + HYPRE_Int *BT_diag_i, + HYPRE_Int *BT_diag_j, + HYPRE_Complex *BT_diag_data, + HYPRE_Int *BT_offd_i, + HYPRE_Int *BT_offd_j, + HYPRE_Complex *BT_offd_data, + HYPRE_BigInt *A_col_map_offd, + HYPRE_BigInt *BT_col_map_offd, + HYPRE_Complex *C_diag_data) +{ + const HYPRE_Int row = hypre_gpu_get_thread_id<1>(item); + + if (row < num_rows) + { + HYPRE_Complex sum = 0.0; + + /* Process diagonal part of A */ + HYPRE_Int kA = A_diag_i[row]; + HYPRE_Int kB = BT_diag_i[row]; + while (kA < A_diag_i[row + 1] && kB < BT_diag_i[row + 1]) + { + if (A_diag_j[kA] < BT_diag_j[kB]) + { + kA++; + } + else if (A_diag_j[kA] > BT_diag_j[kB]) + { + kB++; + } + else + { + sum += A_diag_data[kA] * BT_diag_data[kB]; + kA++; + kB++; + } + } + + /* Process off-diagonal part of A */ + kA = A_offd_i[row]; + kB = BT_offd_i[row]; + while (kA < A_offd_i[row + 1] && kB < BT_offd_i[row + 1]) + { + HYPRE_BigInt col_A = A_col_map_offd[A_offd_j[kA]]; + HYPRE_BigInt col_B = BT_col_map_offd[BT_offd_j[kB]]; + if (col_A < col_B) + { + kA++; + } + else if (col_A > col_B) + { + kB++; + } + else + { + sum += A_offd_data[kA] * BT_offd_data[kB]; + kA++; + kB++; + } + } + + C_diag_data[row] = sum; + } +} + +/*-------------------------------------------------------------------------- + * hypre_ParCSRMatMatDiagDevice + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_ParCSRMatMatDiagDevice(hypre_ParCSRMatrix *A, + hypre_ParCSRMatrix *BT, + hypre_ParCSRMatrix *C) +{ + HYPRE_Int num_rows = hypre_ParCSRMatrixNumRows(A); + hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A); + hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A); + hypre_CSRMatrix *BT_diag = hypre_ParCSRMatrixDiag(BT); + hypre_CSRMatrix *BT_offd = hypre_ParCSRMatrixOffd(BT); + hypre_CSRMatrix *C_diag = hypre_ParCSRMatrixDiag(C); + + HYPRE_Int *C_diag_i = hypre_CSRMatrixI(C_diag); + HYPRE_Int *C_diag_j = hypre_CSRMatrixJ(C_diag); + + hypre_GpuProfilingPushRange("ParCSRMatMatDiag"); + + /* Set up C_diag_i and C_diag_j */ + HYPRE_THRUST_CALL(sequence, C_diag_i, C_diag_i + num_rows + 1, 0); + HYPRE_THRUST_CALL(sequence, C_diag_j, C_diag_j + num_rows, 0); + + /* Update device column maps if needed */ + hypre_ParCSRMatrixCopyColMapOffdToDevice(A); + hypre_ParCSRMatrixCopyColMapOffdToDevice(BT); + + /* Launch GPU kernel */ + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "threads", bDim); + + HYPRE_GPU_LAUNCH( hypreGPUKernel_ParCSRMatMatDiag, gDim, bDim, + num_rows, + hypre_CSRMatrixI(A_diag), + hypre_CSRMatrixJ(A_diag), + hypre_CSRMatrixData(A_diag), + hypre_CSRMatrixI(A_offd), + hypre_CSRMatrixJ(A_offd), + hypre_CSRMatrixData(A_offd), + hypre_CSRMatrixI(BT_diag), + hypre_CSRMatrixJ(BT_diag), + hypre_CSRMatrixData(BT_diag), + hypre_CSRMatrixI(BT_offd), + hypre_CSRMatrixJ(BT_offd), + 
hypre_CSRMatrixData(BT_offd), + hypre_ParCSRMatrixDeviceColMapOffd(A), + hypre_ParCSRMatrixDeviceColMapOffd(BT), + hypre_CSRMatrixData(C_diag) ); + + hypre_GpuProfilingPopRange(); + + return hypre_error_flag; +} + +#endif /* if defined(HYPRE_USING_GPU) */ diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_matop.c b/3rd_party/hypre/src/parcsr_mv/par_csr_matop.c index 043a82778..4cae93eaa 100644 --- a/3rd_party/hypre/src/parcsr_mv/par_csr_matop.c +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_matop.c @@ -694,7 +694,7 @@ hypre_ParMatmul( hypre_ParCSRMatrix *A, HYPRE_Int i_begin, i_end; hypre_GetSimpleThreadPartition(&i_begin, &i_end, num_cols_offd_C); - HYPRE_Int cnt; + HYPRE_Int cnt = 0; if (i_end > i_begin) { cnt = hypre_BigLowerBound(col_map_offd_B, @@ -1275,10 +1275,10 @@ void hypre_ParCSRMatrixExtractBExt_Arrays_Overlap( HYPRE_BigInt *B_int_j; HYPRE_Int *B_ext_i; HYPRE_BigInt * B_ext_j; - HYPRE_Complex * B_ext_data; + HYPRE_Complex * B_ext_data = NULL; HYPRE_Complex * B_int_data = NULL; - HYPRE_BigInt * B_int_row_map; - HYPRE_BigInt * B_ext_row_map; + HYPRE_BigInt * B_int_row_map = NULL; + HYPRE_BigInt * B_ext_row_map = NULL; HYPRE_Int num_procs, my_id; HYPRE_Int *jdata_recv_vec_starts; HYPRE_Int *jdata_send_map_starts; @@ -3528,7 +3528,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix *A, HYPRE_BigInt *temp; HYPRE_Int *send_map_starts_A = NULL; - HYPRE_Int *send_map_elmts_A; + HYPRE_Int *send_map_elmts_A = NULL; HYPRE_Int num_sends_A = 0; HYPRE_Int num_cols_offd_C = 0; diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_matop_marked.c b/3rd_party/hypre/src/parcsr_mv/par_csr_matop_marked.c index 0bc2a0107..b6c44e701 100644 --- a/3rd_party/hypre/src/parcsr_mv/par_csr_matop_marked.c +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_matop_marked.c @@ -460,11 +460,7 @@ hypre_ParCSRMatrix * hypre_ParMatmul_FC( } } - if (num_cols_offd_C) - { - col_map_offd_C = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_HOST); - } - + col_map_offd_C = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_HOST); for (i = 0; i < num_cols_offd_C; i++) { col_map_offd_C[i] = temp[i]; @@ -475,21 +471,26 @@ hypre_ParCSRMatrix * hypre_ParMatmul_FC( hypre_TFree(temp, HYPRE_MEMORY_HOST); } - for (i = 0 ; i < P_ext_offd_size; i++) + for (i = 0; i < P_ext_offd_size; i++) + { P_ext_offd_j[i] = hypre_BigBinarySearch(col_map_offd_C, Ps_ext_j[i], num_cols_offd_C); + } + if (num_cols_offd_P) { map_P_to_C = hypre_CTAlloc(HYPRE_Int, num_cols_offd_P, HYPRE_MEMORY_HOST); cnt = 0; for (i = 0; i < num_cols_offd_C; i++) + { if (col_map_offd_C[i] == col_map_offd_P[cnt]) { map_P_to_C[cnt++] = i; if (cnt == num_cols_offd_P) { break; } } + } } if (num_procs > 1) { hypre_CSRMatrixDestroy(Ps_ext); } diff --git a/3rd_party/hypre/src/parcsr_mv/par_csr_matrix.c b/3rd_party/hypre/src/parcsr_mv/par_csr_matrix.c index fa5072403..7fe2ce773 100644 --- a/3rd_party/hypre/src/parcsr_mv/par_csr_matrix.c +++ b/3rd_party/hypre/src/parcsr_mv/par_csr_matrix.c @@ -233,7 +233,8 @@ hypre_ParCSRMatrixInitialize_v2( hypre_ParCSRMatrix *matrix, hypre_CSRMatrixInitialize_v2(hypre_ParCSRMatrixOffd(matrix), 0, memory_location); hypre_ParCSRMatrixColMapOffd(matrix) = - hypre_CTAlloc(HYPRE_BigInt, hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(matrix)), + hypre_CTAlloc(HYPRE_BigInt, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(matrix)), HYPRE_MEMORY_HOST); return hypre_error_flag; @@ -653,7 +654,7 @@ hypre_ParCSRMatrixCreateFromParVector(hypre_ParVector *b, if (hypre_VectorOwnsData(local_vector)) { hypre_CSRMatrixData(A_diag) = hypre_VectorData(local_vector); - 
hypre_VectorOwnsData(b) = 0; + hypre_VectorOwnsData(local_vector) = 0; } else { @@ -700,6 +701,9 @@ hypre_ParCSRMatrixCreateFromParVector(hypre_ParVector *b, hypre_TMemcpy(hypre_CSRMatrixJ(A_diag), A_diag_j, HYPRE_Int, num_nonzeros, memory_location, HYPRE_MEMORY_HOST); + + hypre_TFree(A_diag_i, HYPRE_MEMORY_HOST); + hypre_TFree(A_diag_j, HYPRE_MEMORY_HOST); } else { @@ -2780,6 +2784,7 @@ hypre_FillResponseParToCSRMatrix( void *p_recv_contact_buf, /*-------------------------------------------------------------------------- * hypre_ParCSRMatrixUnion + * * Creates and returns a new matrix whose elements are the union of A and B. * Data is not copied, only structural information is created. * A and B must have the same communicator, numbers and distributions of rows @@ -2848,14 +2853,17 @@ hypre_ParCSRMatrixUnion( hypre_ParCSRMatrix *A, * hypre_ParCSRMatrixTruncate * * Perform dual truncation of ParCSR matrix. + * * This code is adapted from original BoomerAMGInterpTruncate() + * * A: parCSR matrix to be modified * tol: relative tolerance or truncation factor for dropping small terms * max_row_elmts: maximum number of (largest) nonzero elements to keep. * rescale: Boolean on whether or not to scale resulting matrix. Scaling for - * each row satisfies: sum(nonzero values before dropping)/ sum(nonzero values after dropping), - * this way, the application of the truncated matrix on a constant vector is the same as that of - * the original matrix. + * each row satisfies: sum(nonzero values before dropping) / + * sum(nonzero values after dropping), + * this way, the application of the truncated matrix on a constant vector + * is the same as that of the original matrix. * nrm_type: type of norm used for dropping with tol. * -- 0 = infinity-norm * -- 1 = 1-norm diff --git a/3rd_party/hypre/src/parcsr_mv/par_vector.c b/3rd_party/hypre/src/parcsr_mv/par_vector.c index 5a9515679..738201b6e 100644 --- a/3rd_party/hypre/src/parcsr_mv/par_vector.c +++ b/3rd_party/hypre/src/parcsr_mv/par_vector.c @@ -1179,7 +1179,7 @@ hypre_ParVectorReadIJ( MPI_Comm comm, hypre_Vector *local_vector; HYPRE_Complex *local_data; HYPRE_BigInt big_local_size; - HYPRE_BigInt partitioning[2]; + HYPRE_BigInt partitioning[2] = {0, 0}; HYPRE_Int base_j; HYPRE_Int myid, num_procs, j; diff --git a/3rd_party/hypre/src/parcsr_mv/protos.h b/3rd_party/hypre/src/parcsr_mv/protos.h index b2666faf6..9d601d008 100644 --- a/3rd_party/hypre/src/parcsr_mv/protos.h +++ b/3rd_party/hypre/src/parcsr_mv/protos.h @@ -273,6 +273,10 @@ HYPRE_Int hypre_ParCSRFindExtendCommPkg(MPI_Comm comm, HYPRE_BigInt global_num_c hypre_IJAssumedPart *apart, HYPRE_Int indices_len, HYPRE_BigInt *indices, hypre_ParCSRCommPkg **extend_comm_pkg); +/* par_csr_filter.c */ +HYPRE_Int hypre_ParCSRMatrixBlkFilter(hypre_ParCSRMatrix *A, + HYPRE_Int block_size, hypre_ParCSRMatrix **B_ptr); + /* par_csr_matop.c */ HYPRE_Int hypre_ParCSRMatrixScale(hypre_ParCSRMatrix *A, HYPRE_Complex scalar); void hypre_ParMatmul_RowSizes ( HYPRE_MemoryLocation memory_location, HYPRE_Int **C_diag_i, @@ -413,6 +417,10 @@ HYPRE_Int hypre_ParCSRMatrixBlockColSum( hypre_ParCSRMatrix *A, HYPRE_Int row_ma hypre_DenseBlockMatrix **B_ptr ); HYPRE_Int hypre_ParCSRMatrixColSum( hypre_ParCSRMatrix *A, hypre_ParVector **B_ptr ); +/* par_csr_filter_device.c */ +HYPRE_Int hypre_ParCSRMatrixBlkFilterDevice(hypre_ParCSRMatrix *A, HYPRE_Int block_size, + hypre_ParCSRMatrix **B_ptr); + /* par_csr_matop_device.c */ HYPRE_Int hypre_ParCSRMatrixDiagScaleDevice ( hypre_ParCSRMatrix *par_A, hypre_ParVector *par_ld, 
hypre_ParVector *par_rd ); @@ -521,6 +529,14 @@ HYPRE_Int hypre_ParCSRMatrixStatsArrayCompute( HYPRE_Int num_matrices, hypre_ParCSRMatrix **matrices, hypre_MatrixStatsArray *stats_array ); +/* par_csr_matmat_device.c */ +HYPRE_Int hypre_ParCSRMatMatDiagDevice( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *BT, + hypre_ParCSRMatrix *C ); + +/* par_csr_matmat.c */ +HYPRE_Int hypre_ParCSRMatMatDiag( hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *B, + hypre_ParCSRMatrix **C_ptr ); + /* par_csr_matvec.c */ // y = alpha*A*x + beta*b HYPRE_Int hypre_ParCSRMatrixMatvecOutOfPlace ( HYPRE_Complex alpha, hypre_ParCSRMatrix *A, diff --git a/3rd_party/hypre/src/seq_block_mv/Makefile b/3rd_party/hypre/src/seq_block_mv/Makefile index dd06baa3e..0ec88cf5c 100644 --- a/3rd_party/hypre/src/seq_block_mv/Makefile +++ b/3rd_party/hypre/src/seq_block_mv/Makefile @@ -36,7 +36,7 @@ SONAME = libHYPRE_seq_block_mv-${HYPRE_RELEASE_VERSION}${HYPRE_LIB_SUFFIX} ################################################################## all: - make lib + $(MAKE) lib lib: libHYPRE_seq_block_mv${HYPRE_LIB_SUFFIX} cp -fR $(srcdir)/_hypre_seq_block_mv.h $(HYPRE_BUILD_DIR)/include diff --git a/3rd_party/hypre/src/seq_mv/CMakeLists.txt b/3rd_party/hypre/src/seq_mv/CMakeLists.txt index b41f60645..6a2a937d4 100644 --- a/3rd_party/hypre/src/seq_mv/CMakeLists.txt +++ b/3rd_party/hypre/src/seq_mv/CMakeLists.txt @@ -9,6 +9,7 @@ set(HDRS ) set(SRCS + csr_filter.c csr_matop.c csr_matrix.c csr_matvec.c diff --git a/3rd_party/hypre/src/seq_mv/Makefile b/3rd_party/hypre/src/seq_mv/Makefile index a258d8bab..d7a41d87f 100644 --- a/3rd_party/hypre/src/seq_mv/Makefile +++ b/3rd_party/hypre/src/seq_mv/Makefile @@ -24,6 +24,7 @@ HEADERS =\ vector.h FILES =\ + csr_filter.c\ csr_matop.c\ csr_matrix.c\ csr_matvec.c\ diff --git a/3rd_party/hypre/src/seq_mv/csr_filter.c b/3rd_party/hypre/src/seq_mv/csr_filter.c new file mode 100644 index 000000000..af371cc95 --- /dev/null +++ b/3rd_party/hypre/src/seq_mv/csr_filter.c @@ -0,0 +1,52 @@ +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Methods for matrix truncation/filtering + * + *****************************************************************************/ + +#include "seq_mv.h" + +/*-------------------------------------------------------------------------- + * hypre_CSRMatrixTruncateDiag + * + * Truncates the input matrix to its diagonal portion. 
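A short usage sketch for the two new ParCSR helpers declared above; the wrapper name example_blk_and_diag and the choice B = A are illustrative assumptions, and the caller is expected to destroy the returned matrices with hypre_ParCSRMatrixDestroy.

#include "_hypre_parcsr_mv.h"

/* Illustrative helper (hypothetical name): extract the block-diagonal pattern
 * of A and the diagonal of A*A using the routines declared above. */
static HYPRE_Int
example_blk_and_diag(hypre_ParCSRMatrix  *A,
                     HYPRE_Int            block_size,
                     hypre_ParCSRMatrix **B_ptr,
                     hypre_ParCSRMatrix **C_ptr)
{
   /* B keeps only entries a(i,j) with (i % block_size) == (j % block_size) */
   hypre_ParCSRMatrixBlkFilter(A, block_size, B_ptr);

   /* C is a diagonal ParCSR matrix holding diag(A * A) */
   hypre_ParCSRMatMatDiag(A, A, C_ptr);

   return hypre_error_flag;
}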
+ *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_CSRMatrixTruncateDiag(hypre_CSRMatrix *A) +{ + HYPRE_MemoryLocation memory_location = hypre_CSRMatrixMemoryLocation(A); + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_a; + HYPRE_Int *A_i, *A_j; + + /* Extract diagonal */ + A_a = hypre_TAlloc(HYPRE_Complex, num_rows, memory_location); + hypre_CSRMatrixExtractDiagonal(A, A_a, 0); + + /* Free old matrix data */ + hypre_TFree(hypre_CSRMatrixData(A), memory_location); + hypre_TFree(hypre_CSRMatrixI(A), memory_location); + hypre_TFree(hypre_CSRMatrixJ(A), memory_location); + + /* Update matrix sparsity pattern */ + A_i = hypre_TAlloc(HYPRE_Int, num_rows + 1, memory_location); + A_j = hypre_TAlloc(HYPRE_Int, num_rows, memory_location); + hypre_IntSequence(memory_location, num_rows + 1, A_i); + hypre_IntSequence(memory_location, num_rows, A_j); + + /* Update matrix pointers and number of nonzero entries */ + hypre_CSRMatrixNumNonzeros(A) = num_rows; + hypre_CSRMatrixI(A) = A_i; + hypre_CSRMatrixJ(A) = A_j; + hypre_CSRMatrixData(A) = A_a; + + return hypre_error_flag; +} diff --git a/3rd_party/hypre/src/seq_mv/csr_matop.c b/3rd_party/hypre/src/seq_mv/csr_matop.c index be51db25c..cbedfb8c2 100644 --- a/3rd_party/hypre/src/seq_mv/csr_matop.c +++ b/3rd_party/hypre/src/seq_mv/csr_matop.c @@ -1055,7 +1055,7 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A, HYPRE_Int num_nnzs_A = hypre_CSRMatrixNumNonzeros(A); HYPRE_MemoryLocation memory_location = hypre_CSRMatrixMemoryLocation(A); - HYPRE_Complex *AT_data; + HYPRE_Complex *AT_data = NULL; HYPRE_Int *AT_j; HYPRE_Int num_rows_AT; HYPRE_Int num_cols_AT; diff --git a/3rd_party/hypre/src/seq_mv/csr_matop_device.c b/3rd_party/hypre/src/seq_mv/csr_matop_device.c index c1bbad75c..e495fac8d 100644 --- a/3rd_party/hypre/src/seq_mv/csr_matop_device.c +++ b/3rd_party/hypre/src/seq_mv/csr_matop_device.c @@ -3331,9 +3331,6 @@ hypre_CSRMatrixILU0(hypre_CSRMatrix *A) HYPRE_Int zero_pivot; char errmsg[1024]; - HYPRE_ANNOTATE_FUNC_BEGIN; - hypre_GpuProfilingPushRange("CSRMatrixILU0"); - /* Sanity check */ if (num_rows != num_cols) { @@ -3341,6 +3338,9 @@ hypre_CSRMatrixILU0(hypre_CSRMatrix *A) return hypre_error_flag; } + HYPRE_ANNOTATE_FUNC_BEGIN; + hypre_GpuProfilingPushRange("CSRMatrixILU0"); + /*------------------------------------------------------------------------------------- * 1. 
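After hypre_CSRMatrixTruncateDiag, the matrix holds exactly one entry per row at column i, so the I and J arrays become simple integer sequences. A tiny sketch of the resulting CSR layout for a 3x3 matrix (toy diagonal values, plain C):

#include <stdio.h>

int main(void)
{
   double d[3]   = {4.0, 5.0, 6.0};   /* extracted diagonal (toy values)   */
   int    A_i[4] = {0, 1, 2, 3};      /* row pointers: one nonzero per row */
   int    A_j[3] = {0, 1, 2};         /* column index j equals row index i */
   for (int i = 0; i < 3; i++)
      printf("row %d: A(%d,%d) = %g\n", i, i, A_j[i], d[A_i[i]]);
   return 0;
}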
Sort columns inside each row first, we can't assume that's sorted *-------------------------------------------------------------------------------------*/ diff --git a/3rd_party/hypre/src/seq_mv/protos.h b/3rd_party/hypre/src/seq_mv/protos.h index b2fe3beb8..6d16377a1 100644 --- a/3rd_party/hypre/src/seq_mv/protos.h +++ b/3rd_party/hypre/src/seq_mv/protos.h @@ -5,6 +5,9 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +/* csr_filter.c */ +HYPRE_Int hypre_CSRMatrixTruncateDiag(hypre_CSRMatrix *A); + /* csr_matop.c */ HYPRE_Int hypre_CSRMatrixAddFirstPass ( HYPRE_Int firstrow, HYPRE_Int lastrow, HYPRE_Int *marker, HYPRE_Int *twspace, HYPRE_Int *map_A2C, HYPRE_Int *map_B2C, diff --git a/3rd_party/hypre/src/seq_mv/seq_mv.h b/3rd_party/hypre/src/seq_mv/seq_mv.h index 8ef72b5cc..7a5e04061 100644 --- a/3rd_party/hypre/src/seq_mv/seq_mv.h +++ b/3rd_party/hypre/src/seq_mv/seq_mv.h @@ -284,6 +284,9 @@ typedef struct * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +/* csr_filter.c */ +HYPRE_Int hypre_CSRMatrixTruncateDiag(hypre_CSRMatrix *A); + /* csr_matop.c */ HYPRE_Int hypre_CSRMatrixAddFirstPass ( HYPRE_Int firstrow, HYPRE_Int lastrow, HYPRE_Int *marker, HYPRE_Int *twspace, HYPRE_Int *map_A2C, HYPRE_Int *map_B2C, diff --git a/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_ls.h b/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_ls.h index d1c9d18c5..c4e2356fe 100644 --- a/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_ls.h +++ b/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_ls.h @@ -618,7 +618,7 @@ HYPRE_SStructMaxwellSetGrad(HYPRE_SStructSolver solver, **/ HYPRE_Int HYPRE_SStructMaxwellSetRfactors(HYPRE_SStructSolver solver, - HYPRE_Int rfactors[HYPRE_MAXDIM]); + HYPRE_Int *rfactors); /** * Finds the physical boundary row ranks on all levels. 
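The rfactors signature change above is cosmetic as far as the C type system is concerned: a sized array parameter is adjusted to a pointer, so the bracketed and pointer forms declare the same function type. A two-line illustration (set_rfactors is a hypothetical name):

/* Array parameters decay to pointers: these two prototypes declare the same type,
 * so redeclaring the function this way is legal and changes nothing for callers. */
void set_rfactors(int rfactors[3]);
void set_rfactors(int *rfactors);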
@@ -626,7 +626,7 @@ HYPRE_SStructMaxwellSetRfactors(HYPRE_SStructSolver solver, HYPRE_Int HYPRE_SStructMaxwellPhysBdy(HYPRE_SStructGrid *grid_l, HYPRE_Int num_levels, - HYPRE_Int rfactors[HYPRE_MAXDIM], + HYPRE_Int *rfactors, HYPRE_Int ***BdryRanks_ptr, HYPRE_Int **BdryRanksCnt_ptr ); diff --git a/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_maxwell.c b/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_maxwell.c index 9dbd095a8..02de04411 100644 --- a/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_maxwell.c +++ b/3rd_party/hypre/src/sstruct_ls/HYPRE_sstruct_maxwell.c @@ -111,7 +111,7 @@ HYPRE_SStructMaxwellSetGrad( HYPRE_SStructSolver solver, *--------------------------------------------------------------------------*/ HYPRE_Int HYPRE_SStructMaxwellSetRfactors( HYPRE_SStructSolver solver, - HYPRE_Int rfactors[3] ) + HYPRE_Int *rfactors ) { return ( hypre_MaxwellSetRfactors( (void *) solver, rfactors ) ); @@ -234,7 +234,7 @@ HYPRE_SStructMaxwellGetFinalRelativeResidualNorm( HYPRE_SStructSolver solver, HYPRE_Int HYPRE_SStructMaxwellPhysBdy( HYPRE_SStructGrid *grid_l, HYPRE_Int num_levels, - HYPRE_Int rfactors[3], + HYPRE_Int *rfactors, HYPRE_Int ***BdryRanks_ptr, HYPRE_Int **BdryRanksCnt_ptr ) { diff --git a/3rd_party/hypre/src/sstruct_ls/_hypre_sstruct_ls.h b/3rd_party/hypre/src/sstruct_ls/_hypre_sstruct_ls.h index daac65e56..3aeb553c7 100644 --- a/3rd_party/hypre/src/sstruct_ls/_hypre_sstruct_ls.h +++ b/3rd_party/hypre/src/sstruct_ls/_hypre_sstruct_ls.h @@ -484,7 +484,7 @@ HYPRE_Int HYPRE_SStructMaxwellSolve2 ( HYPRE_SStructSolver solver, HYPRE_SStruct HYPRE_Int HYPRE_MaxwellGrad ( HYPRE_SStructGrid grid, HYPRE_ParCSRMatrix *T ); HYPRE_Int HYPRE_SStructMaxwellSetGrad ( HYPRE_SStructSolver solver, HYPRE_ParCSRMatrix T ); HYPRE_Int HYPRE_SStructMaxwellSetRfactors ( HYPRE_SStructSolver solver, - HYPRE_Int rfactors [HYPRE_MAXDIM]); + HYPRE_Int *rfactors); HYPRE_Int HYPRE_SStructMaxwellSetTol ( HYPRE_SStructSolver solver, HYPRE_Real tol ); HYPRE_Int HYPRE_SStructMaxwellSetConstantCoef ( HYPRE_SStructSolver solver, HYPRE_Int constant_coef ); @@ -502,7 +502,7 @@ HYPRE_Int HYPRE_SStructMaxwellGetNumIterations ( HYPRE_SStructSolver solver, HYPRE_Int HYPRE_SStructMaxwellGetFinalRelativeResidualNorm ( HYPRE_SStructSolver solver, HYPRE_Real *norm ); HYPRE_Int HYPRE_SStructMaxwellPhysBdy ( HYPRE_SStructGrid *grid_l, HYPRE_Int num_levels, - HYPRE_Int rfactors [HYPRE_MAXDIM], HYPRE_Int ***BdryRanks_ptr, HYPRE_Int **BdryRanksCnt_ptr ); + HYPRE_Int *rfactors, HYPRE_Int ***BdryRanks_ptr, HYPRE_Int **BdryRanksCnt_ptr ); HYPRE_Int HYPRE_SStructMaxwellEliminateRowsCols ( HYPRE_ParCSRMatrix parA, HYPRE_Int nrows, HYPRE_Int *rows ); HYPRE_Int HYPRE_SStructMaxwellZeroVector ( HYPRE_ParVector v, HYPRE_Int *rows, HYPRE_Int nrows ); diff --git a/3rd_party/hypre/src/sstruct_ls/fac_amr_fcoarsen.c b/3rd_party/hypre/src/sstruct_ls/fac_amr_fcoarsen.c index a3e0cdafe..52cd8cf77 100644 --- a/3rd_party/hypre/src/sstruct_ls/fac_amr_fcoarsen.c +++ b/3rd_party/hypre/src/sstruct_ls/fac_amr_fcoarsen.c @@ -205,6 +205,7 @@ hypre_AMR_FCoarsen( hypre_SStructMatrix * A, hypre_BoxInit(&intersect_box, ndim); hypre_BoxInit(&loop_box, ndim); hypre_BoxInit(&coarse_cell_box, ndim); + hypre_SetIndex3(lindex, 0, 0, 0); /*-------------------------------------------------------------------------- * Task: Coarsen the fbox and f/c connections to form the coarse grid diff --git a/3rd_party/hypre/src/sstruct_ls/fac_interp2.c b/3rd_party/hypre/src/sstruct_ls/fac_interp2.c index 71157ec76..d6031d33f 100644 --- a/3rd_party/hypre/src/sstruct_ls/fac_interp2.c 
+++ b/3rd_party/hypre/src/sstruct_ls/fac_interp2.c @@ -760,8 +760,8 @@ hypre_FAC_WeightedInterp2(void *fac_interp_vdata, HYPRE_Int part_fine = 1; HYPRE_Real xweight1, xweight2; - HYPRE_Real yweight1, yweight2 = 0.0; - HYPRE_Real zweight1, zweight2 = 0.0; + HYPRE_Real yweight1 = 0.0, yweight2 = 0.0; + HYPRE_Real zweight1 = 0.0, zweight2 = 0.0; /*----------------------------------------------------------------------- * Initialize some things diff --git a/3rd_party/hypre/src/sstruct_ls/maxwell_TV.c b/3rd_party/hypre/src/sstruct_ls/maxwell_TV.c index b597e76c2..19044c04e 100644 --- a/3rd_party/hypre/src/sstruct_ls/maxwell_TV.c +++ b/3rd_party/hypre/src/sstruct_ls/maxwell_TV.c @@ -169,7 +169,7 @@ hypre_MaxwellTVDestroy( void *maxwell_vdata ) *--------------------------------------------------------------------------*/ HYPRE_Int hypre_MaxwellSetRfactors(void *maxwell_vdata, - HYPRE_Int rfactor[3] ) + HYPRE_Int rfactor[HYPRE_MAXDIM] ) { hypre_MaxwellData *maxwell_data = (hypre_MaxwellData *)maxwell_vdata; hypre_Index *maxwell_rfactor = (maxwell_data -> rfactor); diff --git a/3rd_party/hypre/src/sstruct_ls/maxwell_semi_interp.c b/3rd_party/hypre/src/sstruct_ls/maxwell_semi_interp.c index 14759d44c..b761983e6 100644 --- a/3rd_party/hypre/src/sstruct_ls/maxwell_semi_interp.c +++ b/3rd_party/hypre/src/sstruct_ls/maxwell_semi_interp.c @@ -1605,6 +1605,11 @@ hypre_Maxwell_PTopology( hypre_SStructGrid *fgrid_edge, m = rfactor[2]; break; } + + default: + { + m = 0; + } } for (i = nEdges; i < nEdges + j; i++) /*fill in the column size for Edge */ diff --git a/3rd_party/hypre/src/sstruct_ls/nd1_amge_interpolation.c b/3rd_party/hypre/src/sstruct_ls/nd1_amge_interpolation.c index 052012a1a..8af35e781 100644 --- a/3rd_party/hypre/src/sstruct_ls/nd1_amge_interpolation.c +++ b/3rd_party/hypre/src/sstruct_ls/nd1_amge_interpolation.c @@ -161,6 +161,7 @@ HYPRE_Int hypre_ND1AMGeInterpolation (hypre_ParCSRMatrix * Aee, } else { + col_ind1 = NULL; size1 = 0; } diff --git a/3rd_party/hypre/src/sstruct_ls/node_relax.c b/3rd_party/hypre/src/sstruct_ls/node_relax.c index bd2c6c92a..1f9c1bfed 100644 --- a/3rd_party/hypre/src/sstruct_ls/node_relax.c +++ b/3rd_party/hypre/src/sstruct_ls/node_relax.c @@ -743,8 +743,8 @@ hypre_NodeRelax( void *relax_vdata, HYPRE_Int vi, vj, err; //HYPRE_Real *A_loc = tA_loc + hypre_BoxLoopBlock() * nvars * nvars; //HYPRE_Real *x_loc = tx_loc + hypre_BoxLoopBlock() * nvars; - HYPRE_Real A_loc[HYPRE_MAXVARS * HYPRE_MAXVARS]; - HYPRE_Real x_loc[HYPRE_MAXVARS]; + HYPRE_Real A_loc[HYPRE_MAXVARS * HYPRE_MAXVARS] = {0}; + HYPRE_Real x_loc[HYPRE_MAXVARS] = {0}; /*------------------------------------------------ * Copy rhs and matrix for diagonal coupling * (intra-nodal) into local storage. 
@@ -948,8 +948,8 @@ hypre_NodeRelax( void *relax_vdata, HYPRE_Real *A_loc = tA_loc + hypre_BoxLoopBlock() * nvars * nvars; HYPRE_Real *x_loc = tx_loc + hypre_BoxLoopBlock() * nvars; */ - HYPRE_Real A_loc[HYPRE_MAXVARS * HYPRE_MAXVARS]; - HYPRE_Real x_loc[HYPRE_MAXVARS]; + HYPRE_Real A_loc[HYPRE_MAXVARS * HYPRE_MAXVARS] = {0}; + HYPRE_Real x_loc[HYPRE_MAXVARS] = {0}; /*------------------------------------------------ * Copy rhs and matrix for diagonal coupling diff --git a/3rd_party/hypre/src/struct_ls/CMakeLists.txt b/3rd_party/hypre/src/struct_ls/CMakeLists.txt index 3c5fc60ff..2df3d0ab6 100644 --- a/3rd_party/hypre/src/struct_ls/CMakeLists.txt +++ b/3rd_party/hypre/src/struct_ls/CMakeLists.txt @@ -73,6 +73,8 @@ set(SRCS sparse_msg_setup.c sparse_msg_setup_rap.c sparse_msg_solve.c + F90_HYPRE_struct_flexgmres.c + F90_HYPRE_struct_lgmres.c ) target_sources(${PROJECT_NAME} PRIVATE ${SRCS} diff --git a/3rd_party/hypre/src/struct_ls/sparse_msg_solve.c b/3rd_party/hypre/src/struct_ls/sparse_msg_solve.c index 7fe2bb2bc..863374ae4 100644 --- a/3rd_party/hypre/src/struct_ls/sparse_msg_solve.c +++ b/3rd_party/hypre/src/struct_ls/sparse_msg_solve.c @@ -68,7 +68,7 @@ hypre_SparseMSGSolve( void *smsg_vdata, HYPRE_Int *restrict_count; - HYPRE_Real b_dot_b, r_dot_r, eps; + HYPRE_Real b_dot_b = 0.0, r_dot_r, eps = 0.0; HYPRE_Real e_dot_e = 1.0, x_dot_x = 1.0; HYPRE_Int i, l, lx, ly, lz; @@ -164,7 +164,7 @@ hypre_SparseMSGSolve( void *smsg_vdata, if (logging > 0) { norms[i] = hypre_sqrt(r_dot_r); - if (b_dot_b > 0) + if (b_dot_b > 0.0) { rel_norms[i] = hypre_sqrt(r_dot_r / b_dot_b); } diff --git a/3rd_party/hypre/src/struct_mv/_hypre_struct_mv.h b/3rd_party/hypre/src/struct_mv/_hypre_struct_mv.h index 42d51c4a4..fdb019e16 100644 --- a/3rd_party/hypre/src/struct_mv/_hypre_struct_mv.h +++ b/3rd_party/hypre/src/struct_mv/_hypre_struct_mv.h @@ -211,6 +211,11 @@ if (hypre__num_blocks > 0)\ {\ hypre__div = hypre__tot / hypre__num_blocks;\ hypre__mod = hypre__tot % hypre__num_blocks;\ +}\ +else\ +{\ + hypre__div = 0;\ + hypre__mod = 0;\ } #define zypre_BoxLoopInitK(k, dboxk, startk, stridek, ik) \ @@ -408,7 +413,6 @@ for (I = 0; I < n[0]; I++) #endif #endif /* #ifndef hypre_BOX_HEADER */ - /****************************************************************************** * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
diff --git a/3rd_party/hypre/src/struct_mv/box.h b/3rd_party/hypre/src/struct_mv/box.h index 48a7d6f65..8d4c97dee 100644 --- a/3rd_party/hypre/src/struct_mv/box.h +++ b/3rd_party/hypre/src/struct_mv/box.h @@ -194,6 +194,11 @@ if (hypre__num_blocks > 0)\ {\ hypre__div = hypre__tot / hypre__num_blocks;\ hypre__mod = hypre__tot % hypre__num_blocks;\ +}\ +else\ +{\ + hypre__div = 0;\ + hypre__mod = 0;\ } #define zypre_BoxLoopInitK(k, dboxk, startk, stridek, ik) \ @@ -391,4 +396,3 @@ for (I = 0; I < n[0]; I++) #endif #endif /* #ifndef hypre_BOX_HEADER */ - diff --git a/3rd_party/hypre/src/utilities/CMakeLists.txt b/3rd_party/hypre/src/utilities/CMakeLists.txt index 6defb4f68..648c50b49 100644 --- a/3rd_party/hypre/src/utilities/CMakeLists.txt +++ b/3rd_party/hypre/src/utilities/CMakeLists.txt @@ -48,6 +48,7 @@ set(SRCS qsplit.c random.c state.c + stl_ops.c threading.c timer.c timing.c @@ -68,6 +69,7 @@ if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) memory_tracker.c nvtx.c omp_device.c + stl_ops.c HYPRE_handle.c ) convert_filenames_to_full_paths(GPU_SRCS) diff --git a/3rd_party/hypre/src/utilities/HYPRE_handle.c b/3rd_party/hypre/src/utilities/HYPRE_handle.c index 2a12657b6..c052dc4cf 100644 --- a/3rd_party/hypre/src/utilities/HYPRE_handle.c +++ b/3rd_party/hypre/src/utilities/HYPRE_handle.c @@ -13,9 +13,20 @@ #include "_hypre_utilities.h" +/*-------------------------------------------------------------------------- + * HYPRE_SetLogLevel + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetLogLevel( HYPRE_Int log_level ) +{ + return hypre_SetLogLevel(log_level); +} + /*-------------------------------------------------------------------------- * HYPRE_SetSpTransUseVendor *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetSpTransUseVendor( HYPRE_Int use_vendor ) { @@ -25,6 +36,7 @@ HYPRE_SetSpTransUseVendor( HYPRE_Int use_vendor ) /*-------------------------------------------------------------------------- * HYPRE_SetSpMVUseVendor *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetSpMVUseVendor( HYPRE_Int use_vendor ) { @@ -34,6 +46,7 @@ HYPRE_SetSpMVUseVendor( HYPRE_Int use_vendor ) /*-------------------------------------------------------------------------- * HYPRE_SetSpGemmUseVendor *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetSpGemmUseVendor( HYPRE_Int use_vendor ) { @@ -43,6 +56,7 @@ HYPRE_SetSpGemmUseVendor( HYPRE_Int use_vendor ) /*-------------------------------------------------------------------------- * HYPRE_SetUseGpuRand *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetUseGpuRand( HYPRE_Int use_gpu_rand ) { @@ -52,6 +66,7 @@ HYPRE_SetUseGpuRand( HYPRE_Int use_gpu_rand ) /*-------------------------------------------------------------------------- * HYPRE_SetGPUAwareMPI *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi ) { diff --git a/3rd_party/hypre/src/utilities/HYPRE_utilities.h b/3rd_party/hypre/src/utilities/HYPRE_utilities.h index ba7032470..2256a6c7e 100644 --- a/3rd_party/hypre/src/utilities/HYPRE_utilities.h +++ b/3rd_party/hypre/src/utilities/HYPRE_utilities.h @@ -232,6 +232,44 @@ HYPRE_Int HYPRE_PrintErrorMessages(MPI_Comm comm); /* Print GPU information */ HYPRE_Int HYPRE_PrintDeviceInfo(void); +/** + * @brief Prints the memory usage of the 
current process. + * + * This function prints the memory usage details of the process to standard output. + * It provides information such as the virtual memory size, resident set size, + * and other related statistics including GPU memory usage for device builds. + * + * @param[in] comm The MPI communicator. This parameter allows the function + * to print memory usage information for the process within + * the context of an MPI program. + * + * @param[in] level The level of detail in the memory statistics output. + * - 1 : Display memory usage statistics for each MPI rank. + * - 2 : Display aggregate memory usage statistics over MPI ranks. + * + * @param[in] function The name of the function from which `HYPRE_MemoryPrintUsage` + * is called. This is typically set to `__func__`, which + * automatically captures the name of the calling function. + * This variable can also be used to denote a region name. + * + * @param[in] line The line number in the source file where `HYPRE_MemoryPrintUsage` + * is called. This is typically set to `__LINE__`, which + * automatically captures the line number. The line number can be + * omitted by passing a negative value to this variable. + * + * @return Returns an integer status code. `0` indicates success, while + * a non-zero value indicates an error occurred. + * + * @note The function is designed to be platform-independent but + * may provide different levels of detail depending on the + * underlying operating system (e.g., Linux, macOS). However, + * this function does not lead to correct memory usage statistics + * on Windows platforms. + */ + +HYPRE_Int HYPRE_MemoryPrintUsage(MPI_Comm comm, HYPRE_Int level, const char *function, + HYPRE_Int line); + /*-------------------------------------------------------------------------- * HYPRE Version routines *--------------------------------------------------------------------------*/ @@ -341,12 +379,111 @@ HYPRE_Int HYPRE_SetGPUMemoryPoolSize(HYPRE_Int bin_growth, HYPRE_Int min_bin, HY * HYPRE handle *--------------------------------------------------------------------------*/ -HYPRE_Int HYPRE_SetSpTransUseVendor( HYPRE_Int use_vendor ); -HYPRE_Int HYPRE_SetSpMVUseVendor( HYPRE_Int use_vendor ); +/** + * Sets the logging level for the HYPRE library. + * + * The following options are available for \e log_level: + * + * - 0 : (default) No messaging. + * - 1 : Display memory usage statistics for each MPI rank. + * - 2 : Display aggregate memory usage statistics over MPI ranks. + * + * @note Log level codes can be combined using bitwise OR to enable multiple + * logging behaviors simultaneously. + * + * @param log_level The logging level to set. + * + * @return Returns hypre's global error code, where 0 indicates success. + **/ +HYPRE_Int HYPRE_SetLogLevel(HYPRE_Int log_level); + +/** + * Specifies the algorithm used for sparse matrix transposition in device builds. + * + * The following options are available for \e use_vendor: + * + * - 0 : Use hypre's internal implementation. + * - 1 : (default) Use the vendor library's implementation. This includes: + * - cuSPARSE for CUDA (HYPRE_USING_CUSPARSE) + * - rocSPARSE for HIP (HYPRE_USING_ROCSPARSE) + * - oneMKL for SYCL (HYPRE_USING_ONEMKLSPARSE) + * + * @param use_vendor Indicates whether to use the internal or vendor-provided implementation. + * + * @return Returns hypre's global error code, where 0 indicates success. 
+ **/ +HYPRE_Int HYPRE_SetSpTransUseVendor(HYPRE_Int use_vendor); + +/** + * Specifies the algorithm used for sparse matrix/vector multiplication in device builds. + * + * The following options are available for \e use_vendor: + * + * - 0 : Use hypre's internal implementation. + * - 1 : (default) Use the vendor library's implementation. This includes: + * - cuSPARSE for CUDA (HYPRE_USING_CUSPARSE) + * - rocSPARSE for HIP (HYPRE_USING_ROCSPARSE) + * - oneMKL for SYCL (HYPRE_USING_ONEMKLSPARSE) + * + * @param use_vendor Indicates whether to use the internal or vendor-provided implementation. + * + * @return Returns hypre's global error code, where 0 indicates success. + **/ +HYPRE_Int HYPRE_SetSpMVUseVendor(HYPRE_Int use_vendor); + +/** + * Specifies the algorithm used for sparse matrix/matrix multiplication in device builds. + * + * The following options are available for \e use_vendor: + * + * - 0 : Use hypre's internal implementation. + * - 1 : Use the vendor library's implementation. This includes: + * - cuSPARSE for CUDA (HYPRE_USING_CUSPARSE) + * - rocSPARSE for HIP (HYPRE_USING_ROCSPARSE) + * - oneMKL for SYCL (HYPRE_USING_ONEMKLSPARSE) + * + * @param use_vendor Indicates whether to use the internal or vendor-provided implementation. + * + * @note The default value is 1, except for CUDA builds, which is zero. + * + * @return Returns hypre's global error code, where 0 indicates success. + **/ +HYPRE_Int HYPRE_SetSpGemmUseVendor( HYPRE_Int use_vendor ); /* Backwards compatibility with HYPRE_SetSpGemmUseCusparse() */ #define HYPRE_SetSpGemmUseCusparse(use_vendor) HYPRE_SetSpGemmUseVendor(use_vendor) -HYPRE_Int HYPRE_SetSpGemmUseVendor( HYPRE_Int use_vendor ); + +/** + * Specifies the algorithm used for generating random numbers in device builds. + * + * The following options are available for \e use_curand: + * + * - 0 : random numbers are generated on the host and copied to device memory. + * - 1 : (default) Use the vendor library's implementation. This includes: + * - cuSPARSE for CUDA (HYPRE_USING_CUSPARSE) + * - rocSPARSE for HIP (HYPRE_USING_ROCSPARSE) + * - oneMKL for SYCL (HYPRE_USING_ONEMKLSPARSE) + * + * @param use_curand Indicates whether to use the vendor-provided implementation or not. + * + * @return Returns hypre's global error code, where 0 indicates success. + **/ + HYPRE_Int HYPRE_SetUseGpuRand( HYPRE_Int use_curand ); + +/** + * Configures the usage of GPU-aware MPI for communication in device builds. + * + * The following options are available for \e use_gpu_aware_mpi: + * + * - 0 : MPI buffers are transferred between device and host memory. Communication occurs on the host. + * - 1 : MPI communication is performed directly from the device using device-resident buffers. + * + * @param use_gpu_aware_mpi Specifies whether to enable GPU-aware MPI communication or not. + * + * @note This option requires hypre to be configured with GPU-aware MPI support for it to take effect. + * + * @return Returns hypre's global error code, where 0 indicates success. 
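A brief configuration sketch exercising the runtime switches documented above; the chosen values are illustrative for a host-staged MPI build, not recommendations.

#include <mpi.h>
#include "HYPRE_utilities.h"

/* Illustrative setup (hypothetical function name, error checking omitted). */
void configure_hypre_runtime(MPI_Comm comm)
{
   HYPRE_SetLogLevel(1);             /* per-rank memory usage messages          */
   HYPRE_SetSpMVUseVendor(1);        /* vendor SpMV (cuSPARSE/rocSPARSE/oneMKL) */
   HYPRE_SetSpGemmUseVendor(0);      /* hypre's internal SpGEMM                 */
   HYPRE_SetUseGpuRand(1);           /* device-side random number generation    */
   HYPRE_SetGpuAwareMPI(0);          /* stage MPI buffers through the host      */

   /* Print per-rank memory statistics at this point in the program */
   HYPRE_MemoryPrintUsage(comm, 1, __func__, __LINE__);
}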
+ **/ HYPRE_Int HYPRE_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi ); /*-------------------------------------------------------------------------- diff --git a/3rd_party/hypre/src/utilities/Makefile b/3rd_party/hypre/src/utilities/Makefile index 81f1b4e14..8a4f60b12 100644 --- a/3rd_party/hypre/src/utilities/Makefile +++ b/3rd_party/hypre/src/utilities/Makefile @@ -76,7 +76,8 @@ CUFILES=\ int_array_device.c\ memory.c\ omp_device.c\ - nvtx.c + nvtx.c\ + stl_ops.c COBJS = ${FILES:.c=.o} CUOBJS = ${CUFILES:.c=.obj} diff --git a/3rd_party/hypre/src/utilities/_hypre_utilities.h b/3rd_party/hypre/src/utilities/_hypre_utilities.h index d9b7a430a..978041ba2 100644 --- a/3rd_party/hypre/src/utilities/_hypre_utilities.h +++ b/3rd_party/hypre/src/utilities/_hypre_utilities.h @@ -14,6 +14,154 @@ extern "C" { #endif +/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * General structures and values + * + *****************************************************************************/ + +#ifndef HYPRE_HANDLE_H +#define HYPRE_HANDLE_H + +#if defined(HYPRE_USING_UMPIRE) +#include "umpire/config.hpp" +#if UMPIRE_VERSION_MAJOR >= 2022 +#include "umpire/interface/c_fortran/umpire.h" +#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_quick_pool +#else +#include "umpire/interface/umpire.h" +#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_pool +#endif /* UMPIRE_VERSION_MAJOR >= 2022 */ +#define HYPRE_UMPIRE_POOL_NAME_MAX_LEN 1024 +#endif /* defined(HYPRE_USING_UMPIRE) */ + +struct hypre_DeviceData; +typedef struct hypre_DeviceData hypre_DeviceData; +typedef void (*GPUMallocFunc)(void **, size_t); +typedef void (*GPUMfreeFunc)(void *); + +typedef struct +{ + HYPRE_Int log_level; + HYPRE_Int hypre_error; + HYPRE_MemoryLocation memory_location; + HYPRE_ExecutionPolicy default_exec_policy; + + /* the device buffers needed to do MPI communication for struct comm */ + HYPRE_Complex *struct_comm_recv_buffer; + HYPRE_Complex *struct_comm_send_buffer; + HYPRE_Int struct_comm_recv_buffer_size; + HYPRE_Int struct_comm_send_buffer_size; + + /* GPU MPI */ +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_DEVICE_OPENMP) + HYPRE_Int use_gpu_aware_mpi; +#endif + +#if defined(HYPRE_USING_GPU) + hypre_DeviceData *device_data; + HYPRE_Int device_gs_method; /* device G-S options */ +#endif + + /* user malloc/free function pointers */ + GPUMallocFunc user_device_malloc; + GPUMfreeFunc user_device_free; + +#if defined(HYPRE_USING_UMPIRE) + char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; + char umpire_um_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; + char umpire_host_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; + char umpire_pinned_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; + size_t umpire_device_pool_size; + size_t umpire_um_pool_size; + size_t umpire_host_pool_size; + size_t umpire_pinned_pool_size; + size_t umpire_block_size; + HYPRE_Int own_umpire_device_pool; + HYPRE_Int own_umpire_um_pool; + HYPRE_Int own_umpire_host_pool; + HYPRE_Int own_umpire_pinned_pool; + umpire_resourcemanager umpire_rm; +#endif + +#if 
defined(HYPRE_USING_MAGMA) + magma_queue_t magma_queue; +#endif +} hypre_Handle; + +/* accessor macros to hypre_Handle */ +#define hypre_HandleLogLevel(hypre_handle) ((hypre_handle) -> log_level) +#define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) +#define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) + +#define hypre_HandleStructCommRecvBuffer(hypre_handle) ((hypre_handle) -> struct_comm_recv_buffer) +#define hypre_HandleStructCommSendBuffer(hypre_handle) ((hypre_handle) -> struct_comm_send_buffer) +#define hypre_HandleStructCommRecvBufferSize(hypre_handle) ((hypre_handle) -> struct_comm_recv_buffer_size) +#define hypre_HandleStructCommSendBufferSize(hypre_handle) ((hypre_handle) -> struct_comm_send_buffer_size) + +#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) +#define hypre_HandleDeviceGSMethod(hypre_handle) ((hypre_handle) -> device_gs_method) +#define hypre_HandleUseGpuAwareMPI(hypre_handle) ((hypre_handle) -> use_gpu_aware_mpi) + +#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleVendorSolverHandle(hypre_handle) hypre_DeviceDataVendorSolverHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxShmemPerBlock(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlock(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxShmemPerBlockInited(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlockInited(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmUseVendor(hypre_handle) hypre_DeviceDataSpgemmUseVendor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpMVUseVendor(hypre_handle) hypre_DeviceDataSpMVUseVendor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpTransUseVendor(hypre_handle) hypre_DeviceDataSpTransUseVendor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmAlgorithm(hypre_handle) 
hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmBinned(hypre_handle) hypre_DeviceDataSpgemmBinned(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmNumBin(hypre_handle) hypre_DeviceDataSpgemmNumBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmHighestBin(hypre_handle) hypre_DeviceDataSpgemmHighestBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmBlockNumDim(hypre_handle) hypre_DeviceDataSpgemmBlockNumDim(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceAllocator(hypre_handle) hypre_DeviceDataDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) + +#define hypre_HandleUserDeviceMalloc(hypre_handle) ((hypre_handle) -> user_device_malloc) +#define hypre_HandleUserDeviceMfree(hypre_handle) ((hypre_handle) -> user_device_free) + +#define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) +#define hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) +#define hypre_HandleUmpireUMPoolSize(hypre_handle) ((hypre_handle) -> umpire_um_pool_size) +#define hypre_HandleUmpireHostPoolSize(hypre_handle) ((hypre_handle) -> umpire_host_pool_size) +#define hypre_HandleUmpirePinnedPoolSize(hypre_handle) ((hypre_handle) -> umpire_pinned_pool_size) +#define hypre_HandleUmpireBlockSize(hypre_handle) ((hypre_handle) -> umpire_block_size) +#define hypre_HandleUmpireDevicePoolName(hypre_handle) ((hypre_handle) -> umpire_device_pool_name) +#define hypre_HandleUmpireUMPoolName(hypre_handle) ((hypre_handle) -> umpire_um_pool_name) +#define hypre_HandleUmpireHostPoolName(hypre_handle) ((hypre_handle) -> umpire_host_pool_name) +#define hypre_HandleUmpirePinnedPoolName(hypre_handle) ((hypre_handle) -> umpire_pinned_pool_name) +#define hypre_HandleOwnUmpireDevicePool(hypre_handle) ((hypre_handle) -> own_umpire_device_pool) +#define hypre_HandleOwnUmpireUMPool(hypre_handle) ((hypre_handle) -> own_umpire_um_pool) +#define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) +#define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) + +#define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) + +#endif /****************************************************************************** * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
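For illustration only (not part of the patch): the relocated hypre_Handle block above adds a log_level field next to the existing GPU options, and host code normally drives these fields through the setter routines declared elsewhere in this patch rather than touching the struct directly. A minimal sketch, assuming hypre has already been initialized and that the internal header is available to the caller:

   #include "HYPRE_utilities.h"    /* public API: HYPRE_SetGpuAwareMPI */
   #include "_hypre_utilities.h"   /* internal API: hypre_SetLogLevel  */

   void configure_hypre_runtime(void)
   {
      /* stored in hypre_Handle::use_gpu_aware_mpi (hypre_HandleUseGpuAwareMPI) */
      HYPRE_SetGpuAwareMPI(1);

      /* stored in hypre_Handle::log_level (hypre_HandleLogLevel); bit 0 enables
         per-rank memory reports, bit 1 enables min/max/avg/std summaries */
      hypre_SetLogLevel(3);
   }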
@@ -1000,18 +1148,6 @@ HYPRE_Int hypre_MPI_Info_free( hypre_MPI_Info *info ); //#pragma omp requires unified_shared_memory #endif -#if defined(HYPRE_USING_UMPIRE) -#include "umpire/config.hpp" -#if UMPIRE_VERSION_MAJOR >= 2022 -#include "umpire/interface/c_fortran/umpire.h" -#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_quick_pool -#else -#include "umpire/interface/umpire.h" -#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_pool -#endif -#define HYPRE_UMPIRE_POOL_NAME_MAX_LEN 1024 -#endif - /* stringification: * _Pragma(string-literal), so we need to cast argument to a string * The three dots as last argument of the macro tells compiler that this is a variadic macro. @@ -1129,7 +1265,17 @@ HYPRE_Int hypre_umpire_um_pooled_allocate(void **ptr, size_t nbytes); HYPRE_Int hypre_umpire_um_pooled_free(void *ptr); HYPRE_Int hypre_umpire_pinned_pooled_allocate(void **ptr, size_t nbytes); HYPRE_Int hypre_umpire_pinned_pooled_free(void *ptr); - +HYPRE_Int hypre_UmpireInit(hypre_Handle *hypre_handle_); +HYPRE_Int hypre_UmpireFinalize(hypre_Handle *hypre_handle_); +HYPRE_Int hypre_UmpireGetCurrentMemoryUsage(MPI_Comm comm, HYPRE_Real *current); +HYPRE_Int hypre_UmpireMemoryGetUsage(HYPRE_Real *memory); +HYPRE_Int hypre_HostMemoryGetUsage(HYPRE_Real *mem); +HYPRE_Int hypre_MemoryPrintUsage(MPI_Comm comm, HYPRE_Int level, + const char *function, HYPRE_Int line); +#define HYPRE_PRINT_MEMORY_USAGE(comm) hypre_MemoryPrintUsage(comm,\ + hypre_HandleLogLevel(hypre_handle()),\ + __func__,\ + __LINE__) /* memory_dmalloc.c */ HYPRE_Int hypre_InitMemoryDebugDML( HYPRE_Int id ); HYPRE_Int hypre_FinalizeMemoryDebugDML( void ); @@ -1138,10 +1284,6 @@ char *hypre_CAllocDML( HYPRE_Int count, HYPRE_Int elt_size, char *file, HYPRE_In char *hypre_ReAllocDML( char *ptr, HYPRE_Int size, char *file, HYPRE_Int line ); void hypre_FreeDML( char *ptr, char *file, HYPRE_Int line ); -/* GPU malloc prototype */ -typedef void (*GPUMallocFunc)(void **, size_t); -typedef void (*GPUMfreeFunc)(void *); - #ifdef __cplusplus } #endif @@ -1767,138 +1909,6 @@ extern "C++" * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ -/****************************************************************************** - * - * General structures and values - * - *****************************************************************************/ - -#ifndef HYPRE_HANDLE_H -#define HYPRE_HANDLE_H - -struct hypre_DeviceData; -typedef struct hypre_DeviceData hypre_DeviceData; - -typedef struct -{ - HYPRE_Int hypre_error; - HYPRE_MemoryLocation memory_location; - HYPRE_ExecutionPolicy default_exec_policy; - - /* the device buffers needed to do MPI communication for struct comm */ - HYPRE_Complex *struct_comm_recv_buffer; - HYPRE_Complex *struct_comm_send_buffer; - HYPRE_Int struct_comm_recv_buffer_size; - HYPRE_Int struct_comm_send_buffer_size; - - /* GPU MPI */ -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_DEVICE_OPENMP) - HYPRE_Int use_gpu_aware_mpi; -#endif - -#if defined(HYPRE_USING_GPU) - hypre_DeviceData *device_data; - HYPRE_Int device_gs_method; /* device G-S options */ -#endif - - /* user malloc/free function pointers */ - GPUMallocFunc user_device_malloc; - GPUMfreeFunc user_device_free; - -#if defined(HYPRE_USING_UMPIRE) - char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; - char umpire_um_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; - char 
umpire_host_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; - char umpire_pinned_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; - size_t umpire_device_pool_size; - size_t umpire_um_pool_size; - size_t umpire_host_pool_size; - size_t umpire_pinned_pool_size; - size_t umpire_block_size; - HYPRE_Int own_umpire_device_pool; - HYPRE_Int own_umpire_um_pool; - HYPRE_Int own_umpire_host_pool; - HYPRE_Int own_umpire_pinned_pool; - umpire_resourcemanager umpire_rm; -#endif - -#if defined(HYPRE_USING_MAGMA) - magma_queue_t magma_queue; -#endif -} hypre_Handle; - -/* accessor macros to hypre_Handle */ -#define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) -#define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) - -#define hypre_HandleStructCommRecvBuffer(hypre_handle) ((hypre_handle) -> struct_comm_recv_buffer) -#define hypre_HandleStructCommSendBuffer(hypre_handle) ((hypre_handle) -> struct_comm_send_buffer) -#define hypre_HandleStructCommRecvBufferSize(hypre_handle) ((hypre_handle) -> struct_comm_recv_buffer_size) -#define hypre_HandleStructCommSendBufferSize(hypre_handle) ((hypre_handle) -> struct_comm_send_buffer_size) - -#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) -#define hypre_HandleDeviceGSMethod(hypre_handle) ((hypre_handle) -> device_gs_method) -#define hypre_HandleUseGpuAwareMPI(hypre_handle) ((hypre_handle) -> use_gpu_aware_mpi) - -#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleVendorSolverHandle(hypre_handle) hypre_DeviceDataVendorSolverHandle(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleDeviceMaxShmemPerBlock(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlock(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleDeviceMaxShmemPerBlockInited(hypre_handle) hypre_DeviceDataDeviceMaxShmemPerBlockInited(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmUseVendor(hypre_handle) 
hypre_DeviceDataSpgemmUseVendor(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpMVUseVendor(hypre_handle) hypre_DeviceDataSpMVUseVendor(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpTransUseVendor(hypre_handle) hypre_DeviceDataSpTransUseVendor(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmAlgorithm(hypre_handle) hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmBinned(hypre_handle) hypre_DeviceDataSpgemmBinned(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmNumBin(hypre_handle) hypre_DeviceDataSpgemmNumBin(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmHighestBin(hypre_handle) hypre_DeviceDataSpgemmHighestBin(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmBlockNumDim(hypre_handle) hypre_DeviceDataSpgemmBlockNumDim(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleDeviceAllocator(hypre_handle) hypre_DeviceDataDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) - -#define hypre_HandleUserDeviceMalloc(hypre_handle) ((hypre_handle) -> user_device_malloc) -#define hypre_HandleUserDeviceMfree(hypre_handle) ((hypre_handle) -> user_device_free) - -#define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) -#define hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) -#define hypre_HandleUmpireUMPoolSize(hypre_handle) ((hypre_handle) -> umpire_um_pool_size) -#define hypre_HandleUmpireHostPoolSize(hypre_handle) ((hypre_handle) -> umpire_host_pool_size) -#define hypre_HandleUmpirePinnedPoolSize(hypre_handle) ((hypre_handle) -> umpire_pinned_pool_size) -#define hypre_HandleUmpireBlockSize(hypre_handle) ((hypre_handle) -> umpire_block_size) -#define hypre_HandleUmpireDevicePoolName(hypre_handle) ((hypre_handle) -> umpire_device_pool_name) -#define hypre_HandleUmpireUMPoolName(hypre_handle) ((hypre_handle) -> umpire_um_pool_name) -#define hypre_HandleUmpireHostPoolName(hypre_handle) ((hypre_handle) -> umpire_host_pool_name) -#define hypre_HandleUmpirePinnedPoolName(hypre_handle) ((hypre_handle) -> umpire_pinned_pool_name) -#define hypre_HandleOwnUmpireDevicePool(hypre_handle) ((hypre_handle) -> own_umpire_device_pool) -#define hypre_HandleOwnUmpireUMPool(hypre_handle) ((hypre_handle) -> own_umpire_um_pool) -#define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) -#define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) - -#define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) - -#endif -/****************************************************************************** - * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other - * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
- * - * SPDX-License-Identifier: (Apache-2.0 OR MIT) - ******************************************************************************/ - #ifndef HYPRE_GSELIM_H #define HYPRE_GSELIM_H @@ -2082,8 +2092,6 @@ HYPRE_Int hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_); HYPRE_Int hypre_GetDevice(hypre_int *device_id); HYPRE_Int hypre_GetDeviceCount(hypre_int *device_count); HYPRE_Int hypre_GetDeviceLastError(void); -HYPRE_Int hypre_UmpireInit(hypre_Handle *hypre_handle_); -HYPRE_Int hypre_UmpireFinalize(hypre_Handle *hypre_handle_); HYPRE_Int hypre_GetDeviceMaxShmemSize(hypre_int device_id, hypre_int *max_size_ptr, hypre_int *max_size_optin_ptr); @@ -2388,6 +2396,9 @@ HYPRE_Int hypre_bind_device_id(HYPRE_Int device_id_in, HYPRE_Int myid, HYPRE_Int nproc, MPI_Comm comm); HYPRE_Int hypre_bind_device(HYPRE_Int myid, HYPRE_Int nproc, MPI_Comm comm); +/* stl_ops.c */ +HYPRE_Int hypre_IntSequence(HYPRE_MemoryLocation memory_location, HYPRE_Int size, HYPRE_Int *data); + /* nvtx.c */ void hypre_GpuProfilingPushRangeColor(const char *name, HYPRE_Int cid); void hypre_GpuProfilingPushRange(const char *name); @@ -2401,6 +2412,7 @@ HYPRE_Int hypre_CheckDirExists(const char *path); HYPRE_Int hypre_CreateDir(const char *path); HYPRE_Int hypre_CreateNextDirOfSequence(const char *basepath, const char *prefix, char **fullpath_ptr); +char* hypre_ConvertIndicesToString(HYPRE_Int size, HYPRE_Int *indices); HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); HYPRE_Int hypre_RestoreSyncCudaCompute(void); @@ -2408,6 +2420,7 @@ HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); HYPRE_Int hypre_ForceSyncComputeStream(hypre_Handle *hypre_handle); /* handle.c */ +HYPRE_Int hypre_SetLogLevel( HYPRE_Int log_level ); HYPRE_Int hypre_SetSpTransUseVendor( HYPRE_Int use_vendor ); HYPRE_Int hypre_SetSpMVUseVendor( HYPRE_Int use_vendor ); HYPRE_Int hypre_SetSpGemmUseVendor( HYPRE_Int use_vendor ); diff --git a/3rd_party/hypre/src/utilities/device_utils.c b/3rd_party/hypre/src/utilities/device_utils.c index 499fea868..34d094de8 100644 --- a/3rd_party/hypre/src/utilities/device_utils.c +++ b/3rd_party/hypre/src/utilities/device_utils.c @@ -23,7 +23,9 @@ hypre_DeviceData* hypre_DeviceDataCreate() { - hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); + /* Note: this allocation is done directly with calloc in order to + avoid a segmentation fault when building with HYPRE_USING_UMPIRE_HOST */ + hypre_DeviceData *data = (hypre_DeviceData*) calloc(1, sizeof(hypre_DeviceData)); #if defined(HYPRE_USING_SYCL) hypre_DeviceDataDevice(data) = nullptr; @@ -165,7 +167,8 @@ hypre_DeviceDataDestroy(hypre_DeviceData *data) data->device = nullptr; #endif - hypre_TFree(data, HYPRE_MEMORY_HOST); + /* Note: Directly using free since this variable was allocated with calloc */ + free((void*) data); } /*-------------------------------------------------------------------- @@ -3025,4 +3028,3 @@ hypre_bind_device( HYPRE_Int myid, { return hypre_bind_device_id(-1, myid, nproc, comm); } - diff --git a/3rd_party/hypre/src/utilities/general.c b/3rd_party/hypre/src/utilities/general.c index 16491b04c..836b20427 100644 --- a/3rd_party/hypre/src/utilities/general.c +++ b/3rd_party/hypre/src/utilities/general.c @@ -30,8 +30,11 @@ hypre_handle(void) hypre_Handle* hypre_HandleCreate(void) { - hypre_Handle *hypre_handle_ = hypre_CTAlloc(hypre_Handle, 1, HYPRE_MEMORY_HOST); + /* Note: this allocation is done directly with calloc in order to + avoid a segmentation fault when building with 
HYPRE_USING_UMPIRE_HOST */ + hypre_Handle *hypre_handle_ = (hypre_Handle*) calloc(1, sizeof(hypre_Handle)); + hypre_HandleLogLevel(hypre_handle_) = 0; hypre_HandleMemoryLocation(hypre_handle_) = HYPRE_MEMORY_DEVICE; #if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -70,7 +73,8 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_HandleDeviceData(hypre_handle_) = NULL; #endif - hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); + /* Note: Directly using free since this variable was allocated with calloc */ + free((void*) hypre_handle_); return hypre_error_flag; } @@ -490,177 +494,19 @@ HYPRE_PrintDeviceInfo(void) return hypre_error_flag; } -/****************************************************************************** - * - * hypre Umpire - * - *****************************************************************************/ - -#if defined(HYPRE_USING_UMPIRE) -HYPRE_Int -hypre_UmpireInit(hypre_Handle *hypre_handle_) -{ - umpire_resourcemanager_get_instance(&hypre_HandleUmpireResourceMan(hypre_handle_)); - - hypre_HandleUmpireDevicePoolSize(hypre_handle_) = 4LL * 1024 * 1024 * 1024; - hypre_HandleUmpireUMPoolSize(hypre_handle_) = 4LL * 1024 * 1024 * 1024; - hypre_HandleUmpireHostPoolSize(hypre_handle_) = 4LL * 1024 * 1024 * 1024; - hypre_HandleUmpirePinnedPoolSize(hypre_handle_) = 4LL * 1024 * 1024 * 1024; - - hypre_HandleUmpireBlockSize(hypre_handle_) = 512; - - strcpy(hypre_HandleUmpireDevicePoolName(hypre_handle_), "HYPRE_DEVICE_POOL"); - strcpy(hypre_HandleUmpireUMPoolName(hypre_handle_), "HYPRE_UM_POOL"); - strcpy(hypre_HandleUmpireHostPoolName(hypre_handle_), "HYPRE_HOST_POOL"); - strcpy(hypre_HandleUmpirePinnedPoolName(hypre_handle_), "HYPRE_PINNED_POOL"); - - hypre_HandleOwnUmpireDevicePool(hypre_handle_) = 0; - hypre_HandleOwnUmpireUMPool(hypre_handle_) = 0; - hypre_HandleOwnUmpireHostPool(hypre_handle_) = 0; - hypre_HandleOwnUmpirePinnedPool(hypre_handle_) = 0; - - return hypre_error_flag; -} - -HYPRE_Int -hypre_UmpireFinalize(hypre_Handle *hypre_handle_) -{ - umpire_resourcemanager *rm_ptr = &hypre_HandleUmpireResourceMan(hypre_handle_); - umpire_allocator allocator; - -#if defined(HYPRE_USING_UMPIRE_HOST) - if (hypre_HandleOwnUmpireHostPool(hypre_handle_)) - { - const char *pool_name = hypre_HandleUmpireHostPoolName(hypre_handle_); - umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); - umpire_allocator_release(&allocator); - } -#endif - -#if defined(HYPRE_USING_UMPIRE_DEVICE) - if (hypre_HandleOwnUmpireDevicePool(hypre_handle_)) - { - const char *pool_name = hypre_HandleUmpireDevicePoolName(hypre_handle_); - umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); - umpire_allocator_release(&allocator); - } -#endif - -#if defined(HYPRE_USING_UMPIRE_UM) - if (hypre_HandleOwnUmpireUMPool(hypre_handle_)) - { - const char *pool_name = hypre_HandleUmpireUMPoolName(hypre_handle_); - umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); - umpire_allocator_release(&allocator); - } -#endif - -#if defined(HYPRE_USING_UMPIRE_PINNED) - if (hypre_HandleOwnUmpirePinnedPool(hypre_handle_)) - { - const char *pool_name = hypre_HandleUmpirePinnedPoolName(hypre_handle_); - umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); - umpire_allocator_release(&allocator); - } -#endif - - return hypre_error_flag; -} - -HYPRE_Int -HYPRE_SetUmpireDevicePoolSize(size_t nbytes) -{ - hypre_HandleUmpireDevicePoolSize(hypre_handle()) = nbytes; - - return hypre_error_flag; -} - -HYPRE_Int 
-HYPRE_SetUmpireUMPoolSize(size_t nbytes) -{ - hypre_HandleUmpireUMPoolSize(hypre_handle()) = nbytes; - - return hypre_error_flag; -} - -HYPRE_Int -HYPRE_SetUmpireHostPoolSize(size_t nbytes) -{ - hypre_HandleUmpireHostPoolSize(hypre_handle()) = nbytes; - - return hypre_error_flag; -} +/*-------------------------------------------------------------------------- + * HYPRE_MemoryPrintUsage + *--------------------------------------------------------------------------*/ HYPRE_Int -HYPRE_SetUmpirePinnedPoolSize(size_t nbytes) +HYPRE_MemoryPrintUsage(MPI_Comm comm, + HYPRE_Int level, + const char *function, + HYPRE_Int line) { - hypre_HandleUmpirePinnedPoolSize(hypre_handle()) = nbytes; - - return hypre_error_flag; -} - -HYPRE_Int -HYPRE_SetUmpireDevicePoolName(const char *pool_name) -{ - if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) - { - hypre_error_in_arg(1); - - return hypre_error_flag; - } - - strcpy(hypre_HandleUmpireDevicePoolName(hypre_handle()), pool_name); - - return hypre_error_flag; + return hypre_MemoryPrintUsage(comm, level, function, line); } -HYPRE_Int -HYPRE_SetUmpireUMPoolName(const char *pool_name) -{ - if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) - { - hypre_error_in_arg(1); - - return hypre_error_flag; - } - - strcpy(hypre_HandleUmpireUMPoolName(hypre_handle()), pool_name); - - return hypre_error_flag; -} - -HYPRE_Int -HYPRE_SetUmpireHostPoolName(const char *pool_name) -{ - if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) - { - hypre_error_in_arg(1); - - return hypre_error_flag; - } - - strcpy(hypre_HandleUmpireHostPoolName(hypre_handle()), pool_name); - - return hypre_error_flag; -} - -HYPRE_Int -HYPRE_SetUmpirePinnedPoolName(const char *pool_name) -{ - if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) - { - hypre_error_in_arg(1); - - return hypre_error_flag; - } - - strcpy(hypre_HandleUmpirePinnedPoolName(hypre_handle()), pool_name); - - return hypre_error_flag; -} - -#endif /* #if defined(HYPRE_USING_UMPIRE) */ - /****************************************************************************** * * HYPRE memory location diff --git a/3rd_party/hypre/src/utilities/handle.c b/3rd_party/hypre/src/utilities/handle.c index faeb9f48c..8abaf30bd 100644 --- a/3rd_party/hypre/src/utilities/handle.c +++ b/3rd_party/hypre/src/utilities/handle.c @@ -14,7 +14,22 @@ #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" -/* GPU SpTrans */ +/*-------------------------------------------------------------------------- + * hypre_SetLogLevel + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_SetLogLevel(HYPRE_Int log_level) +{ + hypre_HandleLogLevel(hypre_handle()) = log_level; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_SetSpTransUseVendor + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpTransUseVendor( HYPRE_Int use_vendor ) { @@ -27,7 +42,10 @@ hypre_SetSpTransUseVendor( HYPRE_Int use_vendor ) return hypre_error_flag; } -/* GPU SpMV */ +/*-------------------------------------------------------------------------- + * hypre_SetSpMVUseVendor + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpMVUseVendor( HYPRE_Int use_vendor ) { @@ -40,7 +58,10 @@ hypre_SetSpMVUseVendor( HYPRE_Int use_vendor ) return hypre_error_flag; } -/* GPU SpGemm */ +/*-------------------------------------------------------------------------- + * 
hypre_SetSpGemmUseVendor + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmUseVendor( HYPRE_Int use_vendor ) { @@ -53,6 +74,10 @@ hypre_SetSpGemmUseVendor( HYPRE_Int use_vendor ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetSpGemmAlgorithm + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmAlgorithm( HYPRE_Int value ) { @@ -72,6 +97,10 @@ hypre_SetSpGemmAlgorithm( HYPRE_Int value ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetSpGemmBinned + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmBinned( HYPRE_Int value ) { @@ -84,6 +113,10 @@ hypre_SetSpGemmBinned( HYPRE_Int value ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetSpGemmRownnzEstimateMethod + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmRownnzEstimateMethod( HYPRE_Int value ) { @@ -103,6 +136,10 @@ hypre_SetSpGemmRownnzEstimateMethod( HYPRE_Int value ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetSpGemmRownnzEstimateNSamples + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmRownnzEstimateNSamples( HYPRE_Int value ) { @@ -115,6 +152,10 @@ hypre_SetSpGemmRownnzEstimateNSamples( HYPRE_Int value ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetSpGemmRownnzEstimateMultFactor + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetSpGemmRownnzEstimateMultFactor( HYPRE_Real value ) { @@ -134,7 +175,10 @@ hypre_SetSpGemmRownnzEstimateMultFactor( HYPRE_Real value ) return hypre_error_flag; } -/* GPU Rand */ +/*-------------------------------------------------------------------------- + * hypre_SetUseGpuRand + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetUseGpuRand( HYPRE_Int use_gpurand ) { @@ -147,6 +191,10 @@ hypre_SetUseGpuRand( HYPRE_Int use_gpurand ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetGaussSeidelMethod + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetGaussSeidelMethod( HYPRE_Int gs_method ) { @@ -159,6 +207,10 @@ hypre_SetGaussSeidelMethod( HYPRE_Int gs_method ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetUserDeviceMalloc + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetUserDeviceMalloc(GPUMallocFunc func) { @@ -171,6 +223,10 @@ hypre_SetUserDeviceMalloc(GPUMallocFunc func) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetUserDeviceMfree + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetUserDeviceMfree(GPUMfreeFunc func) { @@ -183,6 +239,10 @@ hypre_SetUserDeviceMfree(GPUMfreeFunc func) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_SetGpuAwareMPI + 
*--------------------------------------------------------------------------*/ + HYPRE_Int hypre_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi ) { @@ -194,6 +254,10 @@ hypre_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi ) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_GetGpuAwareMPI + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_GetGpuAwareMPI(void) { diff --git a/3rd_party/hypre/src/utilities/handle.h b/3rd_party/hypre/src/utilities/handle.h index 88b16782d..7ca8c1244 100644 --- a/3rd_party/hypre/src/utilities/handle.h +++ b/3rd_party/hypre/src/utilities/handle.h @@ -14,11 +14,26 @@ #ifndef HYPRE_HANDLE_H #define HYPRE_HANDLE_H +#if defined(HYPRE_USING_UMPIRE) +#include "umpire/config.hpp" +#if UMPIRE_VERSION_MAJOR >= 2022 +#include "umpire/interface/c_fortran/umpire.h" +#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_quick_pool +#else +#include "umpire/interface/umpire.h" +#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_pool +#endif /* UMPIRE_VERSION_MAJOR >= 2022 */ +#define HYPRE_UMPIRE_POOL_NAME_MAX_LEN 1024 +#endif /* defined(HYPRE_USING_UMPIRE) */ + struct hypre_DeviceData; typedef struct hypre_DeviceData hypre_DeviceData; +typedef void (*GPUMallocFunc)(void **, size_t); +typedef void (*GPUMfreeFunc)(void *); typedef struct { + HYPRE_Int log_level; HYPRE_Int hypre_error; HYPRE_MemoryLocation memory_location; HYPRE_ExecutionPolicy default_exec_policy; @@ -66,6 +81,7 @@ typedef struct } hypre_Handle; /* accessor macros to hypre_Handle */ +#define hypre_HandleLogLevel(hypre_handle) ((hypre_handle) -> log_level) #define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) diff --git a/3rd_party/hypre/src/utilities/headers b/3rd_party/hypre/src/utilities/headers index f4fef2202..7de3d3a8b 100755 --- a/3rd_party/hypre/src/utilities/headers +++ b/3rd_party/hypre/src/utilities/headers @@ -33,6 +33,7 @@ extern "C" { # Structures and prototypes #=========================================================================== +cat handle.h >> $INTERNAL_HEADER cat state.h >> $INTERNAL_HEADER cat general.h >> $INTERNAL_HEADER cat base.h >> $INTERNAL_HEADER @@ -50,7 +51,6 @@ cat timing.h >> $INTERNAL_HEADER cat amg_linklist.h >> $INTERNAL_HEADER cat exchange_data.h >> $INTERNAL_HEADER cat caliper_instrumentation.h >> $INTERNAL_HEADER -cat handle.h >> $INTERNAL_HEADER cat gselim.h >> $INTERNAL_HEADER cat int_array.h >> $INTERNAL_HEADER cat protos.h >> $INTERNAL_HEADER diff --git a/3rd_party/hypre/src/utilities/int_array_device.c b/3rd_party/hypre/src/utilities/int_array_device.c index 342a3777a..76537f40e 100644 --- a/3rd_party/hypre/src/utilities/int_array_device.c +++ b/3rd_party/hypre/src/utilities/int_array_device.c @@ -79,6 +79,7 @@ hypre_IntArrayInverseMappingDevice( hypre_IntArray *v, dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_IntArrayInverseMapping, gDim, bDim, size, v_data, w_data ); + hypre_SyncComputeStream(hypre_handle()); #elif defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_Int i; diff --git a/3rd_party/hypre/src/utilities/matrix_stats.c b/3rd_party/hypre/src/utilities/matrix_stats.c index 2d36acd6e..caf7fb2db 100644 --- a/3rd_party/hypre/src/utilities/matrix_stats.c +++ b/3rd_party/hypre/src/utilities/matrix_stats.c 
@@ -199,7 +199,7 @@ hypre_MatrixStatsArrayPrint(HYPRE_Int num_hierarchies,
 ndigits[4] = 4;
 ndigits[5] = 4;
 ndigits[6] = 4;
- ndigits[7] = 4;
+ ndigits[7] = 5;
 ndigits[8] = 8;
 ndigits[9] = 8;
 ndigits[10] = 8;
@@ -246,8 +246,7 @@ hypre_MatrixStatsArrayPrint(HYPRE_Int num_hierarchies,
 offsets[1] = 0 + ndigits[2];
 offsets[2] = 2 + ndigits[3];
 offsets[3] = 7 + (ndigits[4] + ndigits[5] + ndigits[6] + ndigits[7]) / 2;
- offsets[4] = (ndigits[4] + ndigits[5] + ndigits[6] + ndigits[7]) / 2 +
- (ndigits[4] + ndigits[5] + ndigits[6] + ndigits[7]) % 2 - 3;
+ offsets[4] = (1 + ndigits[4] + ndigits[5] + ndigits[6] + ndigits[7]) / 2 - 3;
 offsets[5] = 4 + (ndigits[8] + ndigits[9] + ndigits[10] + ndigits[11]) / 2;
 if (!square)
 {
diff --git a/3rd_party/hypre/src/utilities/memory.c b/3rd_party/hypre/src/utilities/memory.c
index 715b8981b..ff313732c 100644
--- a/3rd_party/hypre/src/utilities/memory.c
+++ b/3rd_party/hypre/src/utilities/memory.c
@@ -13,6 +13,14 @@
 #include "_hypre_utilities.h"
 #include "_hypre_utilities.hpp"
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined(__linux__)
+#include <sys/sysinfo.h>
+#endif
 #if defined(HYPRE_USE_UMALLOC)
 #undef HYPRE_USE_UMALLOC
@@ -597,6 +605,11 @@ static inline void
 hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst, hypre_MemoryLocation loc_src)
 {
+ if (size == 0)
+ {
+ return;
+ }
+
 #if defined(HYPRE_USING_SYCL)
 sycl::queue* q = hypre_HandleComputeStream(hypre_handle());
 #endif
@@ -650,7 +663,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds
 #endif
 #if defined(HYPRE_USING_HIP)
- HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToDevice) );
+ // hipMemcpy(DtoD) causes a host-side synchronization, unlike cudaMemcpy(DtoD),
+ // use hipMemcpyAsync to get cuda's more performant behavior.
For more info see: + // https://github.com/mfem/mfem/pull/2780 + HYPRE_HIP_CALL( hipMemcpyAsync(dst, src, size, hipMemcpyDeviceToDevice) ); #endif #if defined(HYPRE_USING_SYCL) @@ -793,6 +812,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds /*--------------------------------------------------------------------------* * ExecPolicy *--------------------------------------------------------------------------*/ + static inline HYPRE_ExecutionPolicy hypre_GetExecPolicy1_core(hypre_MemoryLocation location) { @@ -1171,6 +1191,12 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { *memory_location = hypre_MEMORY_HOST_PINNED; } +#if (HIP_VERSION_MAJOR >= 6) + else if (attr.type == hipMemoryTypeUnregistered) + { + *memory_location = hypre_MEMORY_HOST; + } +#endif #endif // defined(HYPRE_USING_HIP) #if defined(HYPRE_USING_SYCL) @@ -1211,8 +1237,362 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) return ierr; } -/*--------------------------------------------------------------------------* - * Memory Pool +/*-------------------------------------------------------------------------- + * hypre_HostMemoryGetUsage + * + * Retrieves various memory usage statistics involving CPU RAM. The function + * fills an array with the memory data, converted to gigabytes (GB). + * Detailed info is given below: + * + * - mem[0]: VmSize + * The current virtual memory size used by the process. This includes + * all memory the process can access, including memory that is swapped + * out and memory allocated but not used. + * + * - mem[1]: VmPeak + * The peak virtual memory size used by the process during its lifetime. + * + * - mem[2]: VmRSS + * The resident set size, which is the portion of the process' memory + * that is held in CPU RAM. This includes code, data, and stack space + * but excludes swapped-out memory. + * + * - mem[3]: VmHWM + * The peak resident set size, which is the maximum amount of memory + * that the process has had in CPU RAM at any point in time, aka. + * high water mark. + * + * - mem[4]: free + * The amount of free CPU RAM available in the system. + * + * - mem[5]: total + * The total amount of CPU RAM installed in the system. + * + * This function doesn't return correct memory info for Windows environments. 
+ *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_HostMemoryGetUsage(HYPRE_Real *mem) +{ + size_t vm_size = 0; + size_t vm_rss = 0; + size_t vm_hwm = 0; + size_t vm_peak = 0; + size_t tot_mem = 0; + size_t free_mem = 0; + HYPRE_Real b_to_gb = (HYPRE_Real) (1 << 30); + + /* Sanity check */ + if (!mem) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Mem is a NULL pointer!"); + return hypre_error_flag; + } + + /* Get system memory info */ +#if defined(__APPLE__) + struct task_basic_info t_info; + mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; + mach_msg_type_number_t count = HOST_VM_INFO_COUNT; + vm_statistics_data_t vm_stat; + hypre_int mib[2] = {CTL_HW, HW_MEMSIZE}; + size_t length = sizeof(size_t); + + if (sysctl(mib, 2, &tot_mem, &length, NULL, 0)) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Problem running sysctl!"); + return hypre_error_flag; + } + + if (host_statistics(mach_host_self(), HOST_VM_INFO, (host_info_t)&vm_stat, &count) != + KERN_SUCCESS) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Problem running host_statistics!"); + return hypre_error_flag; + } + + free_mem = (size_t) vm_stat.free_count * (size_t) vm_page_size; + + /* Get the task info */ + if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&t_info, + &t_info_count) != KERN_SUCCESS) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Problem running task_info!"); + return hypre_error_flag; + } + + /* vm_peak is not directly available, so we set it to vm_size */ + vm_size = vm_peak = (size_t) t_info.virtual_size; + + /* vm_hwm is not directly available, so we set it to vm_rss */ + vm_rss = vm_hwm = (size_t) t_info.resident_size; + +#elif defined(__linux__) + struct sysinfo info; + char line[512]; + FILE *file; + + if (sysinfo(&info) != 0) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Problem running sysinfo!"); + return hypre_error_flag; + } + tot_mem = info.totalram * info.mem_unit; + free_mem = info.freeram * info.mem_unit; + + /* Function to get process memory info */ + file = fopen("/proc/self/status", "r"); + if (file == NULL) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Cannot open /proc/self/status!"); + return hypre_error_flag; + } + + while (fgets(line, sizeof(line), file)) + { + (void) sscanf(line, "VmPeak: %zu kB", &vm_peak); + (void) sscanf(line, "VmSize: %zu kB", &vm_size); + (void) sscanf(line, "VmRSS: %zu kB", &vm_rss); + (void) sscanf(line, "VmHWM: %zu kB", &vm_hwm); + } + fclose(file); + + /* Convert KB to bytes */ + vm_peak *= 1024; + vm_size *= 1024; + vm_rss *= 1024; + vm_hwm *= 1024; +#endif + + /* Convert data from bytes to GB (HYPRE_Real) */ + mem[0] = vm_size / b_to_gb; + mem[1] = vm_peak / b_to_gb; + mem[2] = vm_rss / b_to_gb; + mem[3] = vm_hwm / b_to_gb; + mem[4] = free_mem / b_to_gb; + mem[5] = tot_mem / b_to_gb; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_MemoryPrintUsage + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_MemoryPrintUsage(MPI_Comm comm, + HYPRE_Int log_level, + const char *function, + HYPRE_Int line) +{ +#if defined(HYPRE_USING_UMPIRE) + HYPRE_Int ne = 14; +#else + HYPRE_Int ne = 6; +#endif + HYPRE_Real lmem[14]; + HYPRE_Real min[14]; + HYPRE_Real max[14]; + HYPRE_Real avg[14]; + HYPRE_Real ssq[14]; + HYPRE_Real std[14]; + HYPRE_Real *gmem = NULL; + HYPRE_Int i, j, myid, nprocs, ndigits; + const char *labels[] = {"Min", "Max", "Avg", "Std"}; + HYPRE_Real *data[] = {min, max, avg, 
std};
+
+ /* Return if neither the 1st nor 2nd bits of log_level are set */
+ if (!(log_level & 0x3))
+ {
+ return hypre_error_flag;
+ }
+
+ /* Initialize locals */
+ for (j = 0; j < ne; j++)
+ {
+ lmem[j] = 0.0;
+ min[j] = HYPRE_REAL_MAX;
+ max[j] = 0.0;
+ avg[j] = 0.0;
+ ssq[j] = 0.0;
+ std[j] = 0.0;
+ }
+
+ /* MPI variables */
+ hypre_MPI_Comm_size(comm, &nprocs);
+ hypre_MPI_Comm_rank(comm, &myid);
+ ndigits = hypre_ndigits(nprocs);
+
+ /* Work space for gathering memory info */
+ if (!myid)
+ {
+ gmem = hypre_CTAlloc(HYPRE_Real, ne * nprocs, HYPRE_MEMORY_HOST);
+ }
+
+ /* Get host memory info */
+ hypre_HostMemoryGetUsage(lmem);
+
+ /* Get umpire memory info */
+#if defined(HYPRE_USING_UMPIRE)
+ hypre_UmpireMemoryGetUsage(&lmem[6]);
+#endif
+
+ /* Gather memory info to rank 0 */
+ hypre_MPI_Gather(lmem, ne, hypre_MPI_REAL, gmem, ne, hypre_MPI_REAL, 0, comm);
+
+ /* Rank 0 computes min/max/avg/stddev statistics */
+ if (!myid && (log_level & 0x2))
+ {
+ for (i = 0; i < nprocs; i++)
+ {
+ for (j = 0; j < ne; j++)
+ {
+ if (gmem[ne * i + j] < min[j]) { min[j] = gmem[ne * i + j]; }
+ if (gmem[ne * i + j] > max[j]) { max[j] = gmem[ne * i + j]; }
+ avg[j] += gmem[ne * i + j];
+ }
+ }
+
+ for (j = 0; j < ne; j++)
+ {
+ avg[j] /= (HYPRE_Real) nprocs;
+ }
+
+ for (i = 0; i < nprocs; i++)
+ {
+ for (j = 0; j < ne; j++)
+ {
+ ssq[j] += hypre_pow(gmem[ne * i + j] - avg[j], 2) / (HYPRE_Real) nprocs;
+ }
+ }
+
+ for (j = 0; j < ne; j++)
+ {
+ std[j] = hypre_sqrt(ssq[j]);
+ }
+ }
+
+ /* Rank 0 prints the data */
+ if (!myid)
+ {
+ /* Local memory usage statistics */
+ if (log_level & 0x1)
+ {
+ for (i = 0; i < nprocs; i++)
+ {
+ if (line > 0)
+ {
+ hypre_printf("[%*d]: %s at line %d", ndigits, i, function, line);
+ }
+ else
+ {
+ hypre_printf("[%*d]: %s", ndigits, i, function);
+ }
+ hypre_printf(" | Vm[Size,RSS]/[Peak,HWM]: (%.2f, %.2f / %.2f, %.2f) GB",
+ gmem[ne * i + 0], gmem[ne * i + 2],
+ gmem[ne * i + 1], gmem[ne * i + 3]);
+ hypre_printf(" | Free/Total: (%.2f / %.2f)", gmem[ne * i + 4], gmem[ne * i + 5]);
+#if defined(HYPRE_USING_UMPIRE)
+ if (gmem[ne * i + 7])
+ {
+ hypre_printf(" | UmpHSize/UmpHPeak: (%.2f / %.2f)",
+ gmem[ne * i + 6], gmem[ne * i + 7]);
+ }
+ if (gmem[ne * i + 9])
+ {
+ hypre_printf(" | UmpDSize/UmpDPeak: (%.2f / %.2f)",
+ gmem[ne * i + 8], gmem[ne * i + 9]);
+ }
+ if (gmem[ne * i + 11])
+ {
+ hypre_printf(" | UmpUSize/UmpUPeak: (%.2f / %.2f)",
+ gmem[ne * i + 10], gmem[ne * i + 11]);
+ }
+ if (gmem[ne * i + 13])
+ {
+ hypre_printf(" | UmpPSize/UmpPPeak: (%.2f / %.2f)",
+ gmem[ne * i + 12], gmem[ne * i + 13]);
+ }
+#endif
+ hypre_printf("\n");
+ }
+ }
+
+ /* Global memory usage statistics */
+ if (log_level & 0x2)
+ {
+ hypre_printf("\nMemory usage across ranks - ");
+ if (line > 0)
+ {
+ hypre_printf("%s at line %d\n\n", function, line);
+ }
+ else
+ {
+ hypre_printf("%s\n\n", function);
+ }
+
+ /* Print header */
+ hypre_printf(" | %11s | %11s | %11s | %11s",
+ "VmSize (GB)", "VmPeak (GB)", "VmRSS (GB)", "VmHWM (GB)");
+#if defined(HYPRE_USING_UMPIRE_HOST)
+ hypre_printf(" | %13s | %13s", "UmpHSize (GB)", "UmpHPeak (GB)");
+#endif
+#if defined(HYPRE_USING_UMPIRE_DEVICE)
+ hypre_printf(" | %13s | %13s", "UmpDSize (GB)", "UmpDPeak (GB)");
+#endif
+#if defined(HYPRE_USING_UMPIRE_UM)
+ hypre_printf(" | %13s | %13s", "UmpUSize (GB)", "UmpUPeak (GB)");
+#endif
+#if defined(HYPRE_USING_UMPIRE_PINNED)
+ hypre_printf(" | %13s | %13s", "UmpPSize (GB)", "UmpPPeak (GB)");
+#endif
+ hypre_printf("\n");
+ hypre_printf(" ----+-------------+-------------+-------------+------------");
+#if defined(HYPRE_USING_UMPIRE_HOST) + hypre_printf("-+---------------+--------------"); +#endif +#if defined(HYPRE_USING_UMPIRE_DEVICE) + hypre_printf("-+---------------+--------------"); +#endif +#if defined(HYPRE_USING_UMPIRE_UM) + hypre_printf("-+---------------+--------------"); +#endif +#if defined(HYPRE_USING_UMPIRE_PINNED) + hypre_printf("-+---------------+--------------"); +#endif + hypre_printf("\n"); + + /* Print table */ + for (i = 0; i < 4; i++) + { + hypre_printf(" %-3s", labels[i]); + hypre_printf(" | %11.3f | %11.3f | %11.3f | %11.3f", + data[i][0], data[i][1], data[i][2], data[i][3]); +#if defined(HYPRE_USING_UMPIRE_HOST) + hypre_printf(" | %13.3f | %13.3f", data[i][6], data[i][7]); +#endif +#if defined(HYPRE_USING_UMPIRE_DEVICE) + hypre_printf(" | %13.3f | %13.3f", data[i][8], data[i][9]); +#endif +#if defined(HYPRE_USING_UMPIRE_UM) + hypre_printf(" | %13.3f | %13.3f", data[i][10], data[i][11]); +#endif +#if defined(HYPRE_USING_UMPIRE_PINNED) + hypre_printf(" | %13.3f | %13.3f", data[i][12], data[i][13]); +#endif + hypre_printf("\n"); + } + } + } + hypre_MPI_Barrier(comm); + + hypre_TFree(gmem, HYPRE_MEMORY_HOST); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_SetCubMemPoolSize *--------------------------------------------------------------------------*/ HYPRE_Int @@ -1247,6 +1627,10 @@ hypre_SetCubMemPoolSize(hypre_uint cub_bin_growth, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * HYPRE_SetGPUMemoryPoolSize + *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_SetGPUMemoryPoolSize(HYPRE_Int bin_growth, HYPRE_Int min_bin, @@ -1256,7 +1640,12 @@ HYPRE_SetGPUMemoryPoolSize(HYPRE_Int bin_growth, return hypre_SetCubMemPoolSize(bin_growth, min_bin, max_bin, max_cached_bytes); } -#if defined(HYPRE_USING_DEVICE_POOL) +#if defined(HYPRE_USING_DEVICE_POOL) && defined(HYPRE_USING_CUDA) + +/*-------------------------------------------------------------------------- + * hypre_CachingMallocDevice + *--------------------------------------------------------------------------*/ + cudaError_t hypre_CachingMallocDevice(void **ptr, size_t nbytes) { @@ -1275,12 +1664,20 @@ hypre_CachingMallocDevice(void **ptr, size_t nbytes) return hypre_HandleCubDevAllocator(hypre_handle()) -> DeviceAllocate(ptr, nbytes); } +/*-------------------------------------------------------------------------- + * hypre_CachingFreeDevice + *--------------------------------------------------------------------------*/ + cudaError_t hypre_CachingFreeDevice(void *ptr) { return hypre_HandleCubDevAllocator(hypre_handle()) -> DeviceFree(ptr); } +/*-------------------------------------------------------------------------- + * hypre_CachingMallocManaged + *--------------------------------------------------------------------------*/ + cudaError_t hypre_CachingMallocManaged(void **ptr, size_t nbytes) { @@ -1299,12 +1696,20 @@ hypre_CachingMallocManaged(void **ptr, size_t nbytes) return hypre_HandleCubUvmAllocator(hypre_handle()) -> DeviceAllocate(ptr, nbytes); } +/*-------------------------------------------------------------------------- + * hypre_CachingFreeManaged + *--------------------------------------------------------------------------*/ + cudaError_t hypre_CachingFreeManaged(void *ptr) { return hypre_HandleCubUvmAllocator(hypre_handle()) -> DeviceFree(ptr); } +/*-------------------------------------------------------------------------- + * 
hypre_DeviceDataCubCachingAllocatorCreate + *--------------------------------------------------------------------------*/ + hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, @@ -1326,6 +1731,10 @@ hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, return allocator; } +/*-------------------------------------------------------------------------- + * hypre_DeviceDataCubCachingAllocatorDestroy + *--------------------------------------------------------------------------*/ + void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data) { @@ -1333,9 +1742,14 @@ hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data) delete hypre_DeviceDataCubUvmAllocator(data); } -#endif // #if defined(HYPRE_USING_DEVICE_POOL) +#endif // #if defined(HYPRE_USING_DEVICE_POOL) && defined(HYPRE_USING_CUDA) #if defined(HYPRE_USING_UMPIRE_HOST) + +/*-------------------------------------------------------------------------- + * hypre_umpire_host_pooled_allocate + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_host_pooled_allocate(void **ptr, size_t nbytes) { @@ -1365,6 +1779,10 @@ hypre_umpire_host_pooled_allocate(void **ptr, size_t nbytes) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_umpire_host_pooled_free + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_host_pooled_free(void *ptr) { @@ -1382,6 +1800,10 @@ hypre_umpire_host_pooled_free(void *ptr) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_umpire_host_pooled_realloc + *--------------------------------------------------------------------------*/ + void * hypre_umpire_host_pooled_realloc(void *ptr, size_t size) { @@ -1401,6 +1823,11 @@ hypre_umpire_host_pooled_realloc(void *ptr, size_t size) #endif #if defined(HYPRE_USING_UMPIRE_DEVICE) + +/*-------------------------------------------------------------------------- + * hypre_umpire_device_pooled_allocate + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_device_pooled_allocate(void **ptr, size_t nbytes) { @@ -1434,6 +1861,10 @@ hypre_umpire_device_pooled_allocate(void **ptr, size_t nbytes) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_umpire_device_pooled_free + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_device_pooled_free(void *ptr) { @@ -1453,6 +1884,11 @@ hypre_umpire_device_pooled_free(void *ptr) #endif #if defined(HYPRE_USING_UMPIRE_UM) + +/*-------------------------------------------------------------------------- + * hypre_umpire_um_pooled_allocate + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_um_pooled_allocate(void **ptr, size_t nbytes) { @@ -1483,6 +1919,10 @@ hypre_umpire_um_pooled_allocate(void **ptr, size_t nbytes) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_umpire_um_pooled_free + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_um_pooled_free(void *ptr) { @@ -1502,6 +1942,11 @@ hypre_umpire_um_pooled_free(void *ptr) #endif #if defined(HYPRE_USING_UMPIRE_PINNED) + 
+/*-------------------------------------------------------------------------- + * hypre_umpire_pinned_pooled_allocate + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_pinned_pooled_allocate(void **ptr, size_t nbytes) { @@ -1532,6 +1977,10 @@ hypre_umpire_pinned_pooled_allocate(void **ptr, size_t nbytes) return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * hypre_umpire_pinned_pooled_free + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_umpire_pinned_pooled_free(void *ptr) { @@ -1549,3 +1998,281 @@ hypre_umpire_pinned_pooled_free(void *ptr) return hypre_error_flag; } #endif + +/****************************************************************************** + * + * hypre Umpire + * + *****************************************************************************/ + +#if defined(HYPRE_USING_UMPIRE) + +/*-------------------------------------------------------------------------- + * hypre_UmpireInit + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_UmpireInit(hypre_Handle *hypre_handle_) +{ + umpire_resourcemanager_get_instance(&hypre_HandleUmpireResourceMan(hypre_handle_)); + + hypre_HandleUmpireDevicePoolSize(hypre_handle_) = 4LL * (1 << 30); // 4 GB + hypre_HandleUmpireUMPoolSize(hypre_handle_) = 4LL * (1 << 30); // 4 GB + hypre_HandleUmpireHostPoolSize(hypre_handle_) = 4LL * (1 << 30); // 4 GB + hypre_HandleUmpirePinnedPoolSize(hypre_handle_) = 4LL * (1 << 30); // 4 GB + + hypre_HandleUmpireBlockSize(hypre_handle_) = 512; + + strcpy(hypre_HandleUmpireDevicePoolName(hypre_handle_), "HYPRE_DEVICE_POOL"); + strcpy(hypre_HandleUmpireUMPoolName(hypre_handle_), "HYPRE_UM_POOL"); + strcpy(hypre_HandleUmpireHostPoolName(hypre_handle_), "HYPRE_HOST_POOL"); + strcpy(hypre_HandleUmpirePinnedPoolName(hypre_handle_), "HYPRE_PINNED_POOL"); + + hypre_HandleOwnUmpireDevicePool(hypre_handle_) = 0; + hypre_HandleOwnUmpireUMPool(hypre_handle_) = 0; + hypre_HandleOwnUmpireHostPool(hypre_handle_) = 0; + hypre_HandleOwnUmpirePinnedPool(hypre_handle_) = 0; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_UmpireFinalize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_UmpireFinalize(hypre_Handle *hypre_handle_) +{ + umpire_resourcemanager *rm_ptr = &hypre_HandleUmpireResourceMan(hypre_handle_); + umpire_allocator allocator; + +#if defined(HYPRE_USING_UMPIRE_HOST) + if (hypre_HandleOwnUmpireHostPool(hypre_handle_)) + { + const char *pool_name = hypre_HandleUmpireHostPoolName(hypre_handle_); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + umpire_allocator_release(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_DEVICE) + if (hypre_HandleOwnUmpireDevicePool(hypre_handle_)) + { + const char *pool_name = hypre_HandleUmpireDevicePoolName(hypre_handle_); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + umpire_allocator_release(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_UM) + if (hypre_HandleOwnUmpireUMPool(hypre_handle_)) + { + const char *pool_name = hypre_HandleUmpireUMPoolName(hypre_handle_); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + umpire_allocator_release(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_PINNED) + if 
(hypre_HandleOwnUmpirePinnedPool(hypre_handle_)) + { + const char *pool_name = hypre_HandleUmpirePinnedPoolName(hypre_handle_); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + umpire_allocator_release(&allocator); + } +#endif + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * hypre_UmpireMemoryGetUsage + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_UmpireMemoryGetUsage(HYPRE_Real *memory) +{ + hypre_Handle *handle = hypre_handle(); + umpire_resourcemanager *rm_ptr = &hypre_HandleUmpireResourceMan(handle); + umpire_allocator allocator; + + size_t memoryB[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + HYPRE_Int i; + + /* Sanity check */ + if (!memory) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "memory is a NULL pointer!"); + return hypre_error_flag; + } + +#if defined(HYPRE_USING_UMPIRE_HOST) + if (hypre_HandleOwnUmpireHostPool(handle)) + { + const char *pool_name = hypre_HandleUmpireHostPoolName(handle); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + memoryB[0] = umpire_allocator_get_current_size(&allocator); + memoryB[1] = umpire_allocator_get_high_watermark(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_DEVICE) + if (hypre_HandleOwnUmpireDevicePool(handle)) + { + const char *pool_name = hypre_HandleUmpireDevicePoolName(handle); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + memoryB[2] = umpire_allocator_get_current_size(&allocator); + memoryB[3] = umpire_allocator_get_high_watermark(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_UM) + if (hypre_HandleOwnUmpireUMPool(handle)) + { + const char *pool_name = hypre_HandleUmpireUMPoolName(handle); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + memoryB[4] = umpire_allocator_get_current_size(&allocator); + memoryB[5] = umpire_allocator_get_high_watermark(&allocator); + } +#endif + +#if defined(HYPRE_USING_UMPIRE_PINNED) + if (hypre_HandleOwnUmpirePinnedPool(handle)) + { + const char *pool_name = hypre_HandleUmpirePinnedPoolName(handle); + umpire_resourcemanager_get_allocator_by_name(rm_ptr, pool_name, &allocator); + memoryB[6] = umpire_allocator_get_current_size(&allocator); + memoryB[7] = umpire_allocator_get_high_watermark(&allocator); + } +#endif + + /* Convert bytes to GB */ + for (i = 0; i < 8; i++) + { + memory[i] = ((HYPRE_Real) memoryB[i]) / ((HYPRE_Real) (1 << 30)); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireDevicePoolSize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireDevicePoolSize(size_t nbytes) +{ + hypre_HandleUmpireDevicePoolSize(hypre_handle()) = nbytes; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireUMPoolSize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireUMPoolSize(size_t nbytes) +{ + hypre_HandleUmpireUMPoolSize(hypre_handle()) = nbytes; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireHostPoolSize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireHostPoolSize(size_t nbytes) +{ + hypre_HandleUmpireHostPoolSize(hypre_handle()) = 
nbytes; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpirePinnedPoolSize + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpirePinnedPoolSize(size_t nbytes) +{ + hypre_HandleUmpirePinnedPoolSize(hypre_handle()) = nbytes; + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireDevicePoolName + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireDevicePoolName(const char *pool_name) +{ + if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + strcpy(hypre_HandleUmpireDevicePoolName(hypre_handle()), pool_name); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireUMPoolName + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireUMPoolName(const char *pool_name) +{ + if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + strcpy(hypre_HandleUmpireUMPoolName(hypre_handle()), pool_name); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpireHostPoolName + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpireHostPoolName(const char *pool_name) +{ + if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + strcpy(hypre_HandleUmpireHostPoolName(hypre_handle()), pool_name); + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_SetUmpirePinnedPoolName + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetUmpirePinnedPoolName(const char *pool_name) +{ + if (strlen(pool_name) > HYPRE_UMPIRE_POOL_NAME_MAX_LEN) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + strcpy(hypre_HandleUmpirePinnedPoolName(hypre_handle()), pool_name); + + return hypre_error_flag; +} + +#endif /* #if defined(HYPRE_USING_UMPIRE) */ diff --git a/3rd_party/hypre/src/utilities/memory.h b/3rd_party/hypre/src/utilities/memory.h index 86464d644..70accd7f2 100644 --- a/3rd_party/hypre/src/utilities/memory.h +++ b/3rd_party/hypre/src/utilities/memory.h @@ -75,18 +75,6 @@ //#pragma omp requires unified_shared_memory #endif -#if defined(HYPRE_USING_UMPIRE) -#include "umpire/config.hpp" -#if UMPIRE_VERSION_MAJOR >= 2022 -#include "umpire/interface/c_fortran/umpire.h" -#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_quick_pool -#else -#include "umpire/interface/umpire.h" -#define hypre_umpire_resourcemanager_make_allocator_pool umpire_resourcemanager_make_allocator_pool -#endif -#define HYPRE_UMPIRE_POOL_NAME_MAX_LEN 1024 -#endif - /* stringification: * _Pragma(string-literal), so we need to cast argument to a string * The three dots as last argument of the macro tells compiler that this is a variadic macro. 
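For reference, the HYPRE_SetUmpire*PoolSize/Name setters shown above only record values in the hypre handle; following hypre's usual convention they are called after HYPRE_Initialize() and before the first allocation on the corresponding memory kind. A minimal usage sketch, not part of this patch (the pool names and sizes below are made-up illustrations, and it assumes hypre was configured with Umpire so that HYPRE_USING_UMPIRE is defined):

// Illustrative sketch only: override the default Umpire pool configuration
// from an application. Names must stay within HYPRE_UMPIRE_POOL_NAME_MAX_LEN.
#include "HYPRE_utilities.h"

void configureHypreUmpirePools()
{
   HYPRE_Initialize(); // hypre's usual convention: initialize first, then adjust pool settings

#if defined(HYPRE_USING_UMPIRE)
   HYPRE_SetUmpireDevicePoolName("APP_HYPRE_DEVICE_POOL");
   HYPRE_SetUmpireDevicePoolSize((size_t) (8LL * (1 << 30))); // 8 GB instead of the 4 GB default

   HYPRE_SetUmpireUMPoolName("APP_HYPRE_UM_POOL");
   HYPRE_SetUmpireUMPoolSize((size_t) (2LL * (1 << 30)));     // 2 GB
#endif
}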
@@ -204,7 +192,17 @@ HYPRE_Int hypre_umpire_um_pooled_allocate(void **ptr, size_t nbytes); HYPRE_Int hypre_umpire_um_pooled_free(void *ptr); HYPRE_Int hypre_umpire_pinned_pooled_allocate(void **ptr, size_t nbytes); HYPRE_Int hypre_umpire_pinned_pooled_free(void *ptr); - +HYPRE_Int hypre_UmpireInit(hypre_Handle *hypre_handle_); +HYPRE_Int hypre_UmpireFinalize(hypre_Handle *hypre_handle_); +HYPRE_Int hypre_UmpireGetCurrentMemoryUsage(MPI_Comm comm, HYPRE_Real *current); +HYPRE_Int hypre_UmpireMemoryGetUsage(HYPRE_Real *memory); +HYPRE_Int hypre_HostMemoryGetUsage(HYPRE_Real *mem); +HYPRE_Int hypre_MemoryPrintUsage(MPI_Comm comm, HYPRE_Int level, + const char *function, HYPRE_Int line); +#define HYPRE_PRINT_MEMORY_USAGE(comm) hypre_MemoryPrintUsage(comm,\ + hypre_HandleLogLevel(hypre_handle()),\ + __func__,\ + __LINE__) /* memory_dmalloc.c */ HYPRE_Int hypre_InitMemoryDebugDML( HYPRE_Int id ); HYPRE_Int hypre_FinalizeMemoryDebugDML( void ); @@ -213,10 +211,6 @@ char *hypre_CAllocDML( HYPRE_Int count, HYPRE_Int elt_size, char *file, HYPRE_In char *hypre_ReAllocDML( char *ptr, HYPRE_Int size, char *file, HYPRE_Int line ); void hypre_FreeDML( char *ptr, char *file, HYPRE_Int line ); -/* GPU malloc prototype */ -typedef void (*GPUMallocFunc)(void **, size_t); -typedef void (*GPUMfreeFunc)(void *); - #ifdef __cplusplus } #endif diff --git a/3rd_party/hypre/src/utilities/protos.h b/3rd_party/hypre/src/utilities/protos.h index 013fbf8fa..159bb0e74 100644 --- a/3rd_party/hypre/src/utilities/protos.h +++ b/3rd_party/hypre/src/utilities/protos.h @@ -53,8 +53,6 @@ HYPRE_Int hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_); HYPRE_Int hypre_GetDevice(hypre_int *device_id); HYPRE_Int hypre_GetDeviceCount(hypre_int *device_count); HYPRE_Int hypre_GetDeviceLastError(void); -HYPRE_Int hypre_UmpireInit(hypre_Handle *hypre_handle_); -HYPRE_Int hypre_UmpireFinalize(hypre_Handle *hypre_handle_); HYPRE_Int hypre_GetDeviceMaxShmemSize(hypre_int device_id, hypre_int *max_size_ptr, hypre_int *max_size_optin_ptr); @@ -359,6 +357,9 @@ HYPRE_Int hypre_bind_device_id(HYPRE_Int device_id_in, HYPRE_Int myid, HYPRE_Int nproc, MPI_Comm comm); HYPRE_Int hypre_bind_device(HYPRE_Int myid, HYPRE_Int nproc, MPI_Comm comm); +/* stl_ops.c */ +HYPRE_Int hypre_IntSequence(HYPRE_MemoryLocation memory_location, HYPRE_Int size, HYPRE_Int *data); + /* nvtx.c */ void hypre_GpuProfilingPushRangeColor(const char *name, HYPRE_Int cid); void hypre_GpuProfilingPushRange(const char *name); @@ -372,6 +373,7 @@ HYPRE_Int hypre_CheckDirExists(const char *path); HYPRE_Int hypre_CreateDir(const char *path); HYPRE_Int hypre_CreateNextDirOfSequence(const char *basepath, const char *prefix, char **fullpath_ptr); +char* hypre_ConvertIndicesToString(HYPRE_Int size, HYPRE_Int *indices); HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); HYPRE_Int hypre_RestoreSyncCudaCompute(void); @@ -379,6 +381,7 @@ HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); HYPRE_Int hypre_ForceSyncComputeStream(hypre_Handle *hypre_handle); /* handle.c */ +HYPRE_Int hypre_SetLogLevel( HYPRE_Int log_level ); HYPRE_Int hypre_SetSpTransUseVendor( HYPRE_Int use_vendor ); HYPRE_Int hypre_SetSpMVUseVendor( HYPRE_Int use_vendor ); HYPRE_Int hypre_SetSpGemmUseVendor( HYPRE_Int use_vendor ); diff --git a/3rd_party/hypre/src/utilities/stl_ops.c b/3rd_party/hypre/src/utilities/stl_ops.c new file mode 100644 index 000000000..3f867d9b9 --- /dev/null +++ b/3rd_party/hypre/src/utilities/stl_ops.c @@ -0,0 +1,53 @@ 
+/****************************************************************************** + * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#include "_hypre_onedpl.hpp" +#include "_hypre_utilities.h" +#include "_hypre_utilities.hpp" +#include + +/*-------------------------------------------------------------------- + * hypre_IntSequence + * + * Generate a linear sequence of integers from 0 to size-1 and store + * them in the provided data array. + *--------------------------------------------------------------------*/ + +HYPRE_Int +hypre_IntSequence(HYPRE_MemoryLocation memory_location, + HYPRE_Int size, + HYPRE_Int *data) +{ +#if !defined (HYPRE_USING_GPU) + HYPRE_UNUSED_VAR(memory_location); +#endif + + HYPRE_Int i; + +#if defined (HYPRE_USING_GPU) + if (hypre_GetExecPolicy1(memory_location) == HYPRE_EXEC_DEVICE) + { +#if defined(HYPRE_USING_SYCL) + hypreSycl_sequence(data, data + size, 0); +#else + HYPRE_THRUST_CALL(sequence, data, data + size); +#endif + } + else +#endif + { +#ifdef HYPRE_USING_OPENMP + #pragma omp parallel for HYPRE_SMP_SCHEDULE +#endif + for (i = 0; i < size; i++) + { + data[i] = i; + } + } + + return hypre_error_flag; +} diff --git a/3rd_party/hypre/src/utilities/utilities.c b/3rd_party/hypre/src/utilities/utilities.c index 61bde8466..454a0e308 100644 --- a/3rd_party/hypre/src/utilities/utilities.c +++ b/3rd_party/hypre/src/utilities/utilities.c @@ -202,3 +202,66 @@ hypre_CreateNextDirOfSequence(const char *basepath, const char *prefix, char **f return hypre_error_flag; } + +/*-------------------------------------------------------------------- + * hypre_ConvertIndicesToString + * + * Converts an array of integers (indices) into a formatted string. + * The function creates a string representing the array in a comma- + * separated format, enclosed within square brackets ("[]"). + * + * - If the input array is empty (size = 0), it returns a string "[]". + * - The resulting string includes the list of integers with proper + * formatting: each integer is separated by a comma and a space. + * + * Parameters: + * - size: Number of elements in the input array. + * - indices: Pointer to the array of integers (HYPRE_Int) to convert. + * + * Returns: + * - A dynamically allocated string representing the integer array. 
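+ *
+ * Example (illustrative): size = 3 and indices = {4, 7, 9} produce the
+ * string "[4, 7, 9]"; size = 0 produces "[]". The caller owns the result
+ * and releases it with hypre_TFree(string, HYPRE_MEMORY_HOST).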
+ *--------------------------------------------------------------------*/ + +char* +hypre_ConvertIndicesToString(HYPRE_Int size, + HYPRE_Int *indices) +{ + HYPRE_Int max_length; + HYPRE_Int i, length; + char *string; + char *pos; + + if (!size) + { + string = hypre_TAlloc(char, 3, HYPRE_MEMORY_HOST); + hypre_sprintf(string, "[]"); + + return string; + } + + /* Estimate maximum string needed */ + max_length = 12 * size + 3; + string = hypre_TAlloc(char, max_length, HYPRE_MEMORY_HOST); + + pos = string; + length = hypre_sprintf(pos, "["); + pos += length; + + for (i = 0; i < size; i++) + { + /* Add comma before all but the first element */ + if (i > 0) + { + length = hypre_sprintf(pos, ", "); + pos += length; + } + + /* Write integer as string */ + length = hypre_sprintf(pos, "%d", indices[i]); + pos += length; + } + + hypre_sprintf(pos, "]"); + + return string; +} diff --git a/3rd_party/occa/src/occa/internal/lang/modes/dpcpp.cpp b/3rd_party/occa/src/occa/internal/lang/modes/dpcpp.cpp index bd59f432b..ebaa682a0 100644 --- a/3rd_party/occa/src/occa/internal/lang/modes/dpcpp.cpp +++ b/3rd_party/occa/src/occa/internal/lang/modes/dpcpp.cpp @@ -154,13 +154,12 @@ namespace occa return ""; } - // @note: As of SYCL 2020 this will need to change from `CL/sycl.hpp` to `sycl.hpp` void dpcppParser::setupHeaders() { root.addFirst( *(new directiveStatement( &root, - directiveToken(root.source->origin, "include \n using namespace sycl;\n")))); + directiveToken(root.source->origin, "include \n using namespace sycl;\n")))); } void dpcppParser::addExtensions() diff --git a/3rd_party/update.sh b/3rd_party/update.sh index 94bbff030..0b36081d4 100755 --- a/3rd_party/update.sh +++ b/3rd_party/update.sh @@ -38,7 +38,7 @@ elif [ "$1" == "hypre" ]; then git rm -rf 3rd_party/hypre git commit -m 'remove hypre' rm -rf 3rd_party/hypre -git subtree add --prefix 3rd_party/hypre https://github.com/hypre-space/hypre.git v2.31.0 --squash +git subtree add --prefix 3rd_party/hypre https://github.com/hypre-space/hypre.git v2.32.0 --squash rm -rf 3rd_party/hypre/src/examples 3rd_party/hypre/src/docs 3rd_party/hypre/src/test git reset HEAD~2 --soft git add -u diff --git a/CMakeLists.txt b/CMakeLists.txt index 72b9d0e73..2bf850044 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.21) -project(NekRS LANGUAGES C CXX Fortran VERSION 24.0.3) +project(NekRS LANGUAGES C CXX Fortran VERSION 24.0.4) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) @@ -148,7 +148,7 @@ set(OCCA_ENABLE_METAL OFF CACHE BOOL "Enable OCCA Metal support") set(ENABLE_AMGX OFF CACHE BOOL "Enable NVIDIA AMGX support") set(ENABLE_HYPRE_GPU ON CACHE BOOL "Enable HYPRE GPU support") -set(NEKRS_GPU_MPI ON CACHE BOOL "Enable GPU aware MPI") +set(NEKRS_GPU_MPI OFF CACHE BOOL "Enable GPU aware MPI") set(ENABLE_CVODE OFF CACHE BOOL "Enable CVODE support") set(ENABLE_ADIOS ON CACHE BOOL "Enable ADIOS support") set(ADIOS2_INSTALL_DIR "" CACHE STRING "ADIOS install directory") @@ -333,6 +333,11 @@ configure_file( ${CMAKE_INSTALL_PREFIX}/nekrs.conf @ONLY) +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/examples/CMakeLists.txt # Input file + ${CMAKE_INSTALL_PREFIX}/examples/CMakeLists.txt # Output file (build directory) + @ONLY) + file(MAKE_DIRECTORY ${CMAKE_INSTALL_PREFIX}/3rd_party) install( diff --git a/RELEASE.md b/RELEASE.md index 43dffaf70..570fe397c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,25 +3,26 @@ ## What is new? 
* FP32 solver mode
+* DPCPP backend to support Intel GPUs
* Interpolation based velocity recycling
* [Ascent](https://ascent.readthedocs.io/en/latest/) in situ visualisation plugin
* iofld class reading/writing field files including [ADIOS2](https://adios2.readthedocs.io/) support
* Additional output options (element filter and interpolation on uniform grid / different polynomial-order)
* Multi session nek-nek including multi-rate time stepping
+* Improved memory management
* CHT nek-nek support
* nek-nek support for nrsqsub scripts
* Improved JIT compilation performance
-* HIP support for BoomerAMG
-* Intel GPU support
+* HIP support for SEMFEM
* Aero forces
* opSEM class
* Mesh surface ops
* Linear implicit velocity source term
-* Combined CG for improved performance
* Various bug fixes

## Good to know

+* GPU aware MPI is disabled by default (`NEKRS_GPU_MPI=0`)
* HYPRE replaces AmgX
* [reproducibility] variable time step controller restricts dt to 5 significant digits
* after fixing a bug in the linear solver residual norm, iteration counts have increased compared to previous versions
@@ -31,20 +32,18 @@
This list provides an overview of the most significant changes in this release, although it may not encompass all modifications. We acknowledge that this release introduces several breaking changes. These adjustments were essential to enhance the stability of the user interface in future iterations. We apologize for any inconvenience this may cause.
* run `build.sh` instead of `nrsconfig` to build the code
-* change par section `TEMPERATURE` to `SCALAR00` in case it does not represent indeed a physical temperature
+* change par section `SCALAR00` to `TEMPERATURE` in case it indeed represents a physical temperature
* `velocityDirichletConditions` -> `codedFixedValueVelocity` (same for scalars)
* `velocityNeumannConditions` -> `codedFixedGradientVelocity` (same for scalars)
-* `nek::useric` is no longer automatically called, if needed call it in `UDF_Setup` (see e.g. lowMach example)
-* `nek::userchk` is no longer called automatically
+* `nek::userchk` is no longer called automatically during the setup phase
* use temporary instead of `nrs->U` and copy to `nrs->o_U`
* use temporary instead of `cds->S` and copy to `cds->o_S`
* use `auto [x, y, z] = mesh->xyzHost()` instead of `mesh->x` (same for other components)
-* `nrs->meshV` -> `nrs->mesh`
* `nrs->_mesh` -> `cds->mesh[0]`
* `nek::ocopyToNek` -> `nrs->copyToNek`
* `nek::ocopyFromNek` -> `nek::copyFromNek`
* send a signal (defined in env-var `NEKRS_SIGNUM_UPD`) to process the trigger file `nekrs.upd`
-* use `auto foo = platform->o_memPool.reserve(nWords)` instead of e.g. `platform->o_mempool.slice0`
+* use `auto foo = platform->deviceMemoryPool.reserve(nWords)` instead of e.g. `platform->o_mempool.slice0`
* change count argument of `occa::memory::slice, occa::memory::copyFrom, occa::memory::copyTo` to number of words instead of bytes
* define `time` as double (instead of dfloat) in all UDF functions
* remove `nrs_t` argument from UDF API functions (nrs object is now globally accessible within udf if the Navier Stokes solver is enabled)
@@ -79,7 +78,7 @@ This list provides an overview of the most significant changes in this release,

## Thanks to our Contributors

-@kris-rowe, @yslan, @MalachiTimothyPhillips, @tcew
+@kris-rowe, @tcew, @yslan, @MalachiTimothyPhillips, @thilinarmtb

We are grateful to all who added new features, filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions.
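To make the migration notes above concrete, a minimal hedged sketch of a v24-style UDF fragment follows; it is illustrative only and not part of this patch. The buffer size (mesh->Nlocal words) and the fill value 1.0 are arbitrary, and only calls that appear in the notes above or in the example diffs elsewhere in this patch are used.

// Illustrative v24-style UDF fragment (not from this patch):
// - no nrs_t* argument; the global nrs/platform objects are used instead
// - time is passed as double
// - host coordinates come from mesh->xyzHost()
// - scratch memory comes from platform->deviceMemoryPool, sized in words
void UDF_ExecuteStep(double time, int tstep)
{
  auto mesh = nrs->mesh;

  auto [x, y, z] = mesh->xyzHost(); // replaces direct access to mesh->x/y/z
  if (platform->comm.mpiRank == 0 && tstep == 0) {
    std::cout << "first grid point: " << x[0] << " " << y[0] << " " << z[0] << std::endl;
  }

  // replaces platform->o_mempool.slice0; the count is a number of words, not bytes
  auto o_tmp = platform->deviceMemoryPool.reserve(mesh->Nlocal);
  platform->linAlg->fill(mesh->Nlocal, 1.0, o_tmp);
}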
diff --git a/cmake/nekrs.cmake b/cmake/nekrs.cmake index ee4da4ec9..0dc173660 100644 --- a/cmake/nekrs.cmake +++ b/cmake/nekrs.cmake @@ -31,6 +31,7 @@ set(OGS_SOURCES set(NRS_SRC src/lib/nekrs.cpp + src/core/threadPool.cpp src/core/io/iofld.cpp src/core/io/iofldFactory.cpp src/core/io/iofldNek.cpp diff --git a/doc/envHelp.txt b/doc/envHelp.txt index f110cc0cf..80cc24f0d 100644 --- a/doc/envHelp.txt +++ b/doc/envHelp.txt @@ -3,6 +3,7 @@ Variable Value(s) Description / Comment ------------------------------------------------------------------------------------------------------------------------------------ NEKRS_HOME string Installation directory +NEKRS_JITC_NTHREADS int NEKRS_CACHE_DIR string Case cache directory NEKRS_LOCAL_TMP_DIR string Node local temp directory NEKRS_CACHE_BCAST 0[D]/1 Broadcast cache to NEKRS_LOCAL_TMP_DIR after JIT compilation diff --git a/doc/parHelp.txt b/doc/parHelp.txt index e02e4c099..5fa649984 100644 --- a/doc/parHelp.txt +++ b/doc/parHelp.txt @@ -31,6 +31,7 @@ startFrom "" + u read velocity + t read temperature + s00 s01 s02 ... read scalars + + int use point interpolation timeStepper tombo1, tombo2 [D], tombo3 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 99a4b18b1..f3422fd73 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -9,12 +9,13 @@ else() endif() set (NEKRS_HOME "$ENV{NEKRS_HOME}") +set(BACKEND "@BACKEND_DEFAULT@") set(EXAMPLES_DIR "${CMAKE_CURRENT_LIST_DIR}") set(LAUNCHER "${NEKRS_HOME}/bin/nrsmpi" CACHE STRING "Path to the launcher executable") set(NRANKS "0" CACHE STRING "Number of MPI ranks to run a test") set(TESTS "") - + message(STATUS "backend: ${BACKEND}") if(NRANKS GREATER 0) message(STATUS "numer of MPI ranks to run tests: ${NRANKS}") endif() @@ -57,12 +58,10 @@ function(add target num_processes par_file cimodes fp32 working_directory build_ if(build_only) add_test(NAME ${name} WORKING_DIRECTORY ${working_directory} - COMMAND ${LAUNCHER} ${par_file} ${num_processes} --cimode ${cimode} --build-only ${num_processes} - DEPENDS ${target}) + COMMAND ${LAUNCHER} ${par_file} ${num_processes} --cimode ${cimode} --build-only ${num_processes}) else() add_test(NAME ${name} WORKING_DIRECTORY ${working_directory} - COMMAND ${LAUNCHER} ${par_file} ${num_processes} --cimode ${cimode} - DEPENDS ${target}) + COMMAND ${LAUNCHER} ${par_file} ${num_processes} --cimode ${cimode}) endif() set_tests_properties(${name} PROPERTIES ENVIRONMENT "FP32=${fp32_value}") @@ -102,10 +101,15 @@ endfunction() ## TESTS ## add("ethier-buildOnly" 2 "ethier.par" "1" "${USE_FP32}" "${EXAMPLES_DIR}/ethier" ON) -add("ethier" 2 "ethier.par" "1;2;3;4;7;8;9;10;11;12;14;15;19" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") +add("ethier" 2 "ethier.par" "1;2;4;7;8;9;10;11;12;14;15;19;23" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") + +if(NOT BACKEND STREQUAL "DPCPP") + add("ethier" 2 "ethier.par" "3" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") +endif() + add("ethier" 2 "mv_ethier.par" "5;6" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") add("ethier" 2 "ethierScalar.par" "13" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") -add("ethier" 1 "ethier.par" "22" ON "${dir}" "${BUILD_ONLY}") +add("ethier" 2 "ethier.par" "22" ON "${dir}" "${BUILD_ONLY}") add("lowMach" 2 "lowMach.par" "1" "${USE_FP32}" "${dir}" "${BUILD_ONLY}") diff --git a/examples/channel/channel.udf b/examples/channel/channel.udf index 261c94a5e..5a8933fa8 100644 --- a/examples/channel/channel.udf +++ b/examples/channel/channel.udf @@ -41,11 +41,6 @@ void UDF_Setup() nrs->userVelocitySource = &userf; 
nrs->userProperties = &uservp; - if (platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } - o_nekFU.resize(nrs->NVfields * nrs->fieldOffset); o_nekMue.resize(mesh->Nlocal); diff --git a/examples/eddyNekNek/ci.inc b/examples/eddyNekNek/ci.inc index de4906369..0c7db8bf1 100644 --- a/examples/eddyNekNek/ci.inc +++ b/examples/eddyNekNek/ci.inc @@ -61,7 +61,7 @@ void ciTestPartitionOfUnity(nrs_t *nrs, double time, int tstep) const auto expectedGlobalVolume = (maxX - minX) * (maxY - minY) * (maxZ - minZ); const auto o_partition = nrs->neknek->partitionOfUnity(); - auto o_volume = platform->o_memPool.reserve(mesh->Nlocal); + auto o_volume = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->axmyz(mesh->Nlocal, 1.0, o_partition, mesh->o_Jw, o_volume); @@ -166,7 +166,7 @@ void ciTestInitialStep(nrs_t *nrs, double time, int tstep) neknek->updateBoundary(tstep, 2, time); // pass in stage == 2 to prevent lagging - auto o_Uexp = platform->o_memPool.reserve(nrs->NVfields * neknekFieldOffset); + auto o_Uexp = platform->deviceMemoryPool.reserve(nrs->NVfields * neknekFieldOffset); auto npt = neknek->npt(); exactUVW(npt, neknekFieldOffset, time, neknek->o_x(), neknek->o_y(), neknek->o_z(), o_Uexp); @@ -227,14 +227,14 @@ void ciTestErrors(nrs_t *nrs, double time, int tstep) auto mesh = nrs->mesh; - auto o_Uexact = platform->o_memPool.reserve(mesh->dim * nrs->fieldOffset); + auto o_Uexact = platform->deviceMemoryPool.reserve(mesh->dim * nrs->fieldOffset); exactUVW(mesh->Nlocal, nrs->fieldOffset, time, mesh->o_x, mesh->o_y, mesh->o_z, o_Uexact); auto uErr = platform->linAlg->maxRelativeError( mesh->Nlocal, mesh->dim, nrs->fieldOffset, 1e-6, nrs->o_U, o_Uexact, platform->comm.mpiCommParent); - auto o_Sexact = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_Sexact = platform->deviceMemoryPool.reserve(nrs->fieldOffset); o_Sexact.copyFrom(o_Uexact, nrs->fieldOffset); auto o_s00 = nrs->cds->o_S.slice(nrs->cds->fieldOffsetScan[0], mesh->Nlocal); diff --git a/examples/eddyNekNek/eddy.udf b/examples/eddyNekNek/eddy.udf index bc94e96c3..7fb71d137 100644 --- a/examples/eddyNekNek/eddy.udf +++ b/examples/eddyNekNek/eddy.udf @@ -226,11 +226,6 @@ void UDF_Setup() { nrs->userConvergenceCheck = timeStepConverged; nrs->userScalarSource = &userq; - - if (!platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } } void UDF_ExecuteStep(double time, int tstep) diff --git a/examples/ethier/ci.inc b/examples/ethier/ci.inc index af86b3fac..b099f346c 100644 --- a/examples/ethier/ci.inc +++ b/examples/ethier/ci.inc @@ -145,7 +145,6 @@ void ciSetup(MPI_Comm comm, setupAide &options) if (ciMode == 2 || ciMode == 19 || ciMode == 22) { options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); options.setArgs("SUBCYCLING STEPS", std::string("1")); - options.setArgs("PRESSURE INITIAL GUESS", "PROJECTION-ACONJ"); if (ciMode == 22) { options.setArgs("VELOCITY SOLVER TOLERANCE", std::string("1e-10")); @@ -316,7 +315,11 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("SCALAR01 SOLVER", "CVODE"); options.setArgs("SUBCYCLING STEPS", std::string("1")); } - + if (ciMode == 23) { + options.setArgs("NUMBER TIMESTEPS", std::string("0")); + options.setArgs("END TIME", std::string("0")); + options.setArgs("CHECKPOINT INTERVAL", std::string("0")); + } } auto generatePoints(int nPoints, double R) @@ -474,7 +477,7 @@ void ciTestPointInterpolation(nrs_t *nrs, double time, occa::kernel exactUVWPKer exactUVWPKernel(mesh->Nlocal, time, 
mesh->o_x, mesh->o_y, mesh->o_z, nrs->fieldOffset, o_Pe, o_Ue); const auto verbosity = pointInterpolation_t::VerbosityLevel::Detailed; - const auto threshold = (sizeof(dfloat) == sizeof(double)) ? 2e-8 : 1e-5; + const auto threshold = (sizeof(dfloat) == sizeof(double)) ? 3e-8 : 1e-5; const auto computeRelErrors = [&](dlong n, dlong offset, occa::memory o_Uexact, occa::memory o_Uinterp) { const auto absTol = 1e-5; @@ -575,7 +578,7 @@ void ciTestLVector(nrs_t *nrs, double time, occa::kernel exactUVWPKernel) auto mesh = nrs->mesh; exactUVWPKernel(mesh->Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, nrs->fieldOffset, nrs->o_P, nrs->o_U); - auto o_Usave = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + auto o_Usave = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->o_U.copyTo(o_Usave, nrs->NVfields * nrs->fieldOffset); std::vector meshes = {nrs->mesh, nrs->mesh, nrs->mesh}; @@ -590,7 +593,7 @@ void ciTestLVector(nrs_t *nrs, double time, occa::kernel exactUVWPKernel) // sanity check: is o_U the same as before? auto checkErr = [&](bool subtract = true){ - auto o_Uerr = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + auto o_Uerr = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); if(subtract){ platform->linAlg->axpbyzMany(nrs->mesh->Nlocal, nrs->NVfields, @@ -644,7 +647,7 @@ void ciTestLVector(nrs_t *nrs, double time, occa::kernel exactUVWPKernel) lengths_[is] = Nlocal; } - auto o_UL = platform->o_memPool.reserve(nEq); + auto o_UL = platform->deviceMemoryPool.reserve(nEq); UL.fieldOffsets(lengths_); UL.optr(o_UL); @@ -723,13 +726,50 @@ void ciTestSurfaceIntegral(nrs_t *nrs) } } +void ciCheckpointing(nrs_t *nrs) +{ + auto run = [&](const std::string& engine, const std::string& restartFile, bool readOnly = false) + { + const auto tol = 10 * std::numeric_limits::epsilon(); + const double time = 0.123; // just for testing, actual solution is based on t==0 + + if (!readOnly) { + platform->options.setArgs("CHECKPOINT ENGINE", engine); + nrs->checkpointWriter = iofldFactory::create(); + + nrs->writeCheckpoint(time, 0); + } + + nrs->restartFromFile(restartFile); + + double startTime; + platform->options.getArgs("START TIME", startTime); + + nrs->copyToNek(0.0, 0); + nek::userchk(); + + auto err = nek::ptr("errors"); + CiEvalTest("velocity", err[0] < tol); + CiEvalTest("pressure", err[1] < tol); + CiEvalTest("scalar00", err[2] < tol); + CiEvalTest("scalar01", err[3] < tol); + CiEvalTest("time", std::abs(startTime- time) < tol); + + nrs->checkpointWriter.reset(); + }; + + run("NEK", "ethier0.f00000"); + run("ADIOS", "ethier.bp"); + run("ADIOS", "ethier.bp+int", true); +} + void ciTestDistance(nrs_t *nrs) { auto mesh = nrs->mesh; int nbID = 1; int bID = 1; - occa::memory o_bID = platform->o_memPool.reserve(nbID); + occa::memory o_bID = platform->deviceMemoryPool.reserve(nbID); o_bID.copyFrom(&bID); auto o_dist = mesh->minDistance(nbID, o_bID, "cheap_dist"); @@ -755,7 +795,7 @@ void ciTestDistance(nrs_t *nrs) refVal[n] = ref; } - auto o_refVal = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_refVal = platform->deviceMemoryPool.reserve(nrs->fieldOffset); o_refVal.copyFrom(refVal.data(), mesh->Nlocal); platform->linAlg->axpby(mesh->Nlocal, 1.0, o_refVal, -1.0, o_dist); @@ -878,11 +918,7 @@ void ciTestLinAlg(nrs_t *nrs, const int N) evaluateMethod(linAlg->amax(N, o_x, comm), &amaxRef, "amax"); o_x.free(); - int fieldOffset = N; - const int pageW = ALIGN_SIZE / sizeof(dfloat); - if (fieldOffset % pageW) { - fieldOffset = 
(fieldOffset / pageW + 1) * pageW; - } + int fieldOffset = alignStride(N); constexpr int maxFields = 10; for (int Nfields = 1; Nfields <= maxFields; ++Nfields) { @@ -974,15 +1010,15 @@ void ciTestErrors(nrs_t *nrs, } int expectedNiterS00 = 17; - int expectedNiterS01 = 14; + int expectedNiterS01 = 17; if (ciMode == 19 && nrs->timeStepConverged) { expectedNiterS00 = 11; - expectedNiterS01 = 8; + expectedNiterS01 = 11; } if (ciMode == 22) { expectedNiterS00 = 13; - expectedNiterS01 = 11; + expectedNiterS01 = 13; } if (ciMode != 12 && nrs->timeStepConverged) { @@ -1072,7 +1108,7 @@ void ciTestErrors(nrs_t *nrs, velIterTest = abs(NiterU - 14) < 2 ; s1Test = abs((err[2] - 7.49E-12) / err[2]) < EPS; s2Test = abs((err[3] - 7.22E-12) / err[3]) < EPS; - pIterTest = abs(NiterP - 4) < 2; + pIterTest = abs(NiterP - 6) < 2; vxTest = abs((err[0] - 2.77E-10) / err[0]) < EPS; prTest = abs((err[1] - 7.14E-10) / err[1]) < EPS; break; @@ -1200,7 +1236,7 @@ void ciTestErrors(nrs_t *nrs, s2Test = abs((err[3] - 2E-13) / err[3]) < EPS; // just check for convergence - pIterTest = (NiterP < 300); + pIterTest = (NiterP < 500); vxTest = abs((err[0] - 1.4E-10) / err[0]) < EPS; prTest = abs((err[1] - 8.7E-9) / err[1]) < EPS; @@ -1332,8 +1368,8 @@ void ciTestErrors(nrs_t *nrs, s1Test = err[2] < 2e-6; s2Test = err[3] < 2e-6; pIterTest = abs(NiterP - 1) < 2; - vxTest = err[0] < 8e-7; - prTest = err[1] < 2e-5; + vxTest = err[0] < 8e-5; + prTest = err[1] < 8e-4; break; } diff --git a/examples/ethier/ethier.par b/examples/ethier/ethier.par index 904d07be4..b03001d8b 100644 --- a/examples/ethier/ethier.par +++ b/examples/ethier/ethier.par @@ -1,5 +1,5 @@ [GENERAL] -verbose = true +#verbose = true polynomialOrder = 9 #startFrom = "restart.fld" stopAt = numSteps diff --git a/examples/ethier/ethier.udf b/examples/ethier/ethier.udf index 303e0e8f8..4332dbad0 100644 --- a/examples/ethier/ethier.udf +++ b/examples/ethier/ethier.udf @@ -337,11 +337,6 @@ int timeStepConverged(int stage) void UDF_Setup() { - if (platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } - // only ciMode 1 exercises the particles if (ciMode == 1) { @@ -494,6 +489,10 @@ void UDF_ExecuteStep(double time, int tstep) } } + if (ciMode == 23) { + ciCheckpointing(nrs); + } + // only ciMode 1 exercises the particles path if (ciMode == 1) { particles->integrate(time); diff --git a/examples/kershaw/kershaw.par b/examples/kershaw/kershaw.par index 15afb7bcb..5b5740f04 100644 --- a/examples/kershaw/kershaw.par +++ b/examples/kershaw/kershaw.par @@ -1,5 +1,5 @@ [GENERAL] -#verbose = true +verbose = false polynomialOrder = 7 timeStepper = tombo2 subCyclingSteps = 1 @@ -53,12 +53,12 @@ viscosity = 1.0 [CASEDATA] gsOverlap = 1 -bp5 = true +bp5 = true bp5Repetitions = 50 -bps5 = true +bps5 = true bps5Repetitions = 50 eps = 0.3 bp6 = true -bp6Repetitions = 50 +bp6Repetitions = 50 diff --git a/examples/kershaw/kershaw.udf b/examples/kershaw/kershaw.udf index 7b988ab6d..f3045331f 100644 --- a/examples/kershaw/kershaw.udf +++ b/examples/kershaw/kershaw.udf @@ -23,7 +23,7 @@ dfloat stdDev(const std::vector& data) void runBenchmarks(nrs_t *nrs) { - auto *mesh = nrs->mesh; + auto mesh = nrs->mesh; auto [x, y, z] = mesh->xyzHost(); std::vector rhs; @@ -88,7 +88,6 @@ void runBenchmarks(nrs_t *nrs) Nrep = bp6NReps; solver->options().setArgs("PRECONDITIONER", "NONE"); - solver->options().setArgs("BLOCK SOLVER", "TRUE"); solver->options().setArgs("SOLVER", "PCG"); solver->options().setArgs("MAXIMUM ITERATIONS", "1000"); 
solver->options().setArgs("SOLVER TOLERANCE", to_string_f(1e-15)); @@ -98,7 +97,6 @@ void runBenchmarks(nrs_t *nrs) Nrep = bp6NReps; solver->options().setArgs("PRECONDITIONER", "NONE"); - solver->options().setArgs("BLOCK SOLVER", "TRUE"); solver->options().setArgs("SOLVER", "PCG+COMBINED"); solver->options().setArgs("MAXIMUM ITERATIONS", "1000"); solver->options().setArgs("SOLVER TOLERANCE", to_string_f(1e-15)); @@ -122,8 +120,17 @@ void runBenchmarks(nrs_t *nrs) platform->device.finish(); MPI_Barrier(platform->comm.mpiComm); + occa::memory o_r; + occa::memory o_U; + if (benchmark == "BP5" || (benchmark == "BPS5")) { + o_r = o_rhs.slice(0, mesh->Nlocal); + o_U = nrs->o_U.slice(0, mesh->Nlocal); + } else { + o_r = o_rhs; + o_U = nrs->o_U.slice(0, nrs->NVfields * nrs->fieldOffset); + } const auto tStart = MPI_Wtime(); - solver->solve(o_lambda0, o_lambda1, o_rhs, nrs->o_U); + solver->solve(o_lambda0, o_lambda1, o_r, o_U); platform->device.finish(); platform->timer.set(solver->name() + "Solve", MPI_Wtime() - tStart); diff --git a/examples/ktauChannel/ci.inc b/examples/ktauChannel/ci.inc index c90a7a7d9..c2a357657 100644 --- a/examples/ktauChannel/ci.inc +++ b/examples/ktauChannel/ci.inc @@ -61,7 +61,7 @@ void ciTestErrors(nrs_t *nrs, double time, int tstep) auto forces = nrs->aeroForces(bidWall.size(), o_bidWall, o_Sij); o_Sij.free(); - auto o_tmp = platform->o_memPool.reserve(mesh->Nlocal); + auto o_tmp = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->fill(mesh->Nlocal, 1.0, o_tmp); const auto areaWall = mesh->surfaceAreaMultiplyIntegrate(bidWall.size(), o_bidWall, o_tmp); diff --git a/examples/lowMach/lowMach.udf b/examples/lowMach/lowMach.udf index 81625f3bd..53c716360 100644 --- a/examples/lowMach/lowMach.udf +++ b/examples/lowMach/lowMach.udf @@ -68,11 +68,6 @@ void UDF_Setup() nrs->userScalarSource = &userq; nrs->userProperties = &uservp; - if (platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } - o_beta.resize(nrs->fieldOffset); o_kappa.resize(nrs->fieldOffset); diff --git a/examples/mv_cyl/mv_cyl.udf b/examples/mv_cyl/mv_cyl.udf index 00cebc092..3a2b64b12 100644 --- a/examples/mv_cyl/mv_cyl.udf +++ b/examples/mv_cyl/mv_cyl.udf @@ -103,11 +103,6 @@ void UDF_Setup() nrs->userProperties = &uservp; nrs->userScalarSource = &userq; - if (platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } - o_beta.resize(nrs->fieldOffset); o_kappa.resize(nrs->fieldOffset); diff --git a/examples/shlChannel/channel.udf b/examples/shlChannel/channel.udf index 70e4696f3..607c4aff9 100644 --- a/examples/shlChannel/channel.udf +++ b/examples/shlChannel/channel.udf @@ -39,11 +39,6 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) void UDF_Setup() { nrs->userVelocitySource = &userf; - - if (platform->options.getArgs("RESTART FILE NAME").empty()) { - nek::getIC(); - nrs->copyFromNek(); - } } void UDF_ExecuteStep(double time, int tstep) diff --git a/examples/turbPipe/turbPipe.udf b/examples/turbPipe/turbPipe.udf index 358e402e8..d70f2cb66 100644 --- a/examples/turbPipe/turbPipe.udf +++ b/examples/turbPipe/turbPipe.udf @@ -92,7 +92,7 @@ void UDF_Setup() #if 0 iofld = iofldFactory::create(); iofld->open(mesh, iofld::mode::write, "qcrit"); - iofld->writeAttribute("uniform", "true"); + iofld->writeAttribute("uniformSpacing", "true"); iofld->writeAttribute("polynomialOrder", std::to_string(mesh->N + 2)); #if 0 diff --git a/scripts/nrsqsub_aurora b/scripts/nrsqsub_aurora index d14db62eb..881eb74bf 100755 
--- a/scripts/nrsqsub_aurora +++ b/scripts/nrsqsub_aurora @@ -2,8 +2,8 @@ set -e #-------------------------------------- -: ${QUEUE:="lustre_scaling"} -: ${NEKRS_GPU_MPI:=1} +: ${QUEUE:="EarlyAppAccess"} +: ${NEKRS_GPU_MPI:=0} : ${NEKRS_BACKEND:="dpcpp"} : ${RANKS_PER_NODE:=12} : ${RANKS_FOR_BUILD:=12} @@ -85,14 +85,14 @@ chmod u+x $CMD if [ $RUN_ONLY -eq 0 ]; then echo -e "\n# precompilation" >>$SFILE - CMD_build="mpiexec -n ${RANKS_FOR_BUILD} -ppn ${RANKS_FOR_BUILD} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup \${case_tmp} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args --build-only \${ntasks_tmp}" + CMD_build="mpiexec --no-vni -n ${RANKS_FOR_BUILD} -ppn ${RANKS_FOR_BUILD} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup \${case_tmp} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args --build-only \${ntasks_tmp}" add_build_CMD "$SFILE" "$CMD_build" "$TOTAL_RANKS" fi if [ $BUILD_ONLY -eq 0 ]; then link_neknek_logfile "$SFILE" echo -e "\n# actual run" >>$SFILE - echo "mpiexec -n ${TOTAL_RANKS} -ppn ${RANKS_PER_NODE} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup ${case} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args" >> $SFILE + echo "mpiexec --no-vni -n ${TOTAL_RANKS} -ppn ${RANKS_PER_NODE} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup ${case} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args" >> $SFILE fi qsub -q $QUEUE $SFILE diff --git a/scripts/nrsqsub_frontier b/scripts/nrsqsub_frontier index 7a3178c22..7f82dbb6c 100755 --- a/scripts/nrsqsub_frontier +++ b/scripts/nrsqsub_frontier @@ -50,9 +50,8 @@ echo "module load craype-accel-amd-gfx90a" >> $SFILE echo "module load cray-mpich" >> $SFILE echo "module load rocm" >> $SFILE echo "module load cmake" >> $SFILE -echo "module use /sw/frontier/ascent/modulefiles/" >> $SFILE -echo "module load ascent/0.9.2-gpu-mpi-omp" >> $SFILE -echo "module load adios2" >> $SFILE +#echo "module use /sw/frontier/ascent/modulefiles/" >> $SFILE +#echo "module load ascent/0.9.2-gpu-mpi-omp" >> $SFILE echo "module unload cray-libsci" >> $SFILE echo "module list" >> $SFILE @@ -63,11 +62,8 @@ echo "squeue -u \$USER" >>$SFILE echo "export MPICH_GPU_SUPPORT_ENABLED=1" >>$SFILE -# Ascent (the module is brocken after 07/16/24) -echo "export NEKRS_ASCENT_INSTALL_DIR=\"$OLCF_ASCENT_ROOT/\"" >> $SFILE - -# Adios2 -echo "export NEKRS_ADIOS2_INSTALL_DIR=$OLCF_ADIOS2_ROOT" >> $SFILE +# Ascent (the module is brocken for lacking VTKH after 07/16/24) +#echo "export NEKRS_ASCENT_INSTALL_DIR=\"$OLCF_ASCENT_ROOT/\"" >> $SFILE ## These must be set before compiling so the executable picks up GTL echo "export PE_MPICH_GTL_DIR_amd_gfx90a=\"-L${CRAY_MPICH_ROOTDIR}/gtl/lib\"" >> $SFILE @@ -97,32 +93,35 @@ echo "" >> $SFILE echo "date" >>$SFILE echo "" >> $SFILE -bin_nvme=$NVME_HOME"nekrs-bin" -bin_nvme_libs=$bin_nvme"_libs" -echo "sbcast -fp --send-libs $bin $bin_nvme" >> $SFILE -echo "if [ ! 
\"\$?\" == \"0\" ]; then" >> $SFILE -echo " echo \"SBCAST failed!\"" >> $SFILE -echo " exit 1" >> $SFILE -echo "fi" >> $SFILE - -echo "export LD_LIBRARY_PATH=$bin_nvme_libs:${LD_LIBRARY_PATH}" >> $SFILE -if [ $FP32 -eq 0 ]; then -echo "export LD_PRELOAD=$bin_nvme_libs/libnekrs.so:$bin_nvme_libs/libocca.so:$bin_nvme_libs/libnekrs-hypre-device.so:$bin_nvme_libs/libnekrs-hypre.so" >> $SFILE -else -echo "export LD_PRELOAD=$bin_nvme_libs/libnekrs-fp32.so:$bin_nvme_libs/libocca.so:$bin_nvme_libs/libnekrs-hypre-device.so:$bin_nvme_libs/libnekrs-hypre.so" >> $SFILE +if [ $NEKRS_CACHE_BCAST -eq 1 ]; then + bin_nvme=$NVME_HOME"nekrs-bin" + bin_nvme_libs=$bin_nvme"_libs" + echo "sbcast -fp --send-libs $bin $bin_nvme" >> $SFILE + echo "if [ ! \"\$?\" == \"0\" ]; then" >> $SFILE + echo " echo \"SBCAST failed!\"" >> $SFILE + echo " exit 1" >> $SFILE + echo "fi" >> $SFILE + + echo "export LD_LIBRARY_PATH=$bin_nvme_libs:${LD_LIBRARY_PATH}" >> $SFILE + if [ $FP32 -eq 0 ]; then + echo "export LD_PRELOAD=$bin_nvme_libs/libnekrs.so:$bin_nvme_libs/libocca.so:$bin_nvme_libs/libnekrs-hypre-device.so:$bin_nvme_libs/libnekrs-hypre.so" >> $SFILE + else + echo "export LD_PRELOAD=$bin_nvme_libs/libnekrs-fp32.so:$bin_nvme_libs/libocca.so:$bin_nvme_libs/libnekrs-hypre-device.so:$bin_nvme_libs/libnekrs-hypre.so" >> $SFILE + fi + + # OLCFDEV-1787: sbcast requires extra patch when using GPU-aware MPI + echo "ln -s $bin_nvme_libs/libhsa-runtime64.so.1 $bin_nvme_libs/libhsa-runtime64.so" >> $SFILE + echo "ln -s $bin_nvme_libs/libamdhip64.so.5 $bin_nvme_libs/libamdhip64.so" >> $SFILE + + echo "ls -ltra $NVME_HOME" >> $SFILE + echo "ls -ltra $bin_nvme_libs" >> $SFILE + echo "bin=$bin_nvme" >> $SFILE fi - -# OLCFDEV-1787: sbcast requires extra patch when using GPU-aware MPI -echo "ln -s $bin_nvme_libs/libhsa-runtime64.so.1 $bin_nvme_libs/libhsa-runtime64.so" >> $SFILE -echo "ln -s $bin_nvme_libs/libamdhip64.so.5 $bin_nvme_libs/libamdhip64.so" >> $SFILE - -echo "ls -ltra $NVME_HOME" >> $SFILE -echo "ls -ltra $bin_nvme_libs" >> $SFILE -echo "ldd $bin_nvme" >> $SFILE +echo "ldd $bin" >> $SFILE if [ $RUN_ONLY -eq 0 ]; then echo -e "\n# precompilation" >>$SFILE - CMD_build="srun -N 1 -n $gpu_per_node $bin_nvme --backend $backend --device-id 0 $extra_args --setup \$case_tmp --build-only \$ntasks_tmp" + CMD_build="srun -N 1 -n $gpu_per_node $bin --backend $backend --device-id 0 $extra_args --setup \$case_tmp --build-only \$ntasks_tmp" add_build_CMD "$SFILE" "$CMD_build" "$ntasks" fi @@ -130,7 +129,7 @@ fi if [ $BUILD_ONLY -eq 0 ]; then link_neknek_logfile "$SFILE" echo -e "\n# actual run" >>$SFILE - echo "srun -N $nodes -n $ntasks $bin_nvme --backend $backend --device-id 0 $extra_args --setup $case" >>$SFILE + echo "srun -N $nodes -n $ntasks $bin --backend $backend --device-id 0 $extra_args --setup $case" >>$SFILE fi sbatch $SFILE diff --git a/scripts/nrsqsub_sunspot b/scripts/nrsqsub_sunspot deleted file mode 100755 index 97753c116..000000000 --- a/scripts/nrsqsub_sunspot +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash -set -e - -#-------------------------------------- -: ${QUEUE:="workq"} -: ${NEKRS_GPU_MPI:=1} -: ${NEKRS_BACKEND:="dpcpp"} -: ${RANKS_PER_NODE:=12} -: ${RANKS_FOR_BUILD:=12} -: ${CPU_BIND_LIST:="0-7,104-111:8-15,112-119:16-23,120-127:24-31,128-135:32-39,136-143:40-47,144-151:52-59,156-163:60-67,164-171:68-75,172-179:76-83,180-187:84-91,188-195:92-99,196-203"} -: ${OCCA_DPCPP_COMPILER_FLAGS:="-O3 -fsycl -fsycl-targets=intel_gpu_pvc -ftarget-register-alloc-mode=pvc:auto -fma"} -: 
${ONEAPI_SDK:="oneapi/eng-compiler/2023.12.15.002"} -#-------------------------------------- - -source $NEKRS_HOME/bin/nrsqsub_utils -setup $# 1 - -TOTAL_RANKS=$(( nodes * RANKS_PER_NODE )) -gpus_per_node=6 -tiles_per_gpu=2 - -chk_case $TOTAL_RANKS - -#-------------------------------------- -# Generate the submission script -SFILE=s.bin -echo "#!/bin/bash" > $SFILE -echo "#PBS -A $PROJ_ID" >>$SFILE -echo "#PBS -N $jobname" >>$SFILE -echo "#PBS -l walltime=${time}:00" >>$SFILE -echo "#PBS -l select=$qnodes" >>$SFILE -echo "#PBS -l place=scatter" >>$SFILE -echo "#PBS -k doe" >>$SFILE -echo "#PBS -j oe" >>$SFILE - -echo "export TZ='/usr/share/zoneinfo/US/Central'" >> $SFILE - -# job to "run" from your submission directory -echo "cd \$PBS_O_WORKDIR" >> $SFILE - -echo "echo Jobid: \$PBS_JOBID" >>$SFILE -echo "echo Running on host \`hostname\`" >>$SFILE -echo "echo Running on nodes \`cat \$PBS_NODEFILE\`" >>$SFILE - -echo "module load ${ONEAPI_SDK}" >> $SFILE -echo "module load cmake" >> $SFILE -echo "module list" >> $SFILE - -echo "export NEKRS_HOME=$NEKRS_HOME" >>$SFILE -echo "export NEKRS_GPU_MPI=$NEKRS_GPU_MPI" >>$SFILE -echo "export MPICH_GPU_SUPPORT_ENABLED=$NEKRS_GPU_MPI" >> $SFILE - -echo "export OCCA_DPCPP_COMPILER_FLAGS=\"$OCCA_DPCPP_COMPILER_FLAGS\"" >> $SFILE - -# https://github.com/Nek5000/Nek5000/issues/759 -echo "export FI_CXI_RX_MATCH_MODE=hybrid" >> $SFILE - -# https://github.com/stgeke/nekRS/issues/1282 -echo "unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE" >> $SFILE -echo "unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE" >> $SFILE -echo "unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE" >> $SFILE - -CMD=.lhelper -echo "#!/bin/bash" > $CMD -echo "gpu_id=\$(((PALS_LOCAL_RANKID / ${tiles_per_gpu}) % ${gpus_per_node}))" >> $CMD -echo "tile_id=\$((PALS_LOCAL_RANKID % ${tiles_per_gpu}))" >> $CMD -echo "export ZE_AFFINITY_MASK=\$gpu_id.\$tile_id" >> $CMD -echo "\"\$@\"" >> $CMD -chmod u+x $CMD - -if [ $RUN_ONLY -eq 0 ]; then - echo -e "\n# precompilation" >>$SFILE - CMD_build="mpiexec -n ${RANKS_FOR_BUILD} -ppn ${RANKS_FOR_BUILD} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup \${case_tmp} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args --build-only \${ntasks_tmp}" - add_build_CMD "$SFILE" "$CMD_build" "$TOTAL_RANKS" -fi - -if [ $BUILD_ONLY -eq 0 ]; then - link_neknek_logfile "$SFILE" - echo -e "\n# actual run" >>$SFILE - echo "mpiexec -n ${TOTAL_RANKS} -ppn ${RANKS_PER_NODE} --cpu-bind=list:${CPU_BIND_LIST} -- ./${CMD} $bin --setup ${case} --backend ${NEKRS_BACKEND} --device-id 0 $extra_args" >> $SFILE -fi -qsub -q $QUEUE $SFILE diff --git a/scripts/nrsqsub_utils b/scripts/nrsqsub_utils index a92048f7e..769f2e56b 100644 --- a/scripts/nrsqsub_utils +++ b/scripts/nrsqsub_utils @@ -32,7 +32,7 @@ if [ ! -f $case.par ]; then NEKRS_CACHE_BCAST=0 if ! [ "$nsessions" -gt 1 ] 2> /dev/null; then - echo "ERROR: not enough of sessions in $case.sess" + echo "ERROR: number of sessions in $case.sess has to be at least 2" exit 1 fi fi @@ -40,7 +40,7 @@ fi jobname="nekRS_" if [ $nsessions -gt 1 ]; then - jobname+="nn_" + jobname+="neknek_" fi if [ $BUILD_ONLY -eq 1 ]; then jobname+="build_" @@ -51,6 +51,14 @@ if [ "$nsessions" -gt 1 ] 2> /dev/null; then case=$case.sess fi +if [ $nsessions -gt 1 ]; then + if [ $nodes -lt $nsessions ]; then + echo "ERROR: number of nodes is smaller than number of sessions!" 
+ exit 1 + fi +fi + + if [ $BUILD_ONLY -eq 0 ]; then qnodes=$nodes if [ $nsessions -gt $nodes ] 2> /dev/null; then @@ -154,7 +162,7 @@ chk_case() { else ntmp=`cat $case | awk -F ':' '{sum+=$2} END{print sum}'` if [ $__ntasks -ne $ntmp ]; then - echo "ERROR: total #ranks in $case not matched with $__ntasks" + echo "ERROR: number of ranks in $case does not match job size" exit 1 fi @@ -225,6 +233,7 @@ link_neknek_logfile() { cname_sess="${p%:*}" folder_sess="$(dirname "${cname_sess}")" echo " ln -sf $PWD/$folder_sess/logfile-\$jobid $PWD/$folder_sess/logfile" >> $f + echo " echo "redirect output to $PWD/$folder_sess/logfile-\$jobid"" >> $f done < $case echo "fi" >> $f fi diff --git a/src/bench/advsub/benchmark.cpp b/src/bench/advsub/benchmark.cpp index e5e893f1d..1d4d4765d 100644 --- a/src/bench/advsub/benchmark.cpp +++ b/src/bench/advsub/benchmark.cpp @@ -60,7 +60,7 @@ std::map cachedResults; template occa::kernel benchmarkAdvsub(int Nfields, - int Nelements, + dlong Nelements, int Nq, int cubNq, int nEXT, @@ -105,11 +105,11 @@ occa::kernel benchmarkAdvsub(int Nfields, const int N = Nq - 1; const int Np = Nq * Nq * Nq; const int Ntotal = Np * Nelements; - const int fieldOffset = alignStride(Ntotal); + const dlong fieldOffset = alignStride(Ntotal); const int cubN = cubNq - 1; const int cubNp = cubNq * cubNq * cubNq; - const int cubatureOffset = alignStride(cubNp * Nelements); + const dlong cubatureOffset = alignStride(cubNp * Nelements); const int NVfields = 3; @@ -147,6 +147,7 @@ occa::kernel benchmarkAdvsub(int Nfields, std::vector kernelVariants = {0}; if (!platform->serial && dealias) { + if (!isScalar) { std::vector kernelSearchSpace = {6, 7, 8, 9, 16}; @@ -197,28 +198,75 @@ occa::kernel benchmarkAdvsub(int Nfields, const auto wordSize = sizeof(dfloat); - auto invLMM = randomVector(fieldOffset * nEXT, 0, 1, true); - auto cubD = randomVector(cubNq * cubNq, 0, 1, true); - auto NU = randomVector(Nfields * fieldOffset, 0, 1, true); - auto conv = randomVector(NVfields * cubatureOffset * nEXT, 0, 1, true); - auto cubInterpT = randomVector(Nq * cubNq, 0, 1, true); - auto Ud = randomVector(Nfields * fieldOffset, 0, 1, true); - auto BdivW = randomVector(fieldOffset * nEXT, 0, 1, true); +#if 1 + auto lambda = randomVector(4, 1, 2, true); +#else + std::vector lambda(4, 1.0); +#endif + + auto generateField = [&](const int Nfields, const int Nq, const dlong offset, const dfloat lambda) + { + const dlong Np = Nq * Nq * Nq; + const dlong Nlocal = Nelements * Np; + std::vector out((Nfields > 1) ? 
Nfields * offset : Nlocal, 0.0); + + // convert to [-1, 1] + auto convertToRange = [&](int n) { + return 2.0 * (static_cast(n) / Nq) - 1; + }; + + for (int f = 0; f < Nfields; f++) { + for (int e = 0; e < Nelements; e++) { + for (int i = 0; i < Nq; i++) { + for (int j = 0; j < Nq; j++) { + for (int k = 0; k < Nq; k++) { + const auto x = convertToRange(i); + const auto y = convertToRange(j); + const auto z = convertToRange(k); + + const auto id = i * j * k + e * Np + f * offset; + out[id] = (f+1) * lambda * sin(M_PI * x + e) * sin(M_PI * y) * sin(M_PI * z); + } + } + } + } + } + return out; + }; - // elementList[e] = e std::vector elementList(Nelements); std::iota(elementList.begin(), elementList.end(), 0); auto o_elementList = platform->device.malloc(elementList.size() * sizeof(dlong), elementList.data()); - auto o_invLMM = platform->device.malloc(invLMM.size() * wordSize, invLMM.data()); +#if 1 + auto cubD = randomVector(cubNq * cubNq, 1, 2, true); +#else + std::vector cubD(cubNq * cubNq, 1.0); +#endif auto o_cubD = platform->device.malloc(cubD.size() * wordSize, cubD.data()); - auto o_NU = platform->device.malloc(NU.size() * wordSize, NU.data()); - auto o_conv = platform->device.malloc(conv.size() * wordSize, conv.data()); + +#if 1 + auto cubInterpT = randomVector(Nq * cubNq, 1, 2, true); +#else + std::vector cubInterpT(Nq * cubNq, 1.0); +#endif + auto o_cubInterpT = platform->device.malloc(cubInterpT.size() * wordSize, cubInterpT.data()); - auto o_Ud = platform->device.malloc(Ud.size() * wordSize, Ud.data()); + + auto conv = generateField(NVfields * nEXT, cubNq, cubatureOffset, lambda[0]); + auto o_conv = platform->device.malloc(conv.size() * wordSize, conv.data()); + + auto invLMM = generateField(nEXT, Nq, fieldOffset, lambda[1]); + auto o_invLMM = platform->device.malloc(invLMM.size() * wordSize, invLMM.data()); + + auto BdivW = generateField(nEXT, Nq, fieldOffset, lambda[2]); auto o_BdivW = platform->device.malloc(BdivW.size() * wordSize, BdivW.data()); - // popular cubD, cubInterpT with correct data + auto Ud = generateField(Nfields, Nq, fieldOffset, lambda[3]); + auto o_Ud = platform->device.malloc(Ud.size() * wordSize, Ud.data()); + + std::vector NU(Nfields * fieldOffset, 0); + auto o_NU = platform->device.malloc(NU.size() * wordSize, NU.data()); auto buildKernel2 = [&props, &oklpath](const std::string& _fileName) { @@ -241,9 +289,9 @@ occa::kernel benchmarkAdvsub(int Nfields, } auto kernelRunner = [&](occa::kernel &subcyclingKernel) { - const auto c0 = 0.1; - const auto c1 = 0.2; - const auto c2 = 0.3; + const dfloat c0 = 0.1; + const dfloat c1 = 0.2; + const dfloat c2 = 0.3; if (!dealias) { subcyclingKernel(Nelements, @@ -282,8 +330,8 @@ occa::kernel benchmarkAdvsub(int Nfields, auto kernel = buildKernel(kernelVariant); if (!kernel.isInitialized()) return occa::kernel(); - auto o_NUref = platform->device.malloc(o_NU.size()); kernelRunner(referenceKernel); + auto o_NUref = platform->device.malloc(o_NU.size()); o_NU.copyTo(o_NUref); kernelRunner(kernel); @@ -295,18 +343,24 @@ occa::kernel benchmarkAdvsub(int Nfields, o_NUref.copyTo(referenceResults.data(), referenceResults.size() * wordSize, i*fieldOffset * wordSize); o_NU.copyTo(results.data(), results.size() * wordSize, i*fieldOffset * wordSize); - const auto absTol = 1e-2; + const auto absTol = 1.0; const auto err = maxRelErr(referenceResults, results, platform->comm.mpiComm, absTol); const auto scale = 100 * range(referenceResults, absTol); + const auto eps = scale * std::numeric_limits::epsilon(); - if (err > scale * 
std::numeric_limits::epsilon() || std::isnan(err)) { + if (err > eps || std::isnan(err)) { if (platform->comm.mpiRank == 0 && verbosity > 1) { - std::cout << "advSub: Ignore version " << kernelVariant << " as correctness check failed with " << err + std::cout << "advSub: Ignore version " << kernelVariant << " as correctness check failed with err=" << err << std::endl; } // pass un-initialized kernel to skip this kernel variant return occa::kernel(); + } else { + if (platform->comm.mpiRank == 0 && verbosity > 1) { + std::cout << "advSub: kernel version " << kernelVariant << " passed correctness check with err=" << err + << std::endl; + } } } diff --git a/src/bench/advsub/main.cpp b/src/bench/advsub/main.cpp index 0ff904c6c..920d4ee94 100644 --- a/src/bench/advsub/main.cpp +++ b/src/bench/advsub/main.cpp @@ -148,15 +148,8 @@ int main(int argc, char **argv) Np = Nq * Nq * Nq; const int cubNq = cubN + 1; cubNp = cubNq * cubNq * cubNq; - fieldOffset = Np * Nelements; - const int pageW = ALIGN_SIZE / sizeof(dfloat); - if (fieldOffset % pageW) { - fieldOffset = (fieldOffset / pageW + 1) * pageW; - } - cubatureOffset = std::max(fieldOffset, Nelements * cubNp); - if (cubatureOffset % pageW) { - cubatureOffset = (cubatureOffset / pageW + 1) * pageW; - } + fieldOffset = alignStride(Np * Nelements); + cubatureOffset = alignStride(std::max(fieldOffset, Nelements * cubNp)); platform = platform_t::getInstance(options, MPI_COMM_WORLD, MPI_COMM_WORLD); platform->options.setArgs("BUILD ONLY", "FALSE"); diff --git a/src/bench/axHelm/benchmark.cpp b/src/bench/axHelm/benchmark.cpp index d40056577..19d1cca9b 100644 --- a/src/bench/axHelm/benchmark.cpp +++ b/src/bench/axHelm/benchmark.cpp @@ -174,7 +174,7 @@ occa::kernel benchmarkAx(int Nelements, props["defines/pts_per_thread"] = Nq/n_plane; } if (kernelName == "ellipticBlockPartialAxCoeffHex3D") { - const int Nkernels = 3; + const int Nkernels = 5; for (int knl = 0; knl < Nkernels; ++knl) kernelVariants.push_back(knl); diff --git a/src/bench/core/kernelBenchmarker.cpp b/src/bench/core/kernelBenchmarker.cpp index 82e2732b5..810db4db8 100644 --- a/src/bench/core/kernelBenchmarker.cpp +++ b/src/bench/core/kernelBenchmarker.cpp @@ -4,7 +4,7 @@ namespace { constexpr int Nbaseline{100}; -constexpr int Nwarmup{10}; +constexpr int Nwarmup{500}; double run(int Nsamples, std::function kernelRunner, occa::kernel &kernel) { diff --git a/src/core/LVector.cpp b/src/core/LVector.cpp index 1eb5607e0..7ac1013fa 100644 --- a/src/core/LVector.cpp +++ b/src/core/LVector.cpp @@ -44,7 +44,7 @@ class LVectorMappingManager_t LVectorMapping_t setup(mesh_t *mesh) { - auto o_Lids = platform->o_memPool.reserve(mesh->Nlocal); + auto o_Lids = platform->deviceMemoryPool.reserve(mesh->Nlocal); std::vector Eids(mesh->Nlocal); std::iota(Eids.begin(), Eids.end(), 0); o_Lids.copyFrom(Eids.data(), mesh->Nlocal); diff --git a/src/core/avm.cpp b/src/core/avm.cpp index 19cc87a88..db94b966c 100644 --- a/src/core/avm.cpp +++ b/src/core/avm.cpp @@ -36,23 +36,23 @@ namespace occa::memory modeInfoKlocknerHex3D(int _N) { - const int _Np = (_N+1)*(_N+1)*(_N+1); - const int _Nmodes1D = (_N+1); - std::vector _modeMap(_Np); - - int sk = 0, n=0; - for(int id=0; id<_Nmodes1D;id++){ - for(int j=0; j<_Nmodes1D; j++){ - for(int i=0; i<_Nmodes1D; i++){ - for(int k=0; k<_Nmodes1D; k++){ - if(std::max(std::max(i,j),k) == id){ + const int _Np = (_N + 1) * (_N + 1) * (_N + 1); + const int _Nmodes1D = (_N + 1); + std::vector _modeMap(_Np); + + int sk = 0, n = 0; + for (int id = 0; id < _Nmodes1D; id++) { + for (int j = 
0; j < _Nmodes1D; j++) { + for (int i = 0; i < _Nmodes1D; i++) { + for (int k = 0; k < _Nmodes1D; k++) { + if (std::max(std::max(i, j), k) == id) { _modeMap[n++] = sk; } - sk++; - } + sk++; } } - sk=0; + } + sk = 0; } auto o_modeMap = platform->device.malloc(_modeMap.size()); @@ -62,15 +62,15 @@ occa::memory modeInfoKlocknerHex3D(int _N) occa::memory leastSquaresFitKlockner(int _N) { - std::vector tmp(2*_N); - for(int n=0; n<_N; n++){ - tmp[2*n + 0] = std::log10(n+1); - tmp[2*n + 1] = 1.0; + std::vector tmp(2 * _N); + for (int n = 0; n < _N; n++) { + tmp[2 * n + 0] = std::log10(n + 1); + tmp[2 * n + 1] = 1.0; } - - std::vector _LSF( _N); + + std::vector _LSF(_N); auto invTmp = platform->linAlg->matrixPseudoInverse(2, tmp); - for(int n=0; n<_N; n++){ + for (int n = 0; n < _N; n++) { _LSF[n] = invTmp[n]; } @@ -82,15 +82,15 @@ occa::memory leastSquaresFitKlockner(int _N) occa::memory baseLineDecayKlockner(int _N) { dfloat bsum = 0.0; - for(int j=1; j<_N+1; j++) { - bsum +=1.0/std::pow(j, 2*_N); + for (int j = 1; j < _N + 1; j++) { + bsum += 1.0 / std::pow(j, 2 * _N); } - bsum = 1.0/std::sqrt(bsum); + bsum = 1.0 / std::sqrt(bsum); - std::vector _BLD(_N+1, 0.0); - for(int n=1; n<_N+1; n++){ - const dfloat bdecay = bsum/std::pow(n,_N); - _BLD[n] = bdecay*bdecay; + std::vector _BLD(_N + 1, 0.0); + for (int n = 1; n < _N + 1; n++) { + const dfloat bdecay = bsum / std::pow(n, _N); + _BLD[n] = bdecay * bdecay; } auto o_BLD = platform->device.malloc(_BLD.size()); @@ -98,7 +98,7 @@ occa::memory baseLineDecayKlockner(int _N) return o_BLD; } -} +} // namespace void setup(mesh_t *mesh_, oogs_t *gsh_) { @@ -113,9 +113,8 @@ void setup(mesh_t *mesh_, oogs_t *gsh_) o_modeMap = modeInfoKlocknerHex3D(mesh->N); o_leastSquares1D = leastSquaresFitKlockner(mesh->N); o_baseLineDecay = baseLineDecayKlockner(mesh->N); - o_invVT = [&] () - { - std::vector V(mesh->Nq * mesh->Nq); + o_invVT = [&]() { + std::vector V(mesh->Nq * mesh->Nq); Vandermonde1D(mesh->N, mesh->Nq, mesh->r, V.data()); auto invV = platform->linAlg->matrixInverse(mesh->Nq, V); auto invVT = platform->linAlg->matrixTranspose(mesh->Nq, invV); @@ -140,21 +139,34 @@ void setup(mesh_t *mesh_, oogs_t *gsh_) modesKernel = platform->kernelRequests.load(kernelName); } -occa::memory viscosity(dlong UFieldOffset, const occa::memory& o_U, const occa::memory& o_S, - dfloat absTol, dfloat scalingCoeff, dfloat logS0, dfloat kappa, bool makeCont) +occa::memory viscosity(dlong UFieldOffset, + const occa::memory &o_U, + const occa::memory &o_S, + dfloat absTol, + dfloat scalingCoeff, + dfloat logS0, + dfloat kappa, + bool makeCont) { - auto o_nu = platform->o_memPool.reserve(mesh->Nlocal); + auto o_nu = platform->deviceMemoryPool.reserve(mesh->Nlocal); viscosity(UFieldOffset, o_U, o_S, o_nu, absTol, scalingCoeff, logS0, kappa, makeCont); return o_nu; } -void viscosity(dlong UFieldOffset, const occa::memory& o_U, const occa::memory& o_S, occa::memory& o_nu, - dfloat absTol, dfloat scalingCoeff, dfloat logS0, dfloat kappa, bool C0) +void viscosity(dlong UFieldOffset, + const occa::memory &o_U, + const occa::memory &o_S, + occa::memory &o_nu, + dfloat absTol, + dfloat scalingCoeff, + dfloat logS0, + dfloat kappa, + bool C0) { - occa::memory o_logSk = platform->o_memPool.reserve(mesh->Nelements); - occa::memory o_Shat = platform->o_memPool.reserve(mesh->Nlocal); + occa::memory o_logSk = platform->deviceMemoryPool.reserve(mesh->Nelements); + occa::memory o_Shat = platform->deviceMemoryPool.reserve(mesh->Nlocal); - modesKernel(mesh->Nelements, o_invVT, o_S, o_Shat); + 
modesKernel(mesh->Nelements, o_invVT, o_S, o_Shat); relativeMassAveragedModeKernel(mesh->Nelements, absTol, o_modeMap, @@ -177,8 +189,7 @@ void viscosity(dlong UFieldOffset, const occa::memory& o_U, const occa::memory& mesh->o_z, o_U, o_logSk, - o_nu - ); + o_nu); if (C0) { oogs::startFinish(o_nu, 1, 0, ogsDfloat, ogsMax, gsh); diff --git a/src/core/comm.cpp b/src/core/comm.cpp index ec0158590..a9460e459 100644 --- a/src/core/comm.cpp +++ b/src/core/comm.cpp @@ -24,18 +24,20 @@ comm_t::comm_t(MPI_Comm _commg, MPI_Comm _comm) } } -MPI_Datatype comm_t::toMPI_Datatype(comm_t::type t) const +MPI_Datatype comm_t::toMPI_Datatype(const occa::memory& t) const { - switch (t) { - case comm_t::type::dfloat: - return MPI_DFLOAT; - case comm_t::type::dlong: - return MPI_DLONG; - case comm_t::type::hlong: - return MPI_HLONG; - default: + const auto type = t.dtype(); + + if (type == occa::dtype::get()) + return MPI_DOUBLE; + else if (type == occa::dtype::get()) + return MPI_FLOAT; + else if (type == occa::dtype::get()) + return MPI_INT; + else if (type == occa::dtype::get()) + return MPI_LONG_LONG_INT; + else nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "Unkown datatype!"); - } return 0; } @@ -76,58 +78,12 @@ void comm_t::reallocScratch(size_t Nbytes) const } }; -int comm_t::allreduce(const void *sendbuf, - void *recvbuf, - int count, - comm_t::type datatype, - comm_t::op op, - MPI_Comm comm) const -{ - auto mpiDataType = toMPI_Datatype(datatype); - auto mpiOp = toMPI_Op(op); - - return MPI_Allreduce(sendbuf, recvbuf, count, mpiDataType, mpiOp, comm); -} - -int comm_t::allreduce(occa::memory sendbuf, - occa::memory recvbuf, - int count, - comm_t::type datatype, - comm_t::op op, - MPI_Comm comm) const -{ - auto mpiDataType = toMPI_Datatype(datatype); - auto mpiOp = toMPI_Op(op); - - int sizeBytes; - MPI_Type_size(mpiDataType, &sizeBytes); - - const size_t Nbytes = sizeBytes * count; - - reallocScratch(Nbytes); - - if (useGPUAware || platform->serial) { - platform->device.finish(); - return MPI_Allreduce((void *)sendbuf.ptr(), (void *)recvbuf.ptr(), count, mpiDataType, mpiOp, comm); - } else { - int retVal = 0; - - sendbuf.copyTo(send, Nbytes); - retVal = MPI_Allreduce(send, recv, count, mpiDataType, mpiOp, comm); - recvbuf.copyFrom(recv, Nbytes); - - return retVal; - } -} - -// in place int comm_t::allreduce(occa::memory recvbuf, int count, - comm_t::type datatype, comm_t::op op, MPI_Comm comm) const { - auto mpiDataType = toMPI_Datatype(datatype); + auto mpiDataType = toMPI_Datatype(recvbuf); auto mpiOp = toMPI_Op(op); int sizeBytes; @@ -141,12 +97,9 @@ int comm_t::allreduce(occa::memory recvbuf, platform->device.finish(); return MPI_Allreduce(MPI_IN_PLACE, (void *)recvbuf.ptr(), count, mpiDataType, mpiOp, comm); } else { - int retVal = 0; - - recvbuf.copyTo(recv, Nbytes); - retVal = MPI_Allreduce(MPI_IN_PLACE, recv, count, mpiDataType, mpiOp, comm); - recvbuf.copyFrom(recv, Nbytes); - + recvbuf.copyTo(recv, count); + int retVal = MPI_Allreduce(MPI_IN_PLACE, recv, count, mpiDataType, mpiOp, comm); + recvbuf.copyFrom(recv, count); return retVal; } } diff --git a/src/core/comm.hpp b/src/core/comm.hpp index 2a7d46a4f..249702aa1 100644 --- a/src/core/comm.hpp +++ b/src/core/comm.hpp @@ -33,18 +33,11 @@ class comm_t{ return ss.str(); } - int allreduce(const void *sendbuf, void *recvbuf, int count, - type datatype, op op, MPI_Comm comm) const; - int allreduce(occa::memory sendbuf, occa::memory recvbuf, int count, - type datatype, op op, MPI_Comm comm) const; - - // in place - int allreduce(occa::memory 
recvbuf, int count, - type datatype, op op, MPI_Comm comm) const; + int allreduce(occa::memory recvbuf, int count, op op, MPI_Comm comm) const; private: - MPI_Datatype toMPI_Datatype(type t) const; + MPI_Datatype toMPI_Datatype(const occa::memory& t) const; MPI_Op toMPI_Op(op t) const; void reallocScratch(size_t Nbytes) const; diff --git a/src/core/device.cpp b/src/core/device.cpp index b0d18cb00..e63746240 100644 --- a/src/core/device.cpp +++ b/src/core/device.cpp @@ -81,7 +81,7 @@ bool atomicsAvailable(device_t &device, MPI_Comm comm) return atomicSupported; } -std::string extractKernelName(const std::string& fullPath) +std::string extractKernelName(const std::string &fullPath) { std::regex kernelNameRegex(R"((.+)\/(.+)\.)"); std::smatch kernelNameMatch; @@ -93,9 +93,9 @@ std::string extractKernelName(const std::string& fullPath) // 2: advectMeshVelocityHex3D.okl return (foundKernelName && kernelNameMatch.size() == 3) ? kernelNameMatch[2].str() : ""; -} +} -occa::properties adjustKernelProps(const std::string& fileName, const occa::properties& props_) +occa::properties adjustKernelProps(const std::string &fileName, const occa::properties &props_) { occa::properties props = props_; if (fileName.find(".okl") != std::string::npos) { @@ -114,18 +114,12 @@ occa::kernel device_t::wrapperCompileKernel(const std::string &fileName, const occa::properties &props_, const std::string &suffix) const { - if(!_compilationEnabled) { - nekrsAbort(MPI_COMM_SELF, - EXIT_FAILURE, - "%s", - "illegal call detected after 'finish' declaration\n"); + if (!_compilationEnabled) { + nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "%s", "illegal call detected after 'finish' declaration\n"); } - if(fileName.empty()) { - nekrsAbort(MPI_COMM_SELF, - EXIT_FAILURE, - "%s", - "Empty fileName\n"); + if (fileName.empty()) { + nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "%s", "Empty fileName\n"); } auto props = props_; @@ -168,7 +162,7 @@ occa::kernel device_t::wrapperLoadKernel(const std::string &fileName, "Cannot load kernel <%s>\n", kernelName.c_str()); -#if 0 +#if 0 // restore if (platform->cacheBcast) { occa::env::OCCA_CACHE_DIR = cacheDir0; @@ -184,8 +178,7 @@ occa::kernel device_t::compileKernel(const std::string &fileName, const std::string &suffix, const MPI_Comm &commIn) const { - const auto collective = [&commIn]() - { + const auto collective = [&commIn]() { int tmp; MPI_Comm_compare(commIn, MPI_COMM_SELF, &tmp); return (tmp == MPI_UNEQUAL) ? true : false; @@ -193,20 +186,23 @@ occa::kernel device_t::compileKernel(const std::string &fileName, MPI_Comm comm = commIn; if (collective) { - if (platform->cacheLocal) comm = _comm.mpiCommLocal; - if (platform->cacheBcast) comm = _comm.mpiComm; + if (platform->cacheLocal) { + comm = _comm.mpiCommLocal; + } + if (platform->cacheBcast) { + comm = _comm.mpiComm; + } } - const auto buildRank = [&comm]() - { + const auto buildRank = [&comm]() { int rank; MPI_Comm_rank(comm, &rank); - return (rank == 0) ? true : false; + return (rank == 0) ? 
true : false; }(); occa::kernel knl; - if (buildRank) { - knl = this->wrapperCompileKernel(fileName, props, suffix); + if (buildRank) { + knl = this->wrapperCompileKernel(fileName, props, suffix); } MPI_Barrier(comm); // finish compilation @@ -222,7 +218,7 @@ occa::kernel device_t::compileKernel(const std::string &fileName, } occa::kernel device_t::loadKernel(const std::string &fileName, - const std::string &kernelName, + const std::string &kernelName, const occa::properties &props, const std::string &suffix) const { @@ -240,7 +236,6 @@ occa::kernel device_t::loadKernel(const std::string &fileName, return this->loadKernel(fileName, extractKernelName(fileName), props, suffix); } - occa::memory device_t::mallocHost(size_t Nbytes) { occa::properties props; @@ -266,7 +261,9 @@ occa::memory device_t::malloc(size_t Nbytes, const occa::properties &properties) occa::memory device_t::malloc(size_t Nbytes, const void *src, const occa::properties &properties) { auto props = properties; - if (platform->serial) props["use_host_pointer"] = true; + if (platform->serial) { + props["use_host_pointer"] = true; + } occa::memory o_returnValue = _device.malloc(Nbytes, src, props); return o_returnValue; @@ -275,7 +272,9 @@ occa::memory device_t::malloc(size_t Nbytes, const void *src, const occa::proper occa::memory device_t::malloc(size_t Nword, size_t wordSize, const occa::memory &src) { occa::properties props; - if (platform->serial) props["use_host_pointer"] = true; + if (platform->serial) { + props["use_host_pointer"] = true; + } occa::memory o_returnValue = _device.malloc(Nword * wordSize, src, props); return o_returnValue; @@ -293,7 +292,8 @@ occa::memory device_t::malloc(size_t Nword, size_t wordSize) device_t::device_t(setupAide &options, comm_t &comm) : _comm(comm) { // OCCA build stuff - char deviceConfig[4096]; + int deviceConfigSize = 4096; + char deviceConfig[deviceConfigSize]; int worldRank = _comm.mpiRank; int device_id = 0; @@ -312,23 +312,31 @@ device_t::device_t(setupAide &options, comm_t &comm) : _comm(comm) if (!getenv("CUDA_CACHE_DISABLE")) { setenv("CUDA_CACHE_DISABLE", "1", 1); } - sprintf(deviceConfig, "{mode: 'CUDA', device_id: %d}", device_id); + snprintf(deviceConfig, deviceConfigSize, "{mode: 'CUDA', device_id: %d}", device_id); } else if (strcasecmp(requestedOccaMode.c_str(), "HIP") == 0) { - sprintf(deviceConfig, "{mode: 'HIP', device_id: %d}", device_id); + snprintf(deviceConfig, deviceConfigSize, "{mode: 'HIP', device_id: %d}", device_id); } else if (strcasecmp(requestedOccaMode.c_str(), "DPCPP") == 0) { int plat = 0; options.getArgs("PLATFORM NUMBER", plat); - sprintf(deviceConfig, "{mode: 'dpcpp', device_id: %d, platform_id: %d}", device_id, plat); + snprintf(deviceConfig, + deviceConfigSize, + "{mode: 'dpcpp', device_id: %d, platform_id: %d}", + device_id, + plat); } else if (strcasecmp(requestedOccaMode.c_str(), "OPENCL") == 0) { int plat = 0; options.getArgs("PLATFORM NUMBER", plat); - sprintf(deviceConfig, "{mode: 'OpenCL', device_id: %d, platform_id: %d}", device_id, plat); + snprintf(deviceConfig, + deviceConfigSize, + "{mode: 'OpenCL', device_id: %d, platform_id: %d}", + device_id, + plat); } else if (strcasecmp(requestedOccaMode.c_str(), "OPENMP") == 0) { nekrsCheck(true, _comm.mpiComm, EXIT_FAILURE, "%s\n", "OpenMP backend currently not supported!"); - sprintf(deviceConfig, "{mode: 'OpenMP'}"); + snprintf(deviceConfig, deviceConfigSize, "{mode: 'OpenMP'}"); } else if (strcasecmp(requestedOccaMode.c_str(), "CPU") == 0 || strcasecmp(requestedOccaMode.c_str(), "SERIAL") 
== 0) { - sprintf(deviceConfig, "{mode: 'Serial'}"); + snprintf(deviceConfig, deviceConfigSize, "{mode: 'Serial'}"); options.setArgs("THREAD MODEL", "SERIAL"); options.getArgs("THREAD MODEL", requestedOccaMode); } else { @@ -349,7 +357,7 @@ device_t::device_t(setupAide &options, comm_t &comm) : _comm(comm) if (worldRank == 0) { printf("Initializing device \n"); } - this->_device.setup((std::string)deviceConfig); + this->_device.setup(static_cast(deviceConfig)); if (worldRank == 0) { std::cout << "active occa mode: " << this->mode() << "\n\n"; @@ -381,18 +389,17 @@ device_t::device_t(setupAide &options, comm_t &comm) : _comm(comm) size_t device_t::memoryUsage() const { return platform->device.occaDevice().memoryAllocated(); -} +} void device_t::printMemoryUsage(MPI_Comm comm) const { - const auto maxMemSizes = [&]() - { + const auto maxMemSizes = [&]() { std::vector work; work.push_back(platform->device.occaDevice().maxMemoryAllocated()); - work.push_back(platform->o_memPool.size()); - work.push_back(platform->memPool.size()); + work.push_back(platform->deviceMemoryPool.size()); + work.push_back(platform->memoryPool.size()); - MPI_Allreduce(MPI_IN_PLACE, work.data(), work.size(), MPI_UINT64_T, MPI_MAX, comm); + MPI_Allreduce(MPI_IN_PLACE, work.data(), work.size(), MPI_UINT64_T, MPI_MAX, comm); return std::make_tuple(work[0], work[1], work[2]); }(); @@ -401,11 +408,12 @@ void device_t::printMemoryUsage(MPI_Comm comm) const if (rank == 0) { int width = 12; - std::cout << "occa max memory usage: " << std::setw(width) << std::right - << std::get<0>(maxMemSizes) << " bytes" << std::endl; - std::cout << " o_mempool: " << std::setw(width) << std::right - << std::get<1>(maxMemSizes) << " bytes" << std::endl; - std::cout << " mempool: " << std::setw(width) << std::right - << std::get<2>(maxMemSizes) << " bytes" << std::endl << std::flush; + std::cout << "occa max memory usage: " << std::setw(width) << std::right << std::get<0>(maxMemSizes) + << " bytes" << std::endl; + std::cout << " deviceMemoryPool: " << std::setw(width) << std::right << std::get<1>(maxMemSizes) + << " bytes" << std::endl; + std::cout << " mempool: " << std::setw(width) << std::right << std::get<2>(maxMemSizes) + << " bytes" << std::endl + << std::flush; } -} +} diff --git a/src/core/deviceMemory.hpp b/src/core/deviceMemory.hpp index 5bcba6f38..ce64de26d 100644 --- a/src/core/deviceMemory.hpp +++ b/src/core/deviceMemory.hpp @@ -2,11 +2,13 @@ #define deviceMemory_hpp_ #include "nekrsSys.hpp" -template -class DeviceMemoryAllocator { +template class DeviceMemoryAllocator +{ using size_type = std::size_t; - public: - static occa::memory malloc(size_type count) { + +public: + static occa::memory malloc(size_type count) + { if (std::is_same::value) { return platform->device.malloc(count); } else { @@ -15,177 +17,245 @@ class DeviceMemoryAllocator { } }; -template> -class deviceMemory { - using size_type = std::size_t; +template > class deviceMemory +{ + using size_type = std::size_t; - public: +public: deviceMemory() = default; - - explicit deviceMemory(size_type count) - : occa_memory_{Allocator::malloc(count)} {} - explicit deviceMemory(const std::vector& values) - : deviceMemory{values.size()} {copyFrom(values);} + explicit deviceMemory(size_type count) : occa_memory_{Allocator::malloc(count)} {} - explicit deviceMemory(const occa::memory& occa_memory) - : occa_memory_{occa_memory} { - if (occa_memory_.byte_size() && occa_memory_.dtype() != occa::dtype::get()) throw std::runtime_error("data type does not match"); + explicit 
deviceMemory(const std::vector &values) : deviceMemory{values.size()} + { + copyFrom(values); } - explicit deviceMemory(occa::memory&& occa_memory) noexcept - : occa_memory_{std::move(occa_memory)} {} + explicit deviceMemory(const occa::memory &occa_memory) : occa_memory_{occa_memory} + { + if (occa_memory_.byte_size() && occa_memory_.dtype() != occa::dtype::get()) { + throw std::runtime_error("data type does not match"); + } + } - deviceMemory(const deviceMemory& other) - : occa_memory_{other.occa_memory_} { - if (occa_memory_.byte_size() && occa_memory_.dtype() != occa::dtype::get()) throw std::runtime_error("data type does not match"); + explicit deviceMemory(occa::memory &&occa_memory) noexcept : occa_memory_{std::move(occa_memory)} {} + + deviceMemory(const deviceMemory &other) : occa_memory_{other.occa_memory_} + { + if (occa_memory_.byte_size() && occa_memory_.dtype() != occa::dtype::get()) { + throw std::runtime_error("data type does not match"); + } } - deviceMemory(deviceMemory&& other) noexcept - : occa_memory_{std::move(other.occa_memory_)} {} + deviceMemory(deviceMemory &&other) noexcept : occa_memory_{std::move(other.occa_memory_)} {} - deviceMemory& operator=(const deviceMemory& rhs) { + deviceMemory &operator=(const deviceMemory &rhs) + { deviceMemory copy{rhs}; swap(copy); return *this; } - deviceMemory& operator=(deviceMemory&& rhs) noexcept { + deviceMemory &operator=(deviceMemory &&rhs) noexcept + { deviceMemory moved{std::move(rhs)}; swap(moved); return *this; } - + ~deviceMemory() = default; - operator const occa::memory&() const {return occa_memory_;} - operator occa::memory&() {return occa_memory_;} - operator occa::kernelArg() const {return occa_memory_;} + operator const occa::memory &() const + { + return occa_memory_; + } + + operator occa::memory &() + { + return occa_memory_; + } - void swap(deviceMemory& other) noexcept { + operator occa::kernelArg() const + { + return occa_memory_; + } + + void swap(deviceMemory &other) noexcept + { occa::memory tmp = occa_memory_; occa_memory_ = other.occa_memory_; other.occa_memory_ = tmp; } - size_type byte_size() const { return occa_memory_.byte_size();} + size_type byte_size() const + { + return occa_memory_.byte_size(); + } - size_type size() const { return occa_memory_.size();} + size_type size() const + { + return occa_memory_.size(); + } - size_type length() const { return occa_memory_.length();} + size_type length() const + { + return occa_memory_.length(); + } - bool isInitialized() const { return occa_memory_.isInitialized();} + bool isInitialized() const + { + return occa_memory_.isInitialized(); + } - void clear() {occa_memory_.free();} + void clear() + { + occa_memory_.free(); + } - void resize(size_type count) { + void resize(size_type count) + { if (count > size()) { - if (size()) clear(); + if (size()) { + clear(); + } occa_memory_ = Allocator::malloc(count); } } - T* ptr() { return static_cast(occa_memory_.ptr());} - - const T* ptr() const {return static_cast(occa_memory_.ptr());} + T *ptr() + { + return static_cast(occa_memory_.ptr()); + } + + const T *ptr() const + { + return static_cast(occa_memory_.ptr()); + } - deviceMemory slice(size_type offset, size_type count = 0) const { + deviceMemory slice(size_type offset, size_type count = 0) const + { if (count) { - return deviceMemory{occa_memory_.slice(offset,count)}; + return deviceMemory{occa_memory_.slice(offset, count)}; } else { return deviceMemory{occa_memory_.slice(offset)}; } } - deviceMemory operator + (size_type offset) const { return 
slice(offset);}; + deviceMemory operator+(size_type offset) const + { + return slice(offset); + }; - template - void copyFrom(const deviceMemory& src) { + template void copyFrom(const deviceMemory &src) + { occa_memory_.copyFrom(src.occa_memory_); } - template - void copyFrom(const deviceMemory& src, size_type count, - size_type dest_offset = 0, size_type src_offset = 0) { + template + void copyFrom(const deviceMemory &src, + size_type count, + size_type dest_offset = 0, + size_type src_offset = 0) + { occa_memory_.copyFrom(src.occa_memory_, count, dest_offset, src_offset); } - - void copyFrom(const std::vector& src) { + + void copyFrom(const std::vector &src) + { occa_memory_.copyFrom(src.data()); } - void copyFrom(const void *src, size_type count, size_type dest_offset = 0) { + void copyFrom(const void *src, size_type count, size_type dest_offset = 0) + { occa_memory_.copyFrom(src, count, dest_offset); } - void copyFrom(const std::vector& src, size_type count, size_type dest_offset = 0) { + void copyFrom(const std::vector &src, size_type count, size_type dest_offset = 0) + { occa_memory_.copyFrom(src.data(), count, dest_offset); } - void copyFrom(const occa::memory& src) { + void copyFrom(const occa::memory &src) + { occa_memory_.copyFrom(src); } - void copyFrom(const occa::memory& src, size_type count, size_type dest_offset = 0) { + void copyFrom(const occa::memory &src, size_type count, size_type dest_offset = 0) + { occa_memory_.copyFrom(src, count, dest_offset); } - template - void copyTo(deviceMemory& dest) const { + template void copyTo(deviceMemory &dest) const + { occa_memory_.copyTo(dest.occa_memory_); } - template - void copyTo(deviceMemory& dest, size_type count, - size_type dest_offset = 0, size_type src_offset = 0) const { + template + void + copyTo(deviceMemory &dest, size_type count, size_type dest_offset = 0, size_type src_offset = 0) const + { occa_memory_.copyTo(dest.occa_memory_, count, dest_offset, src_offset); } - void copyTo(std::vector& dest) const { + void copyTo(std::vector &dest) const + { occa_memory_.copyTo(dest.data()); } - void copyTo(std::vector& dest, size_type count, size_type src_offset = 0) const { + void copyTo(std::vector &dest, size_type count, size_type src_offset = 0) const + { occa_memory_.copyTo(dest.data(), count, src_offset); } - void copyTo(occa::memory& dest) const { + void copyTo(occa::memory &dest) const + { occa_memory_.copyTo(dest); } - void copyTo(occa::memory& dest, size_type count, size_type src_offset = 0) const { + void copyTo(occa::memory &dest, size_type count, size_type src_offset = 0) const + { occa_memory_.copyTo(dest, count, src_offset); } - private: +private: occa::memory occa_memory_; }; -template -class DeviceMemoryPoolAllocator { +template class DeviceMemoryPoolAllocator +{ using size_type = std::size_t; - public: - static occa::memory malloc(size_type count) { + +public: + static occa::memory malloc(size_type count) + { if (std::is_same::value) { - return platform->o_memPool.reserve(count); + return platform->deviceMemoryPool.reserve(count); } else { - return platform->o_memPool.reserve(count); + return platform->deviceMemoryPool.reserve(count); } } }; -template> -class poolDeviceMemory : public deviceMemory { - using size_type = std::size_t; +template > +class poolDeviceMemory : public deviceMemory +{ + using size_type = std::size_t; - public: +public: // Inherit base class constructors using deviceMemory::deviceMemory; - operator const deviceMemory&() const {return deviceMemory(occa_memory_);} - operator deviceMemory&() 
{return deviceMemory(occa_memory_);} + operator const deviceMemory &() const + { + return deviceMemory(occa_memory_); + } + + operator deviceMemory &() + { + return deviceMemory(occa_memory_); + } - private: +private: occa::memory occa_memory_; }; diff --git a/src/core/io/iofld.hpp b/src/core/io/iofld.hpp index a5cfab78c..932922baf 100644 --- a/src/core/io/iofld.hpp +++ b/src/core/io/iofld.hpp @@ -61,6 +61,8 @@ class iofld bool uniform = false; int precision = 32; bool outputMesh = false; + bool redistribute = true; + bool pointInterpolation = false; void writeAttribute(const std::string& key_, const std::string& val) { @@ -77,15 +79,30 @@ class iofld precision = stoi(val); nekrsCheck(precision != 64 && precision != 32, MPI_COMM_SELF, EXIT_FAILURE, "invalid precision value %d\n", precision); - } else if (key == "uniform" || key == "equidistant") { + } else if (key.find("uniform") == 0 || key.find("equidistant") == 0) { uniform = (val == "true") ? true : false; } else if (key == "outputmesh") { outputMesh = (val == "true") ? true : false; + } else if (key == "redistribute") { + redistribute = (val == "true") ? true : false; } else { nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "invalid attribute %s\n", key_.c_str()); } }; + void readAttribute(const std::string& key_, const std::string& val) + { + nekrsCheck(!initialized, MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "illegal to call prior to iofld::open()!"); + + std::string key = key_; + lowerCase(key); + + if (key == "interpolate") { + pointInterpolation = (val == "true") ? true : false; + if (pointInterpolation) redistribute = false; + } + } + std::vector elementMask; void writeElementFilter(const std::vector& elementMask_) @@ -222,10 +239,12 @@ class iofld std::string orderSuffix; if (p > mesh->N) { kernelName = "coarsenHex3D"; + if (engineMode == iofld::mode::write) kernelName = "prolongateHex3D"; orderSuffix = std::string("_Nf_") + std::to_string(p) + std::string("_Nc_") + std::to_string(mesh->N); } else { kernelName = "prolongateHex3D"; - orderSuffix = std::string("_Nf_") + std::to_string(p) + std::string("_Nc_") + std::to_string(mesh->N); + if (engineMode == iofld::mode::write) kernelName = "coarsenHex3D"; + orderSuffix = std::string("_Nf_") + std::to_string(mesh->N) + std::string("_Nc_") + std::to_string(p); } return platform->kernelRequests.load("mesh-" + kernelName + orderSuffix); }; diff --git a/src/core/io/iofldAdios.cpp b/src/core/io/iofldAdios.cpp index c53c3c0c9..1f377d56a 100644 --- a/src/core/io/iofldAdios.cpp +++ b/src/core/io/iofldAdios.cpp @@ -2,7 +2,8 @@ #include "iofldAdios.hpp" -static bool isLittleEndian() { +static bool isLittleEndian() +{ const uint32_t value = 0x01020304; uint8_t bytes[4]; @@ -36,7 +37,7 @@ void iofldAdios::openEngine() adiosEngine = adiosIO.Open(fileNameBase, adios2::Mode::Write); } else { if (platform->comm.mpiRank == 0) { - std::cout << "reading checkpoint ..." << std::endl; + std::cout << "reading checkpoint ..." << std::endl; std::cout << " fileName: " << fileNameBase << std::endl << std::flush; } adiosEngine = adiosIO.Open(fileNameBase, adios2::Mode::ReadRandomAccess); @@ -58,7 +59,8 @@ void iofldAdios::openEngine() } for (const auto &entry : adiosIO.AvailableVariables(true)) { - _availableVariables.push_back(entry.first); + std::string name = entry.first; + _availableVariables.push_back(name); } } } @@ -70,7 +72,7 @@ std::string iofldAdios::vtkSchema() - + @@ -79,12 +81,12 @@ std::string iofldAdios::vtkSchema() )"; - std::string endianTag = isLittleEndian() ? 
"LittleEndian" : "BigEndian"; + std::string endianTag = isLittleEndian() ? "LittleEndian" : "BigEndian"; std::string placeholder = "ENDIANTYPE"; auto pos = schema.find(placeholder); if (pos != std::string::npos) { - schema.replace(pos, placeholder.length(), endianTag); + schema.replace(pos, placeholder.length(), endianTag); } for (auto &entry : userFields) { @@ -156,7 +158,7 @@ void iofldAdios::putVariableConvert(const std::vector &o_fld, occa "%s\n", "field has be of type dfloat for mapping to a different N or a uniform mesh"); - auto o_tmp = platform->o_memPool.reserve(mesh_vis->Nlocal); + auto o_tmp = platform->deviceMemoryPool.reserve(mesh_vis->Nlocal); if (uniform) { mesh->interpolate(o_fld[dim_i], mesh_vis, o_tmp, true); } else { @@ -221,7 +223,7 @@ template size_t iofldAdios::write_() putVariable("numOfPoints", NumOfPoints, adiosMode); putVariable("polynomialOrder", mesh_vis->N, adiosMode); - auto o_globalElementsIds = platform->memPool.reserve(mesh_vis->Nelements); + auto o_globalElementsIds = platform->memoryPool.reserve(mesh_vis->Nelements); auto globalElementsIdsPtr = static_cast(o_globalElementsIds.ptr()); for (int e = 0; e < o_globalElementsIds.size(); e++) { globalElementsIdsPtr[e] = nek::localElementIdToGlobal(e); @@ -230,7 +232,7 @@ template size_t iofldAdios::write_() writtenBytes += o_globalElementsIds.size() * sizeof(uint64_t); auto o_connectivity = - platform->memPool.reserve(static_cast(NumOfCells) * (mesh_vis->Nverts + 1)); + platform->memoryPool.reserve(static_cast(NumOfCells) * (mesh_vis->Nverts + 1)); generateConnectivity(o_connectivity); putVariable("connectivity", o_connectivity, @@ -238,14 +240,14 @@ template size_t iofldAdios::write_() adiosMode); writtenBytes += o_connectivity.size() * sizeof(uint64_t); - auto o_coordVertices = platform->memPool.reserve(mesh_vis->dim * mesh_vis->Nlocal); + auto o_coordVertices = platform->memoryPool.reserve(mesh_vis->dim * mesh_vis->Nlocal); std::vector o_xyz; o_xyz.push_back(mesh->o_x); o_xyz.push_back(mesh->o_y); o_xyz.push_back(mesh->o_z); putVariableConvert(o_xyz, o_coordVertices); - putVariable("vertices", + putVariable("mesh", o_coordVertices, {static_cast(mesh_vis->Nlocal), static_cast(mesh_vis->dim)}, adiosMode); @@ -258,7 +260,7 @@ template size_t iofldAdios::write_() for (const auto &entry : std::get<1>(fld)) { fldSize += entry.size(); } - o_fldDataScratch.push_back(platform->memPool.reserve(fldSize)); + o_fldDataScratch.push_back(platform->memoryPool.reserve(fldSize)); } // after this point no memPool reservations are allowed to ensure pointers @@ -345,12 +347,11 @@ std::vector iofldAdios::redistributeField(const std::vectormemPool.reserve(maxRemoteSizes.first * winFieldOffset); + auto o_win = platform->memoryPool.reserve(maxRemoteSizes.first * winFieldOffset); #else auto o_win = platform->device.mallocHost(maxRemoteSizes.first * winFieldOffset); #endif - int typeSize; MPI_Datatype mpiType; if constexpr (std::is_same_v) { @@ -375,7 +376,7 @@ std::vector iofldAdios::redistributeField(const std::vector iofldAdios::redistributeField(const std::vector o_out; const auto o_outSize = maxRemoteSizes.first; for (int dim = 0; dim < o_outSize; dim++) { - o_out.push_back(platform->o_memPool.reserve(winFieldOffset)); + o_out.push_back(platform->deviceMemoryPool.reserve(mesh_vis->Nlocal)); o_out[dim].copyFrom(o_win.slice(dim * winFieldOffset)); } @@ -405,14 +406,14 @@ std::vector iofldAdios::getDataConvert(const std::string &name) return o_out; } - //nekrsCheck(var.type != adios2::GetType, MPI_COMM_SELF, EXIT_FAILURE, - // "ADIOS 
variable type does not match Tadios!\n"); + // nekrsCheck(var.type != adios2::GetType, MPI_COMM_SELF, EXIT_FAILURE, + // "ADIOS variable type does not match Tadios!\n"); const auto nDim = var.dim; const size_t Nlocal = in.size() / nDim; for (int dim = 0; dim < nDim; dim++) { - auto out = platform->memPool.reserve(Nlocal); + auto out = platform->memoryPool.reserve(Nlocal); // get latest pointer just in case a previous reserve has caused a resize auto inPtr = static_cast(in.ptr()); @@ -430,47 +431,157 @@ std::vector iofldAdios::getDataConvert(const std::string &name) template void iofldAdios::getData(const std::string &name, std::vector &o_userBuf) { - std::vector o_convDistributedData; - if (o_userBuf.at(0).dtype() == occa::dtype::get()) { - auto o_convData = getDataConvert(name); - o_convDistributedData = redistributeField(o_convData); - } - if (o_userBuf.at(0).dtype() == occa::dtype::get()) { - auto o_convData = getDataConvert(name); - o_convDistributedData = redistributeField(o_convData); - } - for (int dim = 0; dim < o_convDistributedData.size(); dim++) { - if (mesh_vis != mesh) { - mesh_vis->interpolate(o_convDistributedData[dim], mesh, o_userBuf[dim]); + auto o_convDistributedData = [&]() { + std::vector o_out; + + if (o_userBuf.at(0).dtype() == occa::dtype::get()) { + auto convData = getDataConvert(name); // on host + if (redistribute) { + o_out = redistributeField(convData); + } else { + for (int dim = 0; dim < o_userBuf.size(); dim++) { + auto Nlocal = (convData.size()) ? convData.at(dim).size() : 0; + o_out.push_back(platform->deviceMemoryPool.reserve(Nlocal)); + if (Nlocal) { + o_out.at(dim).copyFrom(convData.at(dim)); + } + } + } + } else if (o_userBuf.at(0).dtype() == occa::dtype::get()) { + auto convData = getDataConvert(name); + if (redistribute) { + o_out = redistributeField(convData); + } else { + for (int dim = 0; dim < o_userBuf.size(); dim++) { + auto Nlocal = (convData.size()) ? 
convData.at(dim).size() : 0; + o_out.push_back(platform->deviceMemoryPool.reserve(Nlocal)); + if (Nlocal) { + o_out.at(dim).copyFrom(convData.at(dim)); + } + } + } + } + + return o_out; + }(); + + auto convertToDfloat = [&]() { + std::vector o_work; + // type of o_userBuf might not be available (in case it's zero), + // instead use the type matching o_userBuf + if (o_userBuf.at(0).dtype() == occa::dtype::get()) { + o_work = o_convDistributedData; } else { - o_userBuf[dim].copyFrom(o_convDistributedData[dim]); + const auto Nlocal = o_convDistributedData.at(0).size(); + for (int dim = 0; dim < o_convDistributedData.size(); dim++) { + o_work.push_back(platform->deviceMemoryPool.reserve(Nlocal)); + + if (o_userBuf.at(0).dtype() == occa::dtype::get()) { + platform->copyDoubleToDfloatKernel(Nlocal, o_convDistributedData.at(dim), o_work.at(dim)); + nekrsCheck((std::is_same::value), + MPI_COMM_SELF, + EXIT_FAILURE, + "%s\n", + "cannot convert field of type double to float!"); + } else { + platform->copyFloatToDfloatKernel(Nlocal, o_convDistributedData.at(dim), o_work.at(dim)); + } + } + } + return o_work; + }; + + auto convertFromDfloat = [&](const occa::memory &o_tmp, occa::memory &o_buf) { + if (o_buf.dtype() == occa::dtype::get()) { + platform->copyDfloatToDoubleKernel(o_buf.size(), o_tmp, o_buf); + } else { + platform->copyDfloatToFloatKernel(o_buf.size(), o_tmp, o_buf); + } + }; + + if (pointInterpolation) { + auto o_work = convertToDfloat(); + if (name == "mesh") { + mesh_vis->Nelements = o_work.at(0).size() / mesh_vis->Np; + mesh_vis->Nlocal = mesh_vis->Nelements * mesh_vis->Np; + + mesh_vis->o_x = platform->device.malloc(mesh_vis->Nlocal); + mesh_vis->o_y = platform->device.malloc(mesh_vis->Nlocal); + mesh_vis->o_z = platform->device.malloc(mesh_vis->Nlocal); + mesh_vis->o_x.copyFrom(o_work.at(0)); + mesh_vis->o_y.copyFrom(o_work.at(1)); + mesh_vis->o_z.copyFrom(o_work.at(2)); + + interp = std::make_unique(mesh_vis, platform->comm.mpiComm); + interp->setPoints(mesh->o_x, mesh->o_y, mesh->o_z); + const auto verbosity = pointInterpolation_t::VerbosityLevel::Detailed; + interp->find(verbosity); + } else { + for (int dim = 0; dim < o_work.size(); dim++) { + auto o_tmp = platform->deviceMemoryPool.reserve(interp->numPoints()); + + dlong pointOffset = 0; + const int pointBlockSize = alignStride(128 * mesh->Np); + + int nPointsBlocks = (interp->numPoints() + pointBlockSize - 1) / pointBlockSize; + MPI_Allreduce(MPI_IN_PLACE, &nPointsBlocks, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + + for (int block = 0; block < nPointsBlocks; block++) { + const auto nPoints = std::max(std::min(interp->numPoints() - pointOffset, pointBlockSize), 0); + auto o_tmpBlock = (nPoints) ? 
o_tmp.slice(pointOffset, nPoints) : o_NULL; + interp->eval(1, 0, o_work.at(dim), 0, o_tmpBlock, nPoints, pointOffset); + pointOffset += pointBlockSize; + } + + convertFromDfloat(o_tmp, o_userBuf.at(dim)); + } + } + } else if (mesh_vis->N != mesh->N) { + auto o_work = convertToDfloat(); + for (int dim = 0; dim < o_work.size(); dim++) { + auto o_tmp = platform->deviceMemoryPool.reserve(o_userBuf.at(dim).size()); + mesh_vis->interpolate(o_work.at(dim), mesh, o_tmp); + convertFromDfloat(o_tmp, o_userBuf.at(dim)); + } + } else { + for (int dim = 0; dim < o_convDistributedData.size(); dim++) { + nekrsCheck(o_userBuf.at(dim).size() < o_convDistributedData.at(dim).size(), + MPI_COMM_SELF, + EXIT_FAILURE, + "user buffer for %s too small!\n", + name.c_str()); + + o_userBuf.at(dim).copyFrom(o_convDistributedData.at(dim)); } } } template void iofldAdios::getData(const std::string &name, variantType &variant) { -#define HANDLE_TYPE(TYPE, MPI_TYPE) \ - if (std::holds_alternative>(variant)) { \ - auto &value = std::get>(variant).get(); \ - if (platform->comm.mpiRank == 0) { \ - value = *(variables[name].data.ptr()); \ - } \ - MPI_Bcast(&value, 1, MPI_TYPE, 0, platform->comm.mpiComm); \ - } +#define HANDLE_TYPE(TYPE, MPI_TYPE) \ + if (std::holds_alternative>(variant)) { \ + auto &value = std::get>(variant).get(); \ + if (platform->comm.mpiRank == 0) { \ + value = *(variables[name].data.ptr()); \ + } \ + MPI_Bcast(&value, 1, MPI_TYPE, 0, platform->comm.mpiComm); \ + } - HANDLE_TYPE(int, MPI_INT) - HANDLE_TYPE(long long int, MPI_LONG_LONG_INT) - HANDLE_TYPE(float, MPI_FLOAT) - HANDLE_TYPE(double, MPI_DOUBLE) + HANDLE_TYPE(int, MPI_INT) + HANDLE_TYPE(long long int, MPI_LONG_LONG_INT) + HANDLE_TYPE(float, MPI_FLOAT) + HANDLE_TYPE(double, MPI_DOUBLE) #undef HANDLE_TYPE } template int iofldAdios::getVariable(bool allocateOnly, const std::string &name, size_t varStep) { auto adiosVariable = adiosIO.InquireVariable(name); - if (!static_cast(adiosVariable)) return 1; // variable not found + + if (!static_cast(adiosVariable)) { + return 1; // variable not found + } adiosVariable.SetStepSelection(adios2::Box(varStep, 1)); @@ -510,7 +621,9 @@ template int iofldAdios::getVariable(bool allocateOnly, const std::str } const auto &blocks = adiosEngine.BlocksInfo(adiosVariable, varStep); - if (blocks.size() == 0) return 1; // step not found + if (blocks.size() == 0) { + return 1; // step not found + } variable var; var.type = adios2::GetType(); @@ -538,7 +651,7 @@ template int iofldAdios::getVariable(bool allocateOnly, const std::str if (var.blocks.size() == 0) { var.data = o_NULL; } else if (var.dim == 0) { - var.data = platform->memPool.reserve(1); + var.data = platform->memoryPool.reserve(1); } else { const auto Nlocal = [&]() { size_t sum = 0; @@ -547,7 +660,7 @@ template int iofldAdios::getVariable(bool allocateOnly, const std::str } return sum; }(); - var.data = platform->memPool.reserve(Nlocal); + var.data = platform->memoryPool.reserve(Nlocal); } if (platform->verbose && var.blocks.size()) { @@ -580,38 +693,60 @@ size_t iofldAdios::read() return variables; }(); + auto isAvailable = [&] (const std::string& name, bool abort = false) + { + auto exists = std::find(_availableVariables.begin(), _availableVariables.end(), name) != _availableVariables.end(); + nekrsCheck(!exists && abort, + platform->comm.mpiComm, + EXIT_FAILURE, + "requested variable %s not found in file!\n", + name.c_str()); + return exists; + }; + // first allocate then get variable to ensure deferred pointer to memPool remains valid for (int pass = 0; 
pass < 2; pass++) { const auto allocateOnly = (pass == 0) ? true : false; getVariable(allocateOnly, "polynomialOrder", 0); + isAvailable("polynomialOrder", true); + getVariable(allocateOnly, "globalElementIds", 0); + isAvailable("globalElementIds", true); - for (const auto &name : userVariables) { + for (auto name : userVariables) { const auto &type = adiosIO.VariableType(name); int err = 0; auto typeFound = false; -#define HANDLE_TYPE(TYPE, STEP) \ - if (type == adios2::GetType()) { \ - err = getVariable(allocateOnly, name, STEP); \ - typeFound = true; \ - } +#define HANDLE_TYPE(TYPE, STEP) \ + if (type == adios2::GetType()) { \ + err = getVariable(allocateOnly, name, STEP); \ + typeFound = true; \ + } HANDLE_TYPE(double, step) HANDLE_TYPE(float, step) HANDLE_TYPE(int, step) HANDLE_TYPE(long long int, step) - - // fallback if mesh was not found + + // fallback to step 0 if mesh was not found in requested step if (err && name == "mesh") { HANDLE_TYPE(double, 0) HANDLE_TYPE(float, 0) } #undef HANDLE_TYPE - nekrsCheck(err, platform->comm.mpiComm, EXIT_FAILURE, - "requested variable %s not found in file!\n", name.c_str()); - nekrsCheck(!typeFound, platform->comm.mpiComm, EXIT_FAILURE, - "ADIOS variable %s has unsupported type %s!\n", name.c_str(), type.c_str()); + + nekrsCheck(err, + platform->comm.mpiComm, + EXIT_FAILURE, + "requested variable %s not found in file!\n", + name.c_str()); + nekrsCheck(!typeFound, + platform->comm.mpiComm, + EXIT_FAILURE, + "ADIOS variable %s has unsupported type %s!\n", + name.c_str(), + type.c_str()); } } @@ -622,10 +757,11 @@ size_t iofldAdios::read() const auto &name = entry.first; auto &variant = entry.second; - const auto available = std::find(_availableVariables.begin(), _availableVariables.end(), name) != - _availableVariables.end(); - nekrsCheck(!available, platform->comm.mpiComm, EXIT_FAILURE, - "requested variable %s not found in file!\n", name.c_str()); + nekrsCheck(!isAvailable(name), + platform->comm.mpiComm, + EXIT_FAILURE, + "requested variable %s not found in file!\n", + name.c_str()); const auto &adiosType = variables[name].type; @@ -643,30 +779,36 @@ size_t iofldAdios::read() mesh_vis = [&]() { variantType v = std::ref(N); getData("polynomialOrder", v); - if (N != mesh->N) { + if (N != mesh->N || pointInterpolation) { return genVisMesh(); } else { return mesh; } }(); - for (auto &o_entry : userFields) { - const auto &name = o_entry.first; - auto &o_userBuf = o_entry.second; + auto assignUserBuf = [&](bool meshRequested = false) { + for (auto &o_entry : userFields) { + const auto &name = o_entry.first; + auto &o_userBuf = o_entry.second; - if (std::find(_availableVariables.begin(), _availableVariables.end(), name) == - _availableVariables.end()) { - continue; - } + if (!isAvailable(name)) continue; - const auto &adiosType = variables[name].type; + if ((meshRequested && name != "mesh") || (!meshRequested && name == "mesh")) { + continue; + } - if (adiosType == adios2::GetType()) { - getData(name, o_userBuf); - } else if (adiosType == adios2::GetType()) { - getData(name, o_userBuf); + const auto &adiosType = variables[name].type; + + if (adiosType == adios2::GetType()) { + getData(name, o_userBuf); + } else if (adiosType == adios2::GetType()) { + getData(name, o_userBuf); + } } - } + }; + + assignUserBuf(true); + assignUserBuf(); return 0; } @@ -676,6 +818,12 @@ void iofldAdios::close() if (static_cast(adiosEngine)) { adiosEngine.Close(); } + + if (mesh_vis != mesh) { + mesh_vis->o_x.free(); + mesh_vis->o_y.free(); + mesh_vis->o_z.free(); + } } 
#endif diff --git a/src/core/io/iofldAdios.hpp b/src/core/io/iofldAdios.hpp index 6e041d473..2c21ff4d0 100644 --- a/src/core/io/iofldAdios.hpp +++ b/src/core/io/iofldAdios.hpp @@ -6,6 +6,7 @@ #include "iofld.hpp" #include "adios2.h" #include "nekInterfaceAdapter.hpp" +#include "pointInterpolation.hpp" class iofldAdios : public iofld { @@ -24,6 +25,8 @@ class iofldAdios : public iofld static constexpr int VTK_HEXAHEDRON = 12; static constexpr const char *configFile = "adios.yaml"; + std::unique_ptr interp; + std::string streamName; adios2::ADIOS *adios; diff --git a/src/core/io/iofldFactory.cpp b/src/core/io/iofldFactory.cpp index ad30aa421..a70e4a9c1 100644 --- a/src/core/io/iofldFactory.cpp +++ b/src/core/io/iofldFactory.cpp @@ -17,7 +17,7 @@ std::unique_ptr iofldFactory::create(const std::string& engineType_) nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "iofld engine adios not enabled!"); #endif } else { - nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "invlid iofld engine %s!\n", engineType.c_str()); + nekrsAbort(MPI_COMM_SELF, EXIT_FAILURE, "invalid iofld engine %s!\n", engineType.c_str()); } return nullptr; } diff --git a/src/core/io/iofldNek.cpp b/src/core/io/iofldNek.cpp index 31da358ed..d7ec23102 100644 --- a/src/core/io/iofldNek.cpp +++ b/src/core/io/iofldNek.cpp @@ -138,6 +138,8 @@ size_t iofldNek::write() size_t iofldNek::read() { + nekrsCheck(pointInterpolation, MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "read attribute interpolate not supported!"); + nek::readFld(fldData); if (auto time = inquireVariable("time")) { diff --git a/src/core/kernelRequestManager.cpp b/src/core/kernelRequestManager.cpp index 28a176e24..8c7f5a653 100644 --- a/src/core/kernelRequestManager.cpp +++ b/src/core/kernelRequestManager.cpp @@ -5,16 +5,19 @@ #include #include #include "sha1.hpp" +#include "threadPool.hpp" kernelRequestManager_t::kernelRequestManager_t(const platform_t &m_platform) : kernelsProcessed(false), platformRef(m_platform) { } -// add (autotuned) kernel for subsequent load -void kernelRequestManager_t::add(const std::string& requestName, occa::kernel kernel) +// add (autotuned) kernel for subsequent load +void kernelRequestManager_t::add(const std::string &requestName, occa::kernel kernel) { - if (!kernel.isInitialized()) return; + if (!kernel.isInitialized()) { + return; + } kernelRequest_t req(requestName, kernel.sourceFilename(), kernel.properties(), ""); req.kernel = kernel; @@ -34,8 +37,8 @@ void kernelRequestManager_t::add(kernelRequest_t request, bool checkUnique) { auto [iter, inserted] = requests.insert(request); - // checkUnique flag is typically set to false because we may add the same request - // (source file + properties) multiple times. + // checkUnique flag is typically set to false because we may add the same request + // (source file + properties) multiple times. if (checkUnique) { int unique = (inserted) ? 1 : 0; MPI_Allreduce(MPI_IN_PLACE, &unique, 1, MPI_INT, MPI_MIN, platformRef.comm.mpiComm); @@ -47,7 +50,6 @@ void kernelRequestManager_t::add(kernelRequest_t request, bool checkUnique) request.to_string().c_str()); } - // if the request already exists, it's important to verify that it is indeed the same, // as inadvertently overwriting the existing entry could occur otherwise. 
if (!inserted) { @@ -55,27 +57,30 @@ void kernelRequestManager_t::add(kernelRequest_t request, bool checkUnique) nekrsCheck(request.props.hash() != exisitingProps.hash(), platformRef.comm.mpiComm, EXIT_FAILURE, - "detected different kernel hash for same request\n%s", request.to_string().c_str()); + "detected different kernel hash for same request\n%s", + request.to_string().c_str()); auto exisitingFileName = (requestMap.find(request.requestName)->second).fileName; nekrsCheck(request.fileName != exisitingFileName, platformRef.comm.mpiComm, EXIT_FAILURE, - "detected different kernel hash for same request\n%s", request.to_string().c_str()); + "detected different kernel hash for same request\n%s", + request.to_string().c_str()); return; } requestMap.insert({request.requestName, request}); - } -occa::kernel kernelRequestManager_t::load(const std::string& requestName, const std::string& _kernelName) +occa::kernel kernelRequestManager_t::load(const std::string &requestName, const std::string &_kernelName) { auto errTxt = [&]() { const auto valid = processed() && (requestMap.find(requestName) != requestMap.end()); - if (valid) return std::string(); + if (valid) { + return std::string(); + } std::stringstream txt; txt << "\n"; @@ -88,31 +93,31 @@ occa::kernel kernelRequestManager_t::load(const std::string& requestName, const txt << "===========================================================\n"; auto retVal = txt.str(); - return retVal; + return retVal; }(); nekrsCheck(errTxt.size(), platformRef.comm.mpiComm, EXIT_FAILURE, "%s\n", errTxt.c_str()); - auto kernel = [&]() - { - const auto& req = requestMap.find(requestName)->second; + auto kernel = [&]() { + const auto &req = requestMap.find(requestName)->second; auto reqKnl = req.kernel; - if (reqKnl.isInitialized()) return reqKnl; // request is mapped to a already loaded kernel + if (reqKnl.isInitialized()) { + return reqKnl; // request is mapped to a already loaded kernel + } - const auto kernelName = [&]() - { + const auto kernelName = [&]() { if (_kernelName.empty()) { auto fullPath = req.fileName; std::regex kernelNameRegex(R"((.+)\/(.+)\.)"); std::smatch kernelNameMatch; const auto foundKernelName = std::regex_search(fullPath, kernelNameMatch, kernelNameRegex); - + // capture group // 0: /path/to/install/nekrs/kernels/cds/advectMeshVelocityHex3D.okl // 1: /path/to/install/nekrs/kernels/cds // 2: advectMeshVelocityHex3D.okl - + return (foundKernelName && kernelNameMatch.size() == 3) ? kernelNameMatch[2].str() : ""; } else { return _kernelName; @@ -122,7 +127,8 @@ occa::kernel kernelRequestManager_t::load(const std::string& requestName, const if (kernelMap.find({req, kernelName}) != kernelMap.end()) { return kernelMap[{req, kernelName}]; } else { - return kernelMap[{req, kernelName}] = platformRef.device.loadKernel(req.fileName, kernelName, req.props, req.suffix); + return kernelMap[{req, kernelName}] = + platformRef.device.loadKernel(req.fileName, kernelName, req.props, req.suffix); } }(); @@ -146,73 +152,88 @@ void kernelRequestManager_t::compile() const auto &device = platformRef.device; - constexpr int maxCompilingRanks{32}; // large enough to speed things up, small enough to control pressure on filesystem + constexpr int maxCompilingRanks{ + 32}; // large enough to speed things up, small enough to control pressure on filesystem const int rank = platform->cacheLocal ? platformRef.comm.localRank : platformRef.comm.mpiRank; - const int ranksCompiling = std::min( - maxCompilingRanks, - platform->cacheLocal ? 
platformRef.comm.mpiCommLocalSize : platformRef.comm.mpiCommSize - ); + const int ranksCompiling = + std::min(maxCompilingRanks, + platform->cacheLocal ? platformRef.comm.mpiCommLocalSize : platformRef.comm.mpiCommSize); + + auto Nthreads = 1; + if (getenv("NEKRS_JITC_NTHREADS")) { + Nthreads = std::stoi(getenv("NEKRS_JITC_NTHREADS")); + } if (platformRef.comm.mpiRank == 0 && (platform->verbose || platform->buildOnly)) { std::cout << "requests.size(): " << requests.size() << std::endl; + std::cout << "Nthreads: " << Nthreads << std::endl; } { std::map map; - for (auto&& req : requests) { + for (auto &&req : requests) { const auto fileName = (requestMap.find(req.requestName)->second).fileName; const auto props = (requestMap.find(req.requestName)->second).props; const auto hash = SHA1::from_string(fileName + props.hash().getFullString()); auto [iter, inserted] = map.insert({hash, req}); - const std::string txt = - "request collision between <" + req.requestName + "> and <" + (iter->second).requestName + ">!"; + const std::string txt = + "request collision between <" + req.requestName + "> and <" + (iter->second).requestName + ">!"; nekrsCheck(!inserted, platform->comm.mpiComm, EXIT_FAILURE, "%s\n", txt.c_str()); } } // compile requests (assumed to have a unique occa hash) on build ranks - constexpr int hashLength = 16 + 1; // null-terminated - auto hashes = (char*) std::calloc(requests.size() * hashLength, sizeof(char)); - - auto reqIdStart = std::numeric_limits::max(); - auto reqIdEnd = static_cast(1); - - if (rank < ranksCompiling) { - for (auto&& req : requests) { - const auto reqId = std::distance(requests.begin(), requests.find(req)); - if (reqId % ranksCompiling == rank) { - reqIdStart = std::min(reqIdStart, static_cast(reqId)); - reqIdEnd = std::max(reqIdEnd, static_cast(reqId)); - - if (platform->verbose || platform->buildOnly) { - std::cout << "Compiling request <" << req.requestName << ">"; - fflush(stdout); + constexpr int hashLength = 16 + 1; // null-terminated + auto hashes = (char *)std::calloc(requests.size() * hashLength, sizeof(char)); + + ThreadPool pool(Nthreads); + + if (rank < ranksCompiling) { + for (auto &&req : requests) { + auto retVal = pool.enqueue([&]() { + const auto reqId = std::distance(requests.begin(), requests.find(req)); + if (reqId % ranksCompiling != rank) { + return; } + try { + if (platform->verbose || platform->buildOnly) { + std::cout << "Compiling request <" << req.requestName << ">"; + } + + auto knl = device.compileKernel(req.fileName, req.props, req.suffix, MPI_COMM_SELF); + const auto hash = knl.hash().getString(); + nekrsCheck(hash.size() != hashLength - 1, MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "Invalid hash!"); + + std::strncpy(hashes + reqId * hashLength, hash.c_str(), hashLength); - auto knl = device.compileKernel(req.fileName, req.props, req.suffix, MPI_COMM_SELF); - const auto hash = knl.hash().getString(); - std::strncpy(hashes + reqId*hashLength, hash.c_str(), hashLength); - if (platform->verbose || platform->buildOnly) { - std::cout << " (" << hash << ") on rank " << rank << std::endl; + if (platform->verbose || platform->buildOnly) { + std::cout << " (" << hash << ") on rank " << rank << std::flush << std::endl; + } + + } catch (const std::exception &e) { + std::cerr << "Caught exception: " << e.what() << std::endl; } - } + }); } } - MPI_Barrier(platform->comm.mpiComm); // finish compilation + + // finish compilation + pool.finish(); + MPI_Barrier(platform->comm.mpiComm); // a-posteriori check for duplicated hash causing a potential race 
condition - // no parallel version available yet + // no parallel version available yet if (platform->comm.mpiCommSize == 1) { - const auto duplicateHashFound = [&]() - { + const auto duplicateHashFound = [&]() { if (platform->comm.mpiRank == 0) { std::unordered_set encounteredHashes; - for (const auto& req : requests) { + for (const auto &req : requests) { const auto reqId = distance(requests.begin(), requests.find(req)); char hash[hashLength]; - std::strncpy(hash, hashes + reqId*hashLength, hashLength); + std::strncpy(hash, hashes + reqId * hashLength, hashLength); if (!encounteredHashes.insert(hash).second) { - std::cerr << "duplicate hash <" << hash << "> found for request: " << req.requestName << std::endl; + std::cerr << "duplicate hash <" << hash << "> found for request: " << req.requestName + << std::endl; return true; } } @@ -220,7 +241,11 @@ void kernelRequestManager_t::compile() } return false; }(); - nekrsCheck(duplicateHashFound, platform->comm.mpiComm, EXIT_FAILURE, "%s\n", "More than one compile request is using the same hash!"); + nekrsCheck(duplicateHashFound, + platform->comm.mpiComm, + EXIT_FAILURE, + "%s\n", + "More than one compile request is using the same hash!"); } free(hashes); @@ -230,11 +255,11 @@ void kernelRequestManager_t::compile() if (platform->cacheBcast && !platform->buildOnly) { const auto srcPath = fs::path(getenv("OCCA_CACHE_DIR")); - const std::string cacheDir = platform->tmpDir / fs::path("occa/"); + const std::string cacheDir = platform->tmpDir / fs::path("occa/"); fileBcast(srcPath, fs::path(cacheDir) / "..", platform->comm.mpiComm, platform->verbose); - + // redirect - occa::env::OCCA_CACHE_DIR = cacheDir; + occa::env::OCCA_CACHE_DIR = cacheDir; setenv("OCCA_CACHE_DIR", cacheDir.c_str(), 1); } } diff --git a/src/core/linAlg/kernels/absoluteError.okl b/src/core/linAlg/kernels/absoluteError.okl index 1a04acc0c..f5ffe8c14 100644 --- a/src/core/linAlg/kernels/absoluteError.okl +++ b/src/core/linAlg/kernels/absoluteError.okl @@ -25,8 +25,8 @@ SOFTWARE. */ @kernel void absoluteError(const dlong N, - const long Nfields, - const long fieldOffset, + const int Nfields, + const dlong fieldOffset, const dfloat absTol, @ restrict const dfloat *u, @ restrict const dfloat *uRef, diff --git a/src/core/linAlg/kernels/relativeError.okl b/src/core/linAlg/kernels/relativeError.okl index e37b25f70..4a5d887e3 100644 --- a/src/core/linAlg/kernels/relativeError.okl +++ b/src/core/linAlg/kernels/relativeError.okl @@ -25,8 +25,8 @@ SOFTWARE. 
*/ @kernel void relativeError(const dlong N, - const long Nfields, - const long fieldOffset, + const int Nfields, + const dlong fieldOffset, const dfloat absTol, @ restrict const dfloat *u, @ restrict const dfloat *uRef, diff --git a/src/core/linAlg/linAlg.cpp b/src/core/linAlg/linAlg.cpp index 63559d7cc..10189a183 100644 --- a/src/core/linAlg/linAlg.cpp +++ b/src/core/linAlg/linAlg.cpp @@ -181,7 +181,9 @@ void linAlg_t::setup() double tStartLoadKernel = MPI_Wtime(); { std::string prefix = ""; - if (sizeof(dfloat) != sizeof(float)) prefix = "p"; + if (sizeof(dfloat) != sizeof(float)) { + prefix = "p"; + } pfillKernel = kernelRequests.load(prefix + "fill"); paxmyzManyKernel = kernelRequests.load(prefix + "axmyzMany"); padyManyKernel = kernelRequests.load(prefix + "adyMany"); @@ -243,26 +245,27 @@ void linAlg_t::setup() magSqrSymTensorDiagKernel = kernelRequests.load("magSqrSymTensorDiag"); magSqrTensorKernel = kernelRequests.load("magSqrTensor"); maskKernel = kernelRequests.load("mask"); - } } -linAlg_t::~linAlg_t() -{ -} +linAlg_t::~linAlg_t() {} /*********************/ /* vector operations */ /*********************/ -void linAlg_t::mask(const dlong N, const occa::memory& o_maskIds, occa::memory &o_a) +void linAlg_t::mask(const dlong N, const occa::memory &o_maskIds, occa::memory &o_a) { - if(N) maskKernel(N, o_maskIds, o_a); + if (N) { + maskKernel(N, o_maskIds, o_a); + } } -void linAlg_t::pmask(const dlong N, const occa::memory& o_maskIds, occa::memory &o_a) +void linAlg_t::pmask(const dlong N, const occa::memory &o_maskIds, occa::memory &o_a) { - if(N) pmaskKernel(N, o_maskIds, o_a); + if (N) { + pmaskKernel(N, o_maskIds, o_a); + } } // o_a[n] = alpha @@ -1401,7 +1404,6 @@ void linAlg_t::magSqrTensor(const dlong N, magSqrTensorKernel(N, fieldOffset, o_tensor, o_mag); } - void linAlg_t::magSqrSymTensor(const dlong N, const dlong fieldOffset, const occa::memory &o_tensor, @@ -1417,9 +1419,9 @@ void linAlg_t::magSqrSymTensor(const dlong N, } void linAlg_t::magSqrSymTensorDiag(const dlong N, - const dlong fieldOffset, - const occa::memory &o_tensor, - occa::memory &o_mag) + const dlong fieldOffset, + const occa::memory &o_tensor, + occa::memory &o_mag) { nekrsCheck(o_tensor.length() < 6 * fieldOffset, MPI_COMM_SELF, @@ -1430,8 +1432,6 @@ void linAlg_t::magSqrSymTensorDiag(const dlong N, magSqrSymTensorDiagKernel(N, fieldOffset, o_tensor, o_mag); } - - void linAlg_t::linearCombination(const dlong N, const dlong Nfields, const dlong fieldOffset, @@ -1454,27 +1454,27 @@ void linAlg_t::linearCombination(const dlong N, } dfloat linAlg_t::maxRelativeError(const dlong N, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dfloat absTol, const occa::memory &o_u, const occa::memory &o_uRef, MPI_Comm comm) { - auto o_err = platform->o_memPool.reserve(std::max(Nfields * fieldOffset, N)); + auto o_err = platform->deviceMemoryPool.reserve(std::max(Nfields * fieldOffset, N)); relativeErrorKernel(N, Nfields, fieldOffset, absTol, o_u, o_uRef, o_err); return this->amaxMany(N, Nfields, fieldOffset, o_err, comm); } dfloat linAlg_t::maxAbsoluteError(const dlong N, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dfloat absTol, const occa::memory &o_u, const occa::memory &o_uRef, MPI_Comm comm) { - auto o_err = platform->o_memPool.reserve(std::max(Nfields * fieldOffset, N)); + auto o_err = platform->deviceMemoryPool.reserve(std::max(Nfields * fieldOffset, N)); absoluteErrorKernel(N, Nfields, fieldOffset, absTol, o_u, o_uRef, o_err); return this->amaxMany(N, 
Nfields, fieldOffset, o_err, comm); } diff --git a/src/core/linAlg/linAlg.hpp b/src/core/linAlg/linAlg.hpp index 18971abca..1846cd6ce 100644 --- a/src/core/linAlg/linAlg.hpp +++ b/src/core/linAlg/linAlg.hpp @@ -365,7 +365,7 @@ class linAlg_t occa::memory &o_y); dfloat maxRelativeError(const dlong N, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dfloat absTol, const occa::memory &o_u, @@ -373,7 +373,7 @@ class linAlg_t MPI_Comm comm); dfloat maxAbsoluteError(const dlong N, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dfloat absTol, const occa::memory &o_u, diff --git a/src/core/nekrsSys.hpp.in b/src/core/nekrsSys.hpp.in index cc515c104..8211888cd 100644 --- a/src/core/nekrsSys.hpp.in +++ b/src/core/nekrsSys.hpp.in @@ -160,13 +160,13 @@ namespace constexpr double targetTimeBenchmark{0.5}; constexpr int BLOCKSIZE = 256; -constexpr int ALIGN_SIZE = 1024; +constexpr int ALIGN_SIZE_BYTES = 256; constexpr int NSCALAR_MAX = 99; const occa::memory o_NULL; template unsigned int alignStride(unsigned int stride) { - const auto pageW = ALIGN_SIZE / sizeof(T); + const auto pageW = ALIGN_SIZE_BYTES / sizeof(T); if (stride % pageW) { stride = (stride / pageW + 1) * pageW; } diff --git a/src/core/ogs/oogs.cpp b/src/core/ogs/oogs.cpp index 8a79b4304..f0685e844 100644 --- a/src/core/ogs/oogs.cpp +++ b/src/core/ogs/oogs.cpp @@ -659,17 +659,6 @@ oogs_t *oogs::setup(ogs_t *ogs, fflush(stdout); } - device.finish(); - constexpr int Nwarmup = 30; - for (int test = 0; test < Nwarmup; ++test) { - oogs::start(o_q, nVec, stride, type, ogsAdd, gs); - if (callback) { - callback(); - } - oogs::finish(o_q, nVec, stride, type, ogsAdd, gs); - } - device.finish(); - // run Ntests measurements and take min to eliminate runtime variations constexpr int Ntests = 100; double elapsedTest = std::numeric_limits::max(); diff --git a/src/core/opSEM.cpp b/src/core/opSEM.cpp index 105df52d4..8590dc564 100644 --- a/src/core/opSEM.cpp +++ b/src/core/opSEM.cpp @@ -7,7 +7,7 @@ static const std::string suffix = "Hex3D"; namespace opSEM { -void grad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_out) +void grad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { static occa::kernel kernel; if (!kernel.isInitialized()) { @@ -18,13 +18,12 @@ void grad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_ occa::memory grad(mesh_t *mesh, dlong offset, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->dim * offset); + auto o_out = platform->deviceMemoryPool.reserve(mesh->dim * offset); grad(mesh, offset, o_in, o_out); return o_out; } - -void strongGrad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_out) +void strongGrad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { static occa::kernel kernel; if (!kernel.isInitialized()) { @@ -35,12 +34,12 @@ void strongGrad(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memo occa::memory strongGrad(mesh_t *mesh, dlong offset, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->dim * offset); + auto o_out = platform->deviceMemoryPool.reserve(mesh->dim * offset); strongGrad(mesh, offset, o_in, o_out); return o_out; } -void strongGradVec(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_out) +void strongGradVec(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { for (int i = 0; i < mesh->dim; i++) { auto o_u = o_in.slice(i * offset, 
mesh->Nlocal); @@ -56,7 +55,7 @@ occa::memory strongGradVec(mesh_t *mesh, dlong offset, const occa::memory &o_in) return o_out; } -void divergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_out) +void divergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { static occa::kernel kernel; if (!kernel.isInitialized()) { @@ -67,12 +66,12 @@ void divergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memo occa::memory divergence(mesh_t *mesh, dlong offset, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->Nlocal); + auto o_out = platform->deviceMemoryPool.reserve(mesh->Nlocal); divergence(mesh, offset, o_in, o_out); return o_out; } -void strongDivergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory& o_out) +void strongDivergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { static occa::kernel kernel; if (!kernel.isInitialized()) { @@ -83,12 +82,16 @@ void strongDivergence(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa occa::memory strongDivergence(mesh_t *mesh, dlong offset, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->Nlocal); + auto o_out = platform->deviceMemoryPool.reserve(mesh->Nlocal); strongDivergence(mesh, offset, o_in); return o_out; } -void laplacian(mesh_t *mesh, dlong offset, const occa::memory &o_lambda, const occa::memory &o_in, occa::memory& o_out) +void laplacian(mesh_t *mesh, + dlong offset, + const occa::memory &o_lambda, + const occa::memory &o_in, + occa::memory &o_out) { static occa::memory o_fieldOffsetScan; static occa::kernel kernel; @@ -101,21 +104,24 @@ void laplacian(mesh_t *mesh, dlong offset, const occa::memory &o_lambda, const o occa::memory laplacian(mesh_t *mesh, dlong offset, const occa::memory &o_lambda, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->Nlocal); + auto o_out = platform->deviceMemoryPool.reserve(mesh->Nlocal); laplacian(mesh, offset, o_lambda, o_in, o_out); return o_out; } -void -strongLaplacian(mesh_t *mesh, dlong offset, const occa::memory &o_lambda, const occa::memory &o_in, occa::memory& o_out) +void strongLaplacian(mesh_t *mesh, + dlong offset, + const occa::memory &o_lambda, + const occa::memory &o_in, + occa::memory &o_out) { auto o_grad = strongGrad(mesh, offset, o_in); oogs::startFinish(o_grad, mesh->dim, offset, ogsDfloat, ogsAdd, mesh->oogs); - auto o_tmp = platform->o_memPool.reserve(mesh->Nlocal); + auto o_tmp = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->axmyz(mesh->Nlocal, 1.0, mesh->o_invAJw, o_lambda, o_tmp); platform->linAlg->axmyVector(mesh->Nlocal, offset, 0, 1.0, o_tmp, o_grad); - + o_out = strongDivergence(mesh, offset, o_grad); } @@ -127,7 +133,7 @@ strongLaplacian(mesh_t *mesh, dlong offset, const occa::memory &o_lambda, const return o_out; } -void strongCurl(mesh_t *mesh, dlong offset, const occa::memory& o_in, occa::memory& o_out) +void strongCurl(mesh_t *mesh, dlong offset, const occa::memory &o_in, occa::memory &o_out) { static occa::kernel kernel; if (!kernel.isInitialized()) { @@ -137,9 +143,9 @@ void strongCurl(mesh_t *mesh, dlong offset, const occa::memory& o_in, occa::memo kernel(mesh->Nelements, scaleJW, mesh->o_vgeo, mesh->o_D, offset, o_in, o_out); } -occa::memory strongCurl(mesh_t *mesh, dlong offset, const occa::memory& o_in) +occa::memory strongCurl(mesh_t *mesh, dlong offset, const occa::memory &o_in) { - auto o_out = platform->o_memPool.reserve(mesh->dim * 
offset); + auto o_out = platform->deviceMemoryPool.reserve(mesh->dim * offset); strongCurl(mesh, offset, o_in, o_out); return o_out; } diff --git a/src/core/platform.cpp b/src/core/platform.cpp index c43f10ca1..c3bf7401e 100644 --- a/src/core/platform.cpp +++ b/src/core/platform.cpp @@ -24,7 +24,6 @@ namespace static void compileDummyKernel(platform_t &plat) { const bool buildNodeLocal = plat.cacheLocal; - auto rank = buildNodeLocal ? plat.comm.localRank : plat.comm.mpiRank; const std::string dummyKernelName = "myDummyKernelName"; const std::string dummyKernelStr = std::string("@kernel void myDummyKernelName(int N) {" " for (int i = 0; i < N; ++i; @tile(64, @outer, @inner)) {}" @@ -58,7 +57,7 @@ platform_t::platform_t(setupAide &_options, MPI_Comm _commg, MPI_Comm _comm) buildOnly = false; if (options.compareArgs("BUILD ONLY", "TRUE")) { - buildOnly = true; + buildOnly = true; } if (options.getArgs("CHECKPOINT ENGINE").empty()) { @@ -68,7 +67,7 @@ platform_t::platform_t(setupAide &_options, MPI_Comm _commg, MPI_Comm _comm) exitValue = 0; // only relevant for SERIAL backend - setenv("OCCA_MEM_BYTE_ALIGN", std::to_string(ALIGN_SIZE).c_str(), 1); + setenv("OCCA_MEM_BYTE_ALIGN", std::to_string(ALIGN_SIZE_BYTES).c_str(), 1); cacheLocal = 0; if (getenv("NEKRS_CACHE_LOCAL")) { @@ -128,15 +127,14 @@ platform_t::platform_t(setupAide &_options, MPI_Comm _commg, MPI_Comm _comm) tmpDir.c_str()); } - const auto multiSession = [&]() - { + const auto multiSession = [&]() { int retVal; MPI_Comm_compare(comm.mpiComm, comm.mpiCommParent, &retVal); return (retVal == MPI_IDENT) ? false : true; }(); if (multiSession) { - int sessionID; + int sessionID; options.getArgs("NEKNEK SESSION ID", sessionID); tmpDir = fs::path(tmpDir) / fs::path("sess" + std::to_string(sessionID)); fs::create_directories(tmpDir); @@ -182,7 +180,6 @@ platform_t::platform_t(setupAide &_options, MPI_Comm _commg, MPI_Comm _comm) kernelInfo["defines/" "p_PI"] = M_PI; - if (device.mode() == "CUDA") { kernelInfo["defines/smXX"] = 1; } @@ -209,43 +206,41 @@ platform_t::platform_t(setupAide &_options, MPI_Comm _commg, MPI_Comm _comm) const std::string floatingPointType = static_cast(kernelInfo["defines/dfloat"]); if (floatingPointType.find("float") != std::string::npos) { - kernelInfo["defines/FP32"] = 1; + kernelInfo["defines/FP32"] = 1; } const std::string extension = serial ? 
".c" : ".okl"; - if (rank == 0) + if (rank == 0) { compileDummyKernel(*this); + } { occa::json properties; properties["resize_through_host"] = 1; - o_memPool = device.occaDevice().createMemoryPool(properties); - o_memPool.setAlignment(ALIGN_SIZE); + deviceMemoryPool = device.occaDevice().createMemoryPool(properties); + deviceMemoryPool.setAlignment(ALIGN_SIZE_BYTES); } { occa::json properties; properties["resize_through_host"] = 1; properties["host"] = true; - memPool = device.occaDevice().createMemoryPool(properties); - memPool.setAlignment(ALIGN_SIZE); + memoryPool = device.occaDevice().createMemoryPool(properties); + memoryPool.setAlignment(ALIGN_SIZE_BYTES); } } -// input files required for JIT kernel compilation or load +// input files required for JIT kernel compilation or load void platform_t::bcastJITKernelSourceFiles() { if (platform->verbose && comm.mpiRank == 0) { - std::cout << "broadcast kernel sources to " << platform->tmpDir << std::endl; + std::cout << "broadcast kernel sources to " << platform->tmpDir << std::endl; } const auto NEKRS_HOME_NEW = fs::path(tmpDir) / "nekrs"; const auto srcPath = fs::path(getenv("NEKRS_HOME")); - for (auto &entry : { - fs::path("include"), - fs::path("kernels") - }) { + for (auto &entry : {fs::path("include"), fs::path("kernels")}) { fileBcast(srcPath / entry, NEKRS_HOME_NEW, comm.mpiComm, verbose); } diff --git a/src/core/platform.hpp b/src/core/platform.hpp index df5e5ca99..06b956837 100644 --- a/src/core/platform.hpp +++ b/src/core/platform.hpp @@ -44,8 +44,8 @@ struct platform_t { device_t device; occa::properties kernelInfo; timer::timer_t timer; - occa::memoryPool o_memPool; - occa::memoryPool memPool; + occa::memoryPool deviceMemoryPool; + occa::memoryPool memoryPool; kernelRequestManager_t kernelRequests; Par *par; solver_t *solver; @@ -62,8 +62,9 @@ struct platform_t { occa::kernel copyDfloatToPfloatKernel; occa::kernel copyPfloatToDfloatKernel; occa::kernel copyDfloatToDoubleKernel; + occa::kernel copyDfloatToFloatKernel; occa::kernel copyDoubleToDfloatKernel; - + occa::kernel copyFloatToDfloatKernel; }; #endif diff --git a/src/core/registerCoreKernels.cpp b/src/core/registerCoreKernels.cpp index 950e63d26..0ce52a70b 100644 --- a/src/core/registerCoreKernels.cpp +++ b/src/core/registerCoreKernels.cpp @@ -18,9 +18,15 @@ void registerCoreKernels() kernelName = "core-copyDfloatToDouble"; platform->copyDfloatToDoubleKernel = platform->kernelRequests.load(kernelName); + kernelName = "core-copyDfloatToFloat"; + platform->copyDfloatToFloatKernel = platform->kernelRequests.load(kernelName); + kernelName = "core-copyDoubleToDfloat"; platform->copyDoubleToDfloatKernel = platform->kernelRequests.load(kernelName); + kernelName = "core-copyFloatToDfloat"; + platform->copyFloatToDfloatKernel = platform->kernelRequests.load(kernelName); + return; } @@ -109,8 +115,18 @@ void registerCoreKernels() fileName = oklpath + "/core/" + kernelName + extension; auto prop = platform->kernelInfo; prop["defines/pfloat"] = "double"; + prop["defines/dummy"] = 1; // just to make it different from copyDfloatToDouble to avoid collison platform->kernelRequests.add(section + "copyDfloatToDouble", fileName, prop); } + + { + kernelName = "copyDfloatToPfloat"; + fileName = oklpath + "/core/" + kernelName + extension; + auto prop = platform->kernelInfo; + prop["defines/pfloat"] = "float"; + prop["defines/dummy"] = 2; // just to make it different from copyDfloatToDouble to avoid collison + platform->kernelRequests.add(section + "copyDfloatToFloat", fileName, prop); + } { 
kernelName = "copyDfloatToPfloat"; @@ -118,10 +134,20 @@ void registerCoreKernels() auto prop = platform->kernelInfo; prop["defines/dfloat"] = "double"; prop["defines/pfloat"] = dfloatString; - prop["defines/dummy"] = 1; // just to make it different from copyDfloatToDouble to avoid collison + prop["defines/dummy"] = 3; // just to make it different from copyDfloatToDouble to avoid collison platform->kernelRequests.add(section + "copyDoubleToDfloat", fileName, prop); } + { + kernelName = "copyDfloatToPfloat"; + fileName = oklpath + "/core/" + kernelName + extension; + auto prop = platform->kernelInfo; + prop["defines/dfloat"] = "float"; + prop["defines/pfloat"] = dfloatString; + prop["defines/dummy"] = 4; // just to make it different from copyDfloatToDouble to avoid collison + platform->kernelRequests.add(section + "copyFloatToDfloat", fileName, prop); + } + auto prop = platform->kernelInfo; kernelName = "copyDfloatToPfloat"; fileName = oklpath + "/core/" + kernelName + extension; diff --git a/src/core/threadPool.cpp b/src/core/threadPool.cpp new file mode 100644 index 000000000..23ecfd5e6 --- /dev/null +++ b/src/core/threadPool.cpp @@ -0,0 +1,43 @@ +#include + +ThreadPool::ThreadPool(size_t numThreads) : stop(false), activeTasks(0) +{ + for (size_t i = 0; i < numThreads; ++i) { + workers.emplace_back([this] { this->workerThread(); }); + } +} + +ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queueMutex); + stop = true; + } + condition.notify_all(); + for (std::thread &worker : workers) { + worker.join(); + } +} + +void ThreadPool::workerThread() +{ + while (true) { + std::function task; + { + std::unique_lock lock(queueMutex); + condition.wait(lock, [this] { return this->stop || !this->tasks.empty(); }); + if (this->stop && this->tasks.empty()) { + return; + } + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + task(); + } +} + +void ThreadPool::finish() +{ + std::unique_lock lock(queueMutex); + taskCondition.wait(lock, [this] { return tasks.empty() && activeTasks == 0; }); +} diff --git a/src/core/threadPool.hpp b/src/core/threadPool.hpp new file mode 100644 index 000000000..5b3c2c522 --- /dev/null +++ b/src/core/threadPool.hpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool +{ +public: + ThreadPool(size_t numThreads); + ~ThreadPool(); + + template + auto enqueue(F &&f, Args &&...args) -> std::future::type>; + + void finish(); + + void safePrint(const std::string &message) + { + std::lock_guard guard(coutMutex); + std::cout << message << std::endl; + } + +private: + std::vector workers; + std::queue> tasks; + std::mutex queueMutex; + std::condition_variable condition; + bool stop; + + std::mutex coutMutex; + + std::mutex taskMutex; + std::condition_variable taskCondition; + size_t activeTasks; + + void workerThread(); +}; + +template +auto ThreadPool::enqueue(F &&f, Args &&...args) -> std::future::type> +{ + using returnType = typename std::invoke_result::type; + + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + std::future result = task->get_future(); + + { + std::unique_lock lock(queueMutex); + if (stop) { + throw std::runtime_error("Enqueue on stopped ThreadPool"); + } + tasks.emplace([task, this] { + { + std::unique_lock taskLock(taskMutex); + ++activeTasks; + } + (*task)(); + { + std::unique_lock taskLock(taskMutex); + --activeTasks; + if (activeTasks == 0) { + taskCondition.notify_all(); + } + } + }); + } + + condition.notify_one(); + return result; +} 
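For orientation, here is a minimal usage sketch of the ThreadPool introduced above; it is illustrative only and not part of the patch. It assumes the header is reachable as "threadPool.hpp" (hypothetical include path) and shows the intended pattern: enqueue() wraps the callable in a std::packaged_task and hands back a std::future, while finish() blocks until the task queue is drained and no task is still running.

#include "threadPool.hpp" // hypothetical include path for the header added above

#include <future>
#include <iostream>
#include <vector>

int main()
{
  ThreadPool pool(4); // spawn four worker threads

  // enqueue() returns a std::future<int> for each submitted task
  std::vector<std::future<int>> results;
  for (int i = 0; i < 8; ++i) {
    results.push_back(pool.enqueue([i] { return i * i; }));
  }

  // block until all queued tasks have completed
  pool.finish();

  for (auto &r : results) {
    std::cout << r.get() << std::endl; // futures are ready once finish() returns
  }

  return 0; // ~ThreadPool signals stop and joins the workers
}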
diff --git a/src/elliptic/MG/MGSolver.cpp b/src/elliptic/MG/MGSolver.cpp index 02f4c0130..8c425e10d 100644 --- a/src/elliptic/MG/MGSolver.cpp +++ b/src/elliptic/MG/MGSolver.cpp @@ -28,11 +28,12 @@ SOFTWARE. #include "platform.hpp" #include "linAlg.hpp" -namespace { +namespace +{ -void coarsenV(MGSolver_t* M) +void coarsenV(MGSolver_t *M) { - for(int k = 0 ; k < M->numLevels-1; ++k){ + for (int k = 0; k < M->numLevels - 1; ++k) { auto level = M->levels[k]; auto o_rhs = level->o_rhs; auto o_res = level->o_res; @@ -42,11 +43,11 @@ void coarsenV(MGSolver_t* M) o_res.copyFrom(o_rhs, level->Nrows); levelC->coarsen(o_res, o_rhsC); } - } -void prolongateV(MGSolver_t* M) + +void prolongateV(MGSolver_t *M) { - for(int k = M->numLevels-2; k >= 0; --k){ + for (int k = M->numLevels - 2; k >= 0; --k) { auto level = M->levels[k]; auto o_x = level->o_x; @@ -57,9 +58,10 @@ void prolongateV(MGSolver_t* M) levelC->prolongate(o_xC, o_x); } } -void schwarzSolve(MGSolver_t* M) -{ - for(int k = 0 ; k < M->numLevels-1; ++k){ + +void schwarzSolve(MGSolver_t *M) +{ + for (int k = 0; k < M->numLevels - 1; ++k) { auto level = M->levels[k]; auto o_rhs = level->o_rhs; auto o_x = level->o_x; @@ -75,13 +77,12 @@ void schwarzSolve(MGSolver_t* M) // rhsC = P^T res levelC->coarsen(o_res, o_rhsC); - } + } } -} +} // namespace -MGSolver_t::MGSolver_t(occa::device device_, MPI_Comm comm_, - setupAide options_) +MGSolver_t::MGSolver_t(occa::device device_, MPI_Comm comm_, setupAide options_) { device = device_; comm = comm_; @@ -90,107 +91,120 @@ MGSolver_t::MGSolver_t(occa::device device_, MPI_Comm comm_, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); - levels = (MGSolver_t::multigridLevel **) calloc(MAX_LEVELS,sizeof(MGSolver_t::multigridLevel *)); + levels = (MGSolver_t::multigridLevel **)calloc(MAX_LEVELS, sizeof(MGSolver_t::multigridLevel *)); coarseLevel = new coarseLevel_t(options, comm); numLevels = 0; - if(options.compareArgs("MGSOLVER CYCLE", "VCYCLE")) { + if (options.compareArgs("MGSOLVER CYCLE", "VCYCLE")) { ctype = VCYCLE; additive = false; - if(options.compareArgs("MGSOLVER CYCLE", "ADDITIVE")) { + if (options.compareArgs("MGSOLVER CYCLE", "ADDITIVE")) { if (options.compareArgs("MGSOLVER SMOOTHER", "CHEBYSHEV")) { - if(rank==0) printf("Additive vcycle is not supported for Chebyshev!\n"); + if (rank == 0) { + printf("Additive vcycle is not supported for Chebyshev!\n"); + } MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } additive = true; overlapCrsGridSolve = false; - if(options.compareArgs("MGSOLVER CYCLE", "OVERLAPCRS")){ - if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"){ + if (options.compareArgs("MGSOLVER CYCLE", "OVERLAPCRS")) { + if (platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") { overlapCrsGridSolve = false; } else { overlapCrsGridSolve = true; int provided; MPI_Query_thread(&provided); - if(provided != MPI_THREAD_MULTIPLE) { + if (provided != MPI_THREAD_MULTIPLE) { overlapCrsGridSolve = false; - if(rank ==0 && size > 1) + if (rank == 0 && size > 1) { printf("disable overlapping coarse solve as MPI_THREAD_MULTIPLE is not supported!\n"); + } } - if(size == 1) overlapCrsGridSolve = true; + if (size == 1) { + overlapCrsGridSolve = true; + } + } + if (rank == 0 && overlapCrsGridSolve) { + printf("overlapping coarse grid solve enabled\n"); } - if(rank ==0 && overlapCrsGridSolve) printf("overlapping coarse grid solve enabled\n"); } } else { - if (options.compareArgs("MGSOLVER SMOOTHER", "RAS") || + if (options.compareArgs("MGSOLVER SMOOTHER", "RAS") 
|| options.compareArgs("MGSOLVER SMOOTHER", "ASM")) { - if(!options.compareArgs("MGSOLVER SMOOTHER", "CHEBYSHEV")){ - if(rank==0) + if (!options.compareArgs("MGSOLVER SMOOTHER", "CHEBYSHEV")) { + if (rank == 0) { printf("Multiplicative vcycle is not supported for RAS/ASM smoother without Chebyshev!\n"); - MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); + } + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } } } } else { - if(rank==0) printf("Unknown multigrid cycle type!\n"); - MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); + if (rank == 0) { + printf("Unknown multigrid cycle type!\n"); + } + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } } -MGSolver_t::~MGSolver_t() { - for (int n=0;no_x = o_xFine; + levels[0]->o_x = o_xFine; levels[0]->o_rhs = o_rhsFine; - if(ctype == VCYCLE) { - if(additive) + if (ctype == VCYCLE) { + if (additive) { runAdditiveVcycle(); - else + } else { runVcycle(0); + } } levels[0]->o_x = nullptr; levels[0]->o_rhs = nullptr; } -void MGSolver_t::Report() { - -} +void MGSolver_t::Report() {} void MGSolver_t::runVcycle(int k) { MGSolver_t::multigridLevel *level = levels[k]; - auto& o_rhs = level->o_rhs; - auto& o_x = level->o_x; - auto& o_res = level->o_res; + auto &o_rhs = level->o_rhs; + auto &o_x = level->o_x; + auto &o_res = level->o_res; - if(k == baseLevel) { + if (k == baseLevel) { // zero initialize o_x as we don't solve for masked points platform->linAlg->pfill(o_x.size(), 0.0, o_x); coarseLevel->solvePtr(coarseLevel, o_rhs, o_x); return; } - MGSolver_t::multigridLevel *levelC = levels[k+1]; - auto& o_rhsC = levelC->o_rhs; - auto& o_xC = levelC->o_x; + MGSolver_t::multigridLevel *levelC = levels[k + 1]; + auto &o_rhsC = levelC->o_rhs; + auto &o_xC = levelC->o_x; level->smooth(o_rhs, o_x, true); level->residual(o_rhs, o_x, o_res); levelC->coarsen(o_res, o_rhsC); - this->runVcycle(k+1); // recursive call + this->runVcycle(k + 1); // recursive call levelC->prolongate(o_xC, o_x); @@ -205,7 +219,7 @@ void MGSolver_t::runAdditiveVcycle() const int nThreads = this->overlapCrsGridSolve ? 
2 : 1; occa::memory o_rhs = levels[baseLevel]->o_rhs; - occa::memory o_x = levels[baseLevel]->o_x; + occa::memory o_x = levels[baseLevel]->o_x; auto xBuffer = this->coarseLevel->xBuffer; auto ogs = this->coarseLevel->ogs; @@ -222,25 +236,26 @@ void MGSolver_t::runAdditiveVcycle() o_rhs.copyTo(Sx, Nlocal); o_x.getDevice().finish(); - #pragma omp parallel proc_bind(close) num_threads(nThreads) +#pragma omp parallel proc_bind(close) num_threads(nThreads) { - #pragma omp single +#pragma omp single { - #pragma omp task +#pragma omp task { schwarzSolve(this); } - #pragma omp task +#pragma omp task { - for(int i = 0; i < Nlocal; i++) - Sx[i] *= this->coarseLevel->weight[i]; + for (int i = 0; i < Nlocal; i++) { + Sx[i] *= this->coarseLevel->weight[i]; + } ogsGather(Gx, Sx, ogsPfloat, ogsAdd, ogs); - - for(int i = 0; i < NlocalT; i++) { - xBuffer[i] = 0; + + for (int i = 0; i < NlocalT; i++) { + xBuffer[i] = 0; } - - auto boomerAMG = (hypreWrapper::boomerAMG_t*) coarseLevel->boomerAMG; + + auto boomerAMG = (hypreWrapper::boomerAMG_t *)coarseLevel->boomerAMG; boomerAMG->solve(Gx, xBuffer); ogsScatter(Sx, xBuffer, ogsPfloat, ogsAdd, ogs); @@ -258,11 +273,11 @@ void MGSolver_t::runAdditiveVcycle() void MGSolver_t::allocateWorkStorage() { for (int k = 0; k < numLevels; k++) { - levels[k]->o_res = platform->o_memPool.reserve(levels[k]->Ncols); + levels[k]->o_res = platform->deviceMemoryPool.reserve(levels[k]->Ncols); // allocate coarse levels only if (k) { - levels[k]->o_x = platform->o_memPool.reserve(levels[k]->Ncols); - levels[k]->o_rhs = platform->o_memPool.reserve(levels[k]->Nrows); + levels[k]->o_x = platform->deviceMemoryPool.reserve(levels[k]->Ncols); + levels[k]->o_rhs = platform->deviceMemoryPool.reserve(levels[k]->Nrows); } } } diff --git a/src/elliptic/MG/ellipticBuildFEM.cpp b/src/elliptic/MG/ellipticBuildFEM.cpp index 26f25a13b..d8d81699a 100644 --- a/src/elliptic/MG/ellipticBuildFEM.cpp +++ b/src/elliptic/MG/ellipticBuildFEM.cpp @@ -54,7 +54,6 @@ void ellipticBuildFEM(elliptic_t* elliptic, dlong* nnz, hlong* globalStarts) { - mesh_t *mesh = elliptic->mesh; MPI_Barrier(platform->comm.mpiComm); const double tStart = MPI_Wtime(); if(platform->comm.mpiRank == 0) @@ -78,7 +77,6 @@ void ellipticBuildFEMHex3D(elliptic_t* elliptic, { mesh_t* mesh = elliptic->mesh; - setupAide& options = elliptic->options; std::vector ggeo(mesh->o_ggeo.length()); mesh->o_ggeo.copyTo(ggeo.data()); @@ -322,7 +320,6 @@ void ellipticBuildFEMGalerkin(elliptic_t* elliptic, exit(1); } - mesh_t* mesh = elliptic->mesh; switch(elliptic->elementType) { case TRIANGLES: case TETRAHEDRA: diff --git a/src/elliptic/MG/ellipticMultiGridLevel.cpp b/src/elliptic/MG/ellipticMultiGridLevel.cpp index 902d9bb32..fc2aad92e 100644 --- a/src/elliptic/MG/ellipticMultiGridLevel.cpp +++ b/src/elliptic/MG/ellipticMultiGridLevel.cpp @@ -29,9 +29,10 @@ #include "ellipticMultiGrid.h" #include "linAlg.hpp" #include + void pMGLevel::Ax(occa::memory o_x, occa::memory o_Ax) { - ellipticOperator(elliptic,o_x,o_Ax, pfloatString); + ellipticOperator(elliptic, o_x, o_Ax, pfloatString); } void pMGLevel::residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res) @@ -57,15 +58,16 @@ void pMGLevel::coarsen(occa::memory o_x, occa::memory o_Rx) const auto workPerElem = 2 * (NqF * NqF * NqF * NqC + NqF * NqF * NqC * NqC + NqF * NqC * NqC * NqC); flopCounter += static_cast(mesh->Nelements) * workPerElem; - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { + if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { oogs::startFinish(o_Rx, 
elliptic->Nfields, elliptic->fieldOffset, ogsPfloat, ogsAdd, elliptic->oogs); - ellipticApplyMask(elliptic, o_Rx, pfloatString); // apply mask again because coarsenKernel does not preserve it + ellipticApplyMask(elliptic, + o_Rx, + pfloatString); // apply mask again because coarsenKernel does not preserve it } - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; + const double factor = + (std::is_same::value && !std::is_same::value) ? 0.5 : 1.0; platform->flopCounter->add("pMGLevel::coarsen, N=" + std::to_string(mesh->N), factor * flopCounter); - } void pMGLevel::prolongate(occa::memory o_x, occa::memory o_Px) @@ -79,8 +81,8 @@ void pMGLevel::prolongate(occa::memory o_x, occa::memory o_Px) flopCounter += NqF * NqF * NqF; flopCounter *= static_cast(mesh->Nelements); - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; + const double factor = + (std::is_same::value && !std::is_same::value) ? 0.5 : 1.0; platform->flopCounter->add("pMGLevel::prolongate, N=" + std::to_string(mesh->N), factor * flopCounter); } @@ -88,49 +90,55 @@ void pMGLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero) { platform->timer.tic(elliptic->name + " preconditioner smoother N=" + std::to_string(mesh->N), 1); - if(!x_is_zero && smootherType == SmootherType::ASM) return; - if(!x_is_zero && smootherType == SmootherType::RAS) return; + if (!x_is_zero && smootherType == SmootherType::ASM) { + return; + } + if (!x_is_zero && smootherType == SmootherType::RAS) { + return; + } - if (smootherType == SmootherType::CHEBYSHEV) + if (smootherType == SmootherType::CHEBYSHEV) { this->smoothChebyshev(o_rhs, o_x, x_is_zero); - else if (smootherType == SmootherType::OPT_FOURTH_CHEBYSHEV || smootherType == SmootherType::FOURTH_CHEBYSHEV) + } else if (smootherType == SmootherType::OPT_FOURTH_CHEBYSHEV || + smootherType == SmootherType::FOURTH_CHEBYSHEV) { this->smoothFourthKindChebyshev(o_rhs, o_x, x_is_zero); - else if (smootherType == SmootherType::ASM) + } else if (smootherType == SmootherType::ASM) { this->smoothSchwarz(o_rhs, o_x, x_is_zero); - else if (smootherType == SmootherType::RAS) + } else if (smootherType == SmootherType::RAS) { this->smoothSchwarz(o_rhs, o_x, x_is_zero); - else if (smootherType == SmootherType::JACOBI) + } else if (smootherType == SmootherType::JACOBI) { this->smoothJacobi(o_rhs, o_x, x_is_zero); + } platform->timer.toc(elliptic->name + " preconditioner smoother N=" + std::to_string(mesh->N)); } void pMGLevel::smoother(occa::memory o_x, occa::memory o_Sx, bool x_is_zero) { - if (chebySmootherType == ChebyshevSmootherType::JACOBI){ + if (chebySmootherType == ChebyshevSmootherType::JACOBI) { this->smootherJacobi(o_x, o_Sx); } else { this->smoothSchwarz(o_x, o_Sx, true); } } -void pMGLevel::smoothJacobi (occa::memory &o_r, occa::memory &o_x, bool xIsZero) +void pMGLevel::smoothJacobi(occa::memory &o_r, occa::memory &o_x, bool xIsZero) { - auto o_res = platform->o_memPool.reserve(Ncols); - auto o_d = platform->o_memPool.reserve(Ncols); + auto o_res = platform->deviceMemoryPool.reserve(Ncols); + auto o_d = platform->deviceMemoryPool.reserve(Ncols); const pfloat one = 1.0; const pfloat mone = -1.0; double flopCount = 0.0; - if(xIsZero) { //skip the Ax if x is zero - //res = Sr - platform->linAlg->paxmyz(Nrows,one,o_invDiagA,o_r,o_x); + if (xIsZero) { // skip the Ax if x is zero + // res = Sr + platform->linAlg->paxmyz(Nrows, one, o_invDiagA, o_r, o_x); flopCount += Nrows; } else { - //res = S(r-Ax) - this->Ax(o_x,o_res); + // res = 
S(r-Ax) + this->Ax(o_x, o_res); platform->linAlg->paxpby(Nrows, one, o_r, mone, o_res); platform->linAlg->paxmyz(Nrows, one, o_invDiagA, o_res, o_d); platform->linAlg->paxpby(Nrows, one, o_d, one, o_x); @@ -138,18 +146,19 @@ void pMGLevel::smoothJacobi (occa::memory &o_r, occa::memory &o_x, bool xIsZero) flopCount += 7 * Nrows; } auto mesh = elliptic->mesh; - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; + const double factor = + (std::is_same::value && !std::is_same::value) ? 0.5 : 1.0; platform->flopCounter->add("pMGLevel::smoothJacobi, N=" + std::to_string(mesh->N), factor * flopCount); } -void pMGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero) +void pMGLevel::smoothChebyshev(occa::memory &o_r, occa::memory &o_x, bool xIsZero) { const auto ChebyshevDegree = xIsZero ? DownLegChebyshevDegree : UpLegChebyshevDegree; // p_0(0) = I -> no-op smoothing - if (ChebyshevDegree == 0) + if (ChebyshevDegree == 0) { return; + } const pfloat theta = 0.5 * (lambda1 + lambda0); const pfloat delta = 0.5 * (lambda1 - lambda0); @@ -159,9 +168,9 @@ void pMGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZe pfloat one = 1., mone = -1., zero = 0.0; - auto o_res = platform->o_memPool.reserve(Ncols); - auto o_Ad = platform->o_memPool.reserve(Ncols); - auto o_d = platform->o_memPool.reserve(Ncols); + auto o_res = platform->deviceMemoryPool.reserve(Ncols); + auto o_Ad = platform->deviceMemoryPool.reserve(Ncols); + auto o_d = platform->deviceMemoryPool.reserve(Ncols); double flopCount = 0.0; @@ -171,7 +180,7 @@ void pMGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZe // res = S(r-Ax) if (!xIsZero) { - this->Ax(o_x,o_res); + this->Ax(o_x, o_res); platform->linAlg->paxpby(Nrows, one, o_r, mone, o_res); flopCount += 2 * Nrows; } else { @@ -186,7 +195,7 @@ void pMGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZe for (int k = 1; k < ChebyshevDegree; k++) { // SAd_k - this->Ax(o_d,o_Ad); + this->Ax(o_d, o_Ad); this->smoother(o_Ad, o_Ad, xIsZero); // x_k+1 = x_k + d_k @@ -203,30 +212,31 @@ void pMGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZe flopCount += 5 * Nrows; } - //x_k+1 = x_k + d_k + // x_k+1 = x_k + d_k platform->linAlg->paxpby(Nrows, one, o_d, one, o_x); flopCount += Nrows; ellipticApplyMask(elliptic, o_x, pfloatString); - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; + const double factor = + (std::is_same::value && !std::is_same::value) ? 0.5 : 1.0; platform->flopCounter->add("pMGLevel::smoothChebyshev, N=" + std::to_string(mesh->N), factor * flopCount); } -void pMGLevel::smoothFourthKindChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero) +void pMGLevel::smoothFourthKindChebyshev(occa::memory &o_r, occa::memory &o_x, bool xIsZero) { const auto ChebyshevDegree = xIsZero ? DownLegChebyshevDegree : UpLegChebyshevDegree; auto &betas = xIsZero ? 
DownLegBetas : UpLegBetas; // p_0(0) = I -> no-op smoothing - if (ChebyshevDegree == 0) + if (ChebyshevDegree == 0) { return; + } pfloat one = 1., mone = -1., zero = 0.0; - auto o_res = platform->o_memPool.reserve(Ncols); - auto o_Ad = platform->o_memPool.reserve(Ncols); - auto o_d = platform->o_memPool.reserve(Ncols); + auto o_res = platform->deviceMemoryPool.reserve(Ncols); + auto o_Ad = platform->deviceMemoryPool.reserve(Ncols); + auto o_d = platform->deviceMemoryPool.reserve(Ncols); const auto rho = this->lambda1; @@ -236,9 +246,8 @@ void pMGLevel::smoothFourthKindChebyshev (occa::memory &o_r, occa::memory &o_x, if (xIsZero) { platform->linAlg->pfill(Nrows, zero, o_x); o_res.copyFrom(o_r, Nrows); - } - else { - this->Ax(o_x,o_res); + } else { + this->Ax(o_x, o_res); platform->linAlg->paxpby(Nrows, one, o_r, mone, o_res); flopCount += Nrows; } @@ -265,21 +274,22 @@ void pMGLevel::smoothFourthKindChebyshev (occa::memory &o_r, occa::memory &o_x, platform->linAlg->paxpby(Nrows, rCoeff, o_Ad, dCoeff, o_d); } - //x_k+1 = x_k + \beta_k d_k + // x_k+1 = x_k + \beta_k d_k platform->linAlg->paxpby(Nrows, betas.back(), o_d, one, o_x); flopCount += 2 * Nrows; ellipticApplyMask(elliptic, o_x, pfloatString); - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; - platform->flopCounter->add("pMGLevel::smoothOptChebyshev, N=" + std::to_string(mesh->N), factor * flopCount); + const double factor = + (std::is_same::value && !std::is_same::value) ? 0.5 : 1.0; + platform->flopCounter->add("pMGLevel::smoothOptChebyshev, N=" + std::to_string(mesh->N), + factor * flopCount); } void pMGLevel::smootherJacobi(occa::memory &o_r, occa::memory &o_Sr) { platform->linAlg->paxmyz(Nrows, 1.0f, o_invDiagA, o_r, o_Sr); - const double factor = (std::is_same::value && !std::is_same::value) - ? 0.5 : 1.0; + const double factor = + (std::is_same::value && !std::is_same::value) ? 
0.5 : 1.0; platform->flopCounter->add("pMGLevel::smootherJacobi, N=" + std::to_string(mesh->N), factor * Nrows); } diff --git a/src/elliptic/MG/ellipticMultiGridLevelSetup.cpp b/src/elliptic/MG/ellipticMultiGridLevelSetup.cpp index a0b8f901e..4999df092 100644 --- a/src/elliptic/MG/ellipticMultiGridLevelSetup.cpp +++ b/src/elliptic/MG/ellipticMultiGridLevelSetup.cpp @@ -57,8 +57,7 @@ ChebyshevSmootherType convertSmootherType(SmootherType s) pMGLevel::pMGLevel(elliptic_t *ellipticBase, int Nc, setupAide options_, MPI_Comm comm_, bool _isCoarse) : multigridLevel(ellipticBase->mesh->Nelements * ellipticBase->mesh->Np, - (ellipticBase->mesh->Nelements) * - ellipticBase->mesh->Np, + (ellipticBase->mesh->Nelements) * ellipticBase->mesh->Np, comm_) { isCoarse = _isCoarse; @@ -82,8 +81,7 @@ pMGLevel::pMGLevel(elliptic_t *ellipticBase, // finest level MPI_Comm comm_, bool _isCoarse) : multigridLevel(ellipticCoarse->mesh->Nelements * ellipticCoarse->mesh->Np, - ellipticCoarse->mesh->Np * - (ellipticCoarse->mesh->Nelements), + ellipticCoarse->mesh->Np * (ellipticCoarse->mesh->Nelements), comm_) { @@ -160,7 +158,8 @@ void pMGLevel::setupSmoother(elliptic_t *ellipticBase) std::string schedule = options.getArgs("MULTIGRID SCHEDULE"); if (!schedule.empty()) { - auto [scheduleMap, errorString] = ellipticParseMultigridSchedule(schedule, options, DownLegChebyshevDegree); + auto [scheduleMap, errorString] = + ellipticParseMultigridSchedule(schedule, options, DownLegChebyshevDegree); if (scheduleMap[{degree, true}] > -1) { UpLegChebyshevDegree = scheduleMap[{degree, true}]; } @@ -213,7 +212,8 @@ void pMGLevel::Report() if (platform->comm.mpiRank == 0) { if (isCoarse && options.compareArgs("MULTIGRID COARSE SOLVE", "TRUE")) { const auto useSEMFEM = options.compareArgs("MULTIGRID SEMFEM", "TRUE"); - if (options.compareArgs("MULTIGRID COARSE SOLVE AND SMOOTH", "TRUE") || options.compareArgs("MULTIGRID SEMFEM", "TRUE")) { + if (options.compareArgs("MULTIGRID COARSE SOLVE AND SMOOTH", "TRUE") || + options.compareArgs("MULTIGRID SEMFEM", "TRUE")) { printf("| pMG | Matrix-free | %s\n", smootherString.c_str()); printf(" | | p = %2d |\n", degree); @@ -308,7 +308,7 @@ dfloat pMGLevel::maxEigSmoothAx() hlong Nglobal = 0; MPI_Allreduce(&Nlocal, &Nglobal, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - auto o_invDegree = platform->o_memPool.reserve(Nlocal); + auto o_invDegree = platform->deviceMemoryPool.reserve(Nlocal); o_invDegree.copyFrom(elliptic->ogs->invDegree); const auto k = (unsigned int)std::min(pMGLevel::Narnoldi, Nglobal); @@ -317,14 +317,14 @@ dfloat pMGLevel::maxEigSmoothAx() std::vector o_V(k + 1); for (int i = 0; i <= k; i++) { - o_V[i] = platform->o_memPool.reserve(M); + o_V[i] = platform->deviceMemoryPool.reserve(M); } - auto o_Vx = platform->o_memPool.reserve(M); - auto o_VxPfloat = platform->o_memPool.reserve(M); + auto o_Vx = platform->deviceMemoryPool.reserve(M); + auto o_VxPfloat = platform->deviceMemoryPool.reserve(M); - auto o_AVx = platform->o_memPool.reserve(M); - auto o_AVxPfloat = platform->o_memPool.reserve(M); + auto o_AVx = platform->deviceMemoryPool.reserve(M); + auto o_AVxPfloat = platform->deviceMemoryPool.reserve(M); if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { ogsGatherScatter(Vx.data(), ogsDfloat, ogsAdd, mesh->ogs); diff --git a/src/elliptic/MG/ellipticMultiGridSetup.cpp b/src/elliptic/MG/ellipticMultiGridSetup.cpp index 78c43a8f2..b1537ede1 100644 --- a/src/elliptic/MG/ellipticMultiGridSetup.cpp +++ b/src/elliptic/MG/ellipticMultiGridSetup.cpp @@ -80,8 +80,8 @@ void 
ellipticMultiGridSetup(elliptic_t *elliptic_) return; } - auto o_p = platform->o_memPool.reserve(mesh->Nlocal); - auto o_Ap = platform->o_memPool.reserve(mesh->Nlocal); + auto o_p = platform->deviceMemoryPool.reserve(mesh->Nlocal); + auto o_Ap = platform->deviceMemoryPool.reserve(mesh->Nlocal); auto timeOperator = [&]() { const int Nsamples = 10; @@ -223,7 +223,7 @@ void ellipticMultiGridSetup(elliptic_t *elliptic_) baseLevel->smooth(o_rhs, o_x, true); baseLevel->residual(o_rhs, o_x, o_res); - auto o_tmp = platform->o_memPool.reserve(o_x.size()); + auto o_tmp = platform->deviceMemoryPool.reserve(o_x.size()); elliptic->precon->SEMFEMSolver->run(o_res, o_tmp); platform->linAlg->paxpby(o_x.size(), 1.0, o_tmp, 1.0, o_x); @@ -299,7 +299,7 @@ void ellipticMultiGridSetup(elliptic_t *elliptic_) baseLevel->smooth(o_rhs, o_x, true); baseLevel->residual(o_rhs, o_x, o_res); - auto o_tmp = platform->o_memPool.reserve(baseLevel->Nrows); + auto o_tmp = platform->deviceMemoryPool.reserve(baseLevel->Nrows); coarseLevel->solve(o_res, o_tmp); platform->linAlg->paxpby(baseLevel->Nrows, 1.0, o_tmp, 1.0, o_x); diff --git a/src/elliptic/SEMFEMSolver.cpp b/src/elliptic/SEMFEMSolver.cpp index 52f108940..398edb58c 100644 --- a/src/elliptic/SEMFEMSolver.cpp +++ b/src/elliptic/SEMFEMSolver.cpp @@ -166,8 +166,8 @@ void SEMFEMSolver_t::run(const occa::memory &o_r, occa::memory &o_z) const auto useDevice = elliptic->options.compareArgs("COARSE SOLVER LOCATION", "DEVICE"); const dlong numRows = o_dofMap.size(); - auto o_rT = platform->o_memPool.reserve(numRows); - auto o_zT = platform->o_memPool.reserve(numRows); + auto o_rT = platform->deviceMemoryPool.reserve(numRows); + auto o_zT = platform->deviceMemoryPool.reserve(numRows); static occa::kernel gatherKernel; if (!gatherKernel.isInitialized()) { diff --git a/src/elliptic/ellipticPreconditionerSetup.cpp b/src/elliptic/ellipticPreconditionerSetup.cpp index c06d7b6fd..b8219fb16 100644 --- a/src/elliptic/ellipticPreconditionerSetup.cpp +++ b/src/elliptic/ellipticPreconditionerSetup.cpp @@ -32,7 +32,6 @@ void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs) { const auto pre = platform->device.occaDevice().memoryAllocated(); - mesh_t *mesh = elliptic->mesh; setupAide &options = elliptic->options; MPI_Barrier(platform->comm.mpiComm); diff --git a/src/elliptic/ellipticSetup.cpp b/src/elliptic/ellipticSetup.cpp index 2905ed941..3760c7405 100644 --- a/src/elliptic/ellipticSetup.cpp +++ b/src/elliptic/ellipticSetup.cpp @@ -328,8 +328,8 @@ void ellipticSolveSetup(elliptic_t *elliptic, const occa::memory &o_lambda0, con if (platform->options.compareArgs("ENABLE GS COMM OVERLAP", "TRUE")) { const auto Nlocal = elliptic->Nfields * static_cast(elliptic->fieldOffset); - auto o_p = platform->o_memPool.reserve(Nlocal); - auto o_Ap = platform->o_memPool.reserve(Nlocal); + auto o_p = platform->deviceMemoryPool.reserve(Nlocal); + auto o_Ap = platform->deviceMemoryPool.reserve(Nlocal); auto timeEllipticOperator = [&]() { const int Nsamples = 10; diff --git a/src/elliptic/ellipticSolutionProjection.cpp b/src/elliptic/ellipticSolutionProjection.cpp index 3e33c1e56..f0a993019 100644 --- a/src/elliptic/ellipticSolutionProjection.cpp +++ b/src/elliptic/ellipticSolutionProjection.cpp @@ -105,7 +105,7 @@ void SolutionProjection::updateProjectionSpace() // printf("norm_new:%g norm_orig:%g sumAlpha:%g\n", norm_new, norm_orig, sumAlpha); norm_new = sqrt(norm_new); - dfloat tol = 1e-6; + const dfloat tol = (sizeof(dfloat) == sizeof(double)) ? 
1e-6: 1e-4; const dfloat test = norm_new / norm_orig; if (test > tol) { const dfloat scale = 1.0 / norm_new; @@ -142,7 +142,6 @@ void SolutionProjection::computePreProjection(occa::memory &o_r) dfloat flopCount = 0.0; dfloat one = 1.0; - dfloat zero = 0.0; dfloat mone = -1.0; if (numVecsProjection <= 0) { return; @@ -178,7 +177,7 @@ void SolutionProjection::computePreProjection(occa::memory &o_r) flopCount += Nfields * (1 + 2 * (numVecsProjection - 1)) * static_cast(Nlocal); if (type == ProjectionType::CLASSIC) { - auto o_rtmp = platform->o_memPool.reserve(Nfields * fieldOffset); + auto o_rtmp = platform->deviceMemoryPool.reserve(Nfields * fieldOffset); accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_bb, o_rtmp); platform->linAlg->axpbyMany(Nlocal, Nfields, fieldOffset, mone, o_rtmp, one, o_r); @@ -194,7 +193,6 @@ void SolutionProjection::computePreProjection(occa::memory &o_r) void SolutionProjection::computePostProjection(occa::memory &o_x) { const dfloat one = 1.0; - const dfloat zero = 0.0; if (numVecsProjection == 0) { // reset bases diff --git a/src/elliptic/ellipticSolve.cpp b/src/elliptic/ellipticSolve.cpp index 3b1f1d4d1..189a14549 100644 --- a/src/elliptic/ellipticSolve.cpp +++ b/src/elliptic/ellipticSolve.cpp @@ -66,7 +66,7 @@ void ellipticSolve(elliptic_t *elliptic, txt.c_str()); }; - auto o_x0 = platform->o_memPool.reserve( + auto o_x0 = platform->deviceMemoryPool.reserve( (elliptic->Nfields > 1) ? elliptic->Nfields * elliptic->fieldOffset : mesh->Nlocal); nekrsCheck(o_x.size() < o_x0.size(), MPI_COMM_SELF, EXIT_FAILURE, "%s!\n", "unreasonable size of o_x"); nekrsCheck(o_rhs.size() < o_x.size(), MPI_COMM_SELF, EXIT_FAILURE, "%s!\n", "unreasonable size of o_rhs"); @@ -133,7 +133,7 @@ void ellipticSolve(elliptic_t *elliptic, // compute initial residual r = rhs - Ax0 auto o_r = [&]() { - auto o_r = platform->o_memPool.reserve(o_x0.size()); + auto o_r = platform->deviceMemoryPool.reserve(o_x0.size()); auto &o_Ap = o_x; ellipticAx(elliptic, mesh->Nelements, mesh->o_elementList, o_x0, o_Ap, dfloatString); platform->linAlg diff --git a/src/elliptic/ellipticWorkspace.cpp b/src/elliptic/ellipticWorkspace.cpp index bca7fb77c..fe434782d 100644 --- a/src/elliptic/ellipticWorkspace.cpp +++ b/src/elliptic/ellipticWorkspace.cpp @@ -5,8 +5,8 @@ void ellipticAllocateWorkspace(elliptic_t *elliptic) { const auto Nlocal = elliptic->Nfields * static_cast(elliptic->fieldOffset); - elliptic->o_rPfloat = platform->o_memPool.reserve(Nlocal); - elliptic->o_zPfloat = platform->o_memPool.reserve(Nlocal); + elliptic->o_rPfloat = platform->deviceMemoryPool.reserve(Nlocal); + elliptic->o_zPfloat = platform->deviceMemoryPool.reserve(Nlocal); if (elliptic->precon) { if (elliptic->precon->MGSolver) { diff --git a/src/elliptic/kernels/ellipticBlockPartialAxCoeffHex3D.okl b/src/elliptic/kernels/ellipticBlockPartialAxCoeffHex3D.okl index 9551a34cf..e50a9d1ba 100644 --- a/src/elliptic/kernels/ellipticBlockPartialAxCoeffHex3D.okl +++ b/src/elliptic/kernels/ellipticBlockPartialAxCoeffHex3D.okl @@ -713,3 +713,366 @@ } } #endif + +// Tim's v5 +#if p_knl == 3 +#define p_dim 3 +@kernel void ellipticBlockPartialAxCoeffHex3D_v3(const dlong Nelements, + const dlong offset, + const dlong loffset, + @ restrict const dlong *elementList, + @ restrict const dfloat *ggeo, + @ restrict const dfloat *D, + @ restrict const dfloat *S, + @ restrict const dfloat *lambda0, + @ restrict const dfloat *lambda1, + @ restrict const dfloat *q, + @ restrict dfloat *Aq) +{ +#if defined(XeHPC) + @simd_length(16) +#endif + 
@outer for (dlong e = 0; e < Nelements; ++e) { + @exclusive dlong element; + +#if p_Nq==2 || p_Nq==4 || p_Nq==6 || p_Nq==8 +#define p_pad 1 +#else +#define p_pad 0 +#endif + + @shared dfloat s_U1[p_Nq][p_Nq][p_Nq]; + @shared dfloat s_U2[p_Nq][p_Nq][p_Nq]; + + @inner for (int j = 0; j < p_Nq; ++j) { + @inner for (int i = 0; i < p_Nq; ++i) { + + element = elementList[e]; + + for (int d = 0; d < p_dim; ++d) { + @barrier(); + + dfloat r_U[p_Nq]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + const dfloat U_ijk = q[i + p_Nq * (j + p_Nq * (k + p_Nq * element)) + d * offset]; + r_U[k] = U_ijk; + s_U2[k][j][i] = U_ijk; + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_jil = s_U2[j][i][l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_jil; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_U1[j][i][k] = r_Ud[k]; + } + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_jli = s_U2[j][l][i]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_jli; + } + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_U2[j][k][i] = r_Ud[k]; + } + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_lji = r_U[l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_lji; + } + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + const dlong lbase = i+p_Nq*j+p_Nq*p_Nq*k; + const dlong gbase = lbase+p_Np*element*p_Nggeo; + const dfloat lam0k = lambda0[lbase+p_Np*element+d*loffset]; + const dfloat lam1k = lambda1[lbase+p_Np*element+d*loffset]; + + const dfloat G00 = ggeo[gbase+p_Np*p_G00ID], G01 = ggeo[gbase+p_Np*p_G01ID], G02 = ggeo[gbase+p_Np*p_G02ID]; + const dfloat G11 = ggeo[gbase+p_Np*p_G11ID], G12 = ggeo[gbase+p_Np*p_G12ID]; + const dfloat G22 = ggeo[gbase+p_Np*p_G22ID]; + const dfloat GWJ = ggeo[gbase+p_Np*p_GWJID]; + + const dfloat Ur = s_U1[k][j][i]; + const dfloat Us = s_U2[k][j][i]; + + const dfloat GUr = G00 * Ur + G01 * Us + G02 * r_Ud[k]; + const dfloat GUs = G01 * Ur + G11 * Us + G12 * r_Ud[k]; + const dfloat GUt = G02 * Ur + G12 * Us + G22 * r_Ud[k]; + + r_Ud[k] = lam0k * GUt; + r_U[k] *= GWJ * lam1k; + + s_U1[k][j][i] = lam0k * GUr; + s_U2[k][j][i] = lam0k * GUs; + + } + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat GUt_lji = r_Ud[l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_U[k] += c_D[l][k] * GUt_lji; + } + } + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat GUr_jil = s_U1[j][i][l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[l][k] * GUr_jil; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_U1[j][i][k] = r_Ud[k]; + } + } + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat GUs_jli = s_U2[j][l][i]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[l][k] * GUs_jli; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_U2[j][k][i] = r_Ud[k]; + } + } + + @barrier(); + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + Aq[i + p_Nq * (j + p_Nq * (k + p_Nq * element)) + d * offset] = r_U[k] + s_U1[k][j][i] + s_U2[k][j][i]; + } + } + } + } + } +} +#endif + +// Tim's v6 +#if 
p_knl == 4 +#define p_dim 3 +@kernel void ellipticBlockPartialAxCoeffHex3D_v4(const dlong Nelements, + const dlong offset, + const dlong loffset, + @ restrict const dlong *elementList, + @ restrict const dfloat *ggeo, + @ restrict const dfloat *D, + @ restrict const dfloat *S, + @ restrict const dfloat *lambda0, + @ restrict const dfloat *lambda1, + @ restrict const dfloat *q, + @ restrict dfloat *Aq) +{ +#if defined(XeHPC) + @simd_length(16) +#endif + @outer for (dlong e = 0; e < Nelements; ++e) { + +#if p_Nq==2 || p_Nq==4 || p_Nq==6 || p_Nq==8 +#define p_pad 1 +#else +#define p_pad 0 +#endif + + @shared dfloat s_U[p_Nq][p_Nq][p_Nq]; + @shared dfloat s_Ur[p_Nq][p_Nq][p_Nq]; + @shared dfloat s_Us[p_Nq][p_Nq][p_Nq]; + + @inner for (int j = 0; j < p_Nq; ++j) { + @inner for (int i = 0; i < p_Nq; ++i) { + + const dlong element = elementList[e]; + + for (int d = 0; d < p_dim; ++d) { + + @barrier(); + + dfloat r_U[p_Nq]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + const dfloat U_ijk = q[i + p_Nq * (j + p_Nq * (k + p_Nq * element)) + d * offset]; + s_U[k][j][i] = U_ijk; + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_jil = s_U[j][i][l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_jil; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_Ur[j][i][k] = r_Ud[k]; + } + } + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_jli = s_U[j][l][i]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_jli; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_Us[j][k][i] = r_Ud[k]; + } + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat U_lji = s_U[l][j][i]; // r_U[l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[k][l] * U_lji; + } + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + const dlong lbase = i+p_Nq*j+p_Nq*p_Nq*k; + const dlong gbase = lbase+p_Np*element*p_Nggeo; + const dfloat lam0k = lambda0[lbase+p_Np*element+d*loffset]; + const dfloat lam1k = lambda1[lbase+p_Np*element+d*loffset]; + + const dfloat G00 = ggeo[gbase+p_Np*p_G00ID], G01 = ggeo[gbase+p_Np*p_G01ID], G02 = ggeo[gbase+p_Np*p_G02ID]; + const dfloat G11 = ggeo[gbase+p_Np*p_G11ID], G12 = ggeo[gbase+p_Np*p_G12ID]; + const dfloat G22 = ggeo[gbase+p_Np*p_G22ID]; + const dfloat GWJ = ggeo[gbase+p_Np*p_GWJID]; + + const dfloat GUr = G00 * s_Ur[k][j][i] + G01 * s_Us[k][j][i] + G02 * r_Ud[k]; + const dfloat GUs = G01 * s_Ur[k][j][i] + G11 * s_Us[k][j][i] + G12 * r_Ud[k]; + const dfloat GUt = G02 * s_Ur[k][j][i] + G12 * s_Us[k][j][i] + G22 * r_Ud[k]; + + s_Ur[k][j][i] = lam0k * GUr; + s_Us[k][j][i] = lam0k * GUs; + r_Ud[k] = lam0k * GUt; + //r_U[k] *= GWJ * lam1k; + r_U[k] = GWJ * lam1k*s_U[k][j][i]; + } + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat GUt_lji = r_Ud[l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_U[k] += c_D[l][k] * GUt_lji; + } + } + } + + @barrier(); + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for (int l = 0; l < p_Nq; ++l) { + const dfloat GUr_jil = s_Ur[j][i][l]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[l][k] * GUr_jil; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_Ur[j][i][k] = r_Ud[k]; + } + } + + { + dfloat r_Ud[p_Nq] = {0.}; + +#pragma unroll p_Nq + for 
(int l = 0; l < p_Nq; ++l) { + const dfloat GUs_jli = s_Us[j][l][i]; +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + r_Ud[k] += c_D[l][k] * GUs_jli; + } + } +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + s_Us[j][k][i] = r_Ud[k]; + } + } + + @barrier(); + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; ++k) { + Aq[i + p_Nq * (j + p_Nq * (k + p_Nq * element)) + d * offset] = r_U[k] + s_Ur[k][j][i] + s_Us[k][j][i]; + } + } + } + } + } +} +#endif diff --git a/src/elliptic/linearSolver/PCG.cpp b/src/elliptic/linearSolver/PCG.cpp index 4e25206d7..bc135ec55 100644 --- a/src/elliptic/linearSolver/PCG.cpp +++ b/src/elliptic/linearSolver/PCG.cpp @@ -38,6 +38,8 @@ namespace { +constexpr auto tiny = 10 * std::numeric_limits::min(); + occa::memory o_p; occa::memory o_z; occa::memory o_Ap; @@ -54,7 +56,6 @@ dfloat update(elliptic_t *elliptic, { mesh_t *mesh = elliptic->mesh; - const bool serial = platform->serial; // r <= r - alpha*A*p @@ -69,10 +70,10 @@ dfloat update(elliptic_t *elliptic, dfloat rdotr1 = 0; #ifdef ELLIPTIC_ENABLE_TIMER - platform->timer.tic("dotp",0); + platform->timer.tic("dotp"); #endif if (serial) { - rdotr1 = *((dfloat *)o_tmpReductions.ptr()); + rdotr1 = *(o_tmpReductions.ptr()); } else { auto tmp = h_tmpReductions.ptr(); o_tmpReductions.copyTo(tmp); @@ -89,7 +90,7 @@ dfloat update(elliptic_t *elliptic, platform->timer.toc("dotp"); #endif - platform->flopCounter->add(elliptic->name + " ellipticUpdatePC", + platform->flopCounter->add(elliptic->name + " ellipticUpdatePCG", elliptic->Nfields * static_cast(mesh->Nlocal) * 6 + mesh->Nlocal); return rdotr1; @@ -135,8 +136,11 @@ void combinedPCGReductions(elliptic_t *elliptic, } } - // batch into single, large all-reduce + // batch into single fused all-reduce MPI_Allreduce(MPI_IN_PLACE, reductions.data(), nRed, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + + platform->flopCounter->add(elliptic->name + " ellipticCombinedPCGReductions", + elliptic->Nfields * static_cast(mesh->Nlocal) * 3 * 7); } int standardPCG(elliptic_t *elliptic, @@ -231,7 +235,7 @@ int standardPCG(elliptic_t *elliptic, o_p, o_Ap, platform->comm.mpiComm); - alpha = rdotz1 / (pAp + 10 * std::numeric_limits::min()); + alpha = rdotz1 / (pAp + tiny); #ifdef DEBUG printf("alpha: %.15e\n", alpha); @@ -276,7 +280,6 @@ int combinedPCG(elliptic_t *elliptic, const int verbose = platform->options.compareArgs("VERBOSE", "TRUE"); const int preco = !options.compareArgs("PRECONDITIONER", "NONE"); - constexpr auto tiny = 10 * std::numeric_limits::min(); dfloat betakm1 = 0; dfloat betakm2 = 0; @@ -288,7 +291,6 @@ int combinedPCG(elliptic_t *elliptic, /*aux variables */ auto &o_Minv = (preco) ? precon->o_invDiagA : o_null; - auto &o_weight = elliptic->o_invDegree; platform->linAlg->fill(o_p.size(), 0.0, o_p); platform->linAlg->fill(o_v.size(), 0.0, o_v); @@ -322,6 +324,10 @@ int combinedPCG(elliptic_t *elliptic, o_x, o_r); + platform->flopCounter->add(elliptic->name + " ellipticCombinedPCGPreMatVecKernel", + elliptic->Nfields * static_cast(mesh->Nlocal) * 0.5*(11 + 5)); + + ellipticOperator(elliptic, o_p, o_v, dfloatString); combinedPCGReductions(elliptic, preco, o_Minv, o_v, o_p, o_r, reductions); @@ -367,7 +373,6 @@ int combinedPCG(elliptic_t *elliptic, printf("it %d r norm %.15e\n", iter, rdotr); } - // converged, update solution prior to exit if (rdotr <= tol) { const dlong singleVectorUpdate = iter % 2 == 1; @@ -420,36 +425,36 @@ int pcg(elliptic_t *elliptic, { setupAide &options = elliptic->options; - const auto Nlocal = (elliptic->Nfields > 1) ? 
- elliptic->Nfields * static_cast(elliptic->fieldOffset) : elliptic->mesh->Nlocal; + const auto Nlocal = (elliptic->Nfields > 1) ? elliptic->Nfields * static_cast(elliptic->fieldOffset) + : elliptic->mesh->Nlocal; - o_p = platform->o_memPool.reserve(Nlocal); - o_z = (elliptic->options.compareArgs("PRECONDITIONER", "NONE")) ? o_r : platform->o_memPool.reserve(Nlocal); - o_Ap = platform->o_memPool.reserve(Nlocal); + o_p = platform->deviceMemoryPool.reserve(Nlocal); + o_z = (elliptic->options.compareArgs("PRECONDITIONER", "NONE")) + ? o_r + : platform->deviceMemoryPool.reserve(Nlocal); + o_Ap = platform->deviceMemoryPool.reserve(Nlocal); if (elliptic->options.compareArgs("SOLVER", "PCG+COMBINED")) { - o_v = platform->o_memPool.reserve(Nlocal); + o_v = platform->deviceMemoryPool.reserve(Nlocal); } - o_tmpReductions = [&]() + const auto Nblock = [&]() { - int Nreductions = 1; - if (options.compareArgs("SOLVER", "PCG+COMBINED")) { - Nreductions = CombinedPCGId::nReduction; - } - auto mesh = elliptic->mesh; + auto mesh = elliptic->mesh; const dlong Nlocal = mesh->Np * mesh->Nelements; - const dlong Nblock = (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE; + return (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE; + }(); - if (h_tmpReductions.size() < Nreductions * Nblock) { - h_tmpReductions.free(); - h_tmpReductions = platform->device.mallocHost(Nreductions * Nblock); - } - return platform->o_memPool.reserve(Nreductions * Nblock); + auto Nreductions = [&]() { + int n = 1; + if (options.compareArgs("SOLVER", "PCG+COMBINED")) n = CombinedPCGId::nReduction; + return n; }(); - const auto Niter = [&]() - { + h_tmpReductions = platform->memoryPool.reserve(Nreductions * Nblock); + o_tmpReductions = platform->deviceMemoryPool.reserve(h_tmpReductions.size()); + + const auto Niter = [&]() { if (elliptic->options.compareArgs("SOLVER", "PCG+COMBINED")) { return combinedPCG(elliptic, tol, MAXIT, rdotr, o_r, o_x); } else { @@ -465,7 +470,9 @@ int pcg(elliptic_t *elliptic, if (elliptic->options.compareArgs("SOLVER", "PCG+COMBINED")) { o_v.free(); } + o_tmpReductions.free(); + h_tmpReductions.free(); return Niter; } diff --git a/src/elliptic/linearSolver/PGMRES.cpp b/src/elliptic/linearSolver/PGMRES.cpp index b0e190d8a..06b9a51da 100644 --- a/src/elliptic/linearSolver/PGMRES.cpp +++ b/src/elliptic/linearSolver/PGMRES.cpp @@ -310,12 +310,14 @@ int pgmres(elliptic_t *elliptic, { const auto Nlocal = elliptic->Nfields * static_cast(elliptic->fieldOffset); - elliptic->gmresData->o_p = platform->o_memPool.reserve(Nlocal); - elliptic->gmresData->o_z = platform->o_memPool.reserve(Nlocal); - elliptic->gmresData->o_Ap = platform->o_memPool.reserve(Nlocal); - - elliptic->gmresData->o_V = platform->o_memPool.reserve(Nlocal * elliptic->gmresData->nRestartVectors); - elliptic->gmresData->o_Z = platform->o_memPool.reserve(Nlocal * ((elliptic->gmresData->flexible) ? elliptic->gmresData->nRestartVectors : 1)); + elliptic->gmresData->o_p = platform->deviceMemoryPool.reserve(Nlocal); + elliptic->gmresData->o_z = platform->deviceMemoryPool.reserve(Nlocal); + elliptic->gmresData->o_Ap = platform->deviceMemoryPool.reserve(Nlocal); + + elliptic->gmresData->o_V = + platform->deviceMemoryPool.reserve(Nlocal * elliptic->gmresData->nRestartVectors); + elliptic->gmresData->o_Z = platform->deviceMemoryPool.reserve( + Nlocal * ((elliptic->gmresData->flexible) ? 
elliptic->gmresData->nRestartVectors : 1)); const int Niter = _pgmres(elliptic, tol, MAXIT, rdotr, o_r, o_x); @@ -325,5 +327,5 @@ int pgmres(elliptic_t *elliptic, elliptic->gmresData->o_V.free(); elliptic->gmresData->o_Z.free(); - return Niter; + return Niter; } diff --git a/src/elliptic/kernels/combinedPCGPostMatVec.c b/src/elliptic/linearSolver/kernels/combinedPCGPostMatVec.c similarity index 100% rename from src/elliptic/kernels/combinedPCGPostMatVec.c rename to src/elliptic/linearSolver/kernels/combinedPCGPostMatVec.c diff --git a/src/elliptic/kernels/combinedPCGPostMatVec.okl b/src/elliptic/linearSolver/kernels/combinedPCGPostMatVec.okl similarity index 100% rename from src/elliptic/kernels/combinedPCGPostMatVec.okl rename to src/elliptic/linearSolver/kernels/combinedPCGPostMatVec.okl diff --git a/src/elliptic/kernels/combinedPCGPreMatVec.c b/src/elliptic/linearSolver/kernels/combinedPCGPreMatVec.c similarity index 96% rename from src/elliptic/kernels/combinedPCGPreMatVec.c rename to src/elliptic/linearSolver/kernels/combinedPCGPreMatVec.c index f8251f752..38ef48e02 100644 --- a/src/elliptic/kernels/combinedPCGPreMatVec.c +++ b/src/elliptic/linearSolver/kernels/combinedPCGPreMatVec.c @@ -22,7 +22,7 @@ extern "C" void FUNC(combinedPCGPreMatVec)(const dlong &N, const dfloat pkm1 = p[id]; const dfloat rkm1 = r[id]; const dfloat vkm1 = v[id]; - x[id] = x[id] + alphakm1 * pkm1 + alphaDivBetakm2 * (pkm1 - M * rkm1); + x[id] += alphakm1 * pkm1 + alphaDivBetakm2 * (pkm1 - M * rkm1); const dfloat rk = rkm1 - alphakm1 * vkm1; r[id] = rk; p[id] = M * rk + betakm1 * pkm1; diff --git a/src/elliptic/kernels/combinedPCGPreMatVec.okl b/src/elliptic/linearSolver/kernels/combinedPCGPreMatVec.okl similarity index 100% rename from src/elliptic/kernels/combinedPCGPreMatVec.okl rename to src/elliptic/linearSolver/kernels/combinedPCGPreMatVec.okl diff --git a/src/elliptic/kernels/combinedPCGUpdateConvergedSolution.c b/src/elliptic/linearSolver/kernels/combinedPCGUpdateConvergedSolution.c similarity index 92% rename from src/elliptic/kernels/combinedPCGUpdateConvergedSolution.c rename to src/elliptic/linearSolver/kernels/combinedPCGUpdateConvergedSolution.c index 120b2dc29..1ab68953e 100644 --- a/src/elliptic/kernels/combinedPCGUpdateConvergedSolution.c +++ b/src/elliptic/linearSolver/kernels/combinedPCGUpdateConvergedSolution.c @@ -18,7 +18,7 @@ FUNC(combinedPCGUpdateConvergedSolution)(const dlong &N, for (dlong n = 0; n < N; ++n) { const dlong id = n + fld * fieldOffset; const dfloat pk = p[id]; - x[id] = x[id] + alphak * pk; + x[id] += alphak * pk; } } } else { @@ -28,7 +28,7 @@ FUNC(combinedPCGUpdateConvergedSolution)(const dlong &N, const dfloat pk = p[id]; const dfloat rk = r[id]; const dfloat M = preco ? 
Minv[id] : 1.0; - x[id] = x[id] + alphak * pk + alphaDivBetakm1 * (pk - M * rk); + x[id] += alphak * pk + alphaDivBetakm1 * (pk - M * rk); } } } diff --git a/src/elliptic/kernels/combinedPCGUpdateConvergedSolution.okl b/src/elliptic/linearSolver/kernels/combinedPCGUpdateConvergedSolution.okl similarity index 100% rename from src/elliptic/kernels/combinedPCGUpdateConvergedSolution.okl rename to src/elliptic/linearSolver/kernels/combinedPCGUpdateConvergedSolution.okl diff --git a/src/elliptic/kernels/ellipticBlockUpdatePCG.c b/src/elliptic/linearSolver/kernels/ellipticBlockUpdatePCG.c similarity index 100% rename from src/elliptic/kernels/ellipticBlockUpdatePCG.c rename to src/elliptic/linearSolver/kernels/ellipticBlockUpdatePCG.c diff --git a/src/elliptic/kernels/ellipticBlockUpdatePCG.okl b/src/elliptic/linearSolver/kernels/ellipticBlockUpdatePCG.okl similarity index 100% rename from src/elliptic/kernels/ellipticBlockUpdatePCG.okl rename to src/elliptic/linearSolver/kernels/ellipticBlockUpdatePCG.okl diff --git a/src/elliptic/kernels/fusedResidualAndNorm.c b/src/elliptic/linearSolver/kernels/fusedResidualAndNorm.c similarity index 100% rename from src/elliptic/kernels/fusedResidualAndNorm.c rename to src/elliptic/linearSolver/kernels/fusedResidualAndNorm.c diff --git a/src/elliptic/kernels/fusedResidualAndNorm.okl b/src/elliptic/linearSolver/kernels/fusedResidualAndNorm.okl similarity index 100% rename from src/elliptic/kernels/fusedResidualAndNorm.okl rename to src/elliptic/linearSolver/kernels/fusedResidualAndNorm.okl diff --git a/src/elliptic/kernels/gramSchmidtOrthogonalization.c b/src/elliptic/linearSolver/kernels/gramSchmidtOrthogonalization.c similarity index 100% rename from src/elliptic/kernels/gramSchmidtOrthogonalization.c rename to src/elliptic/linearSolver/kernels/gramSchmidtOrthogonalization.c diff --git a/src/elliptic/kernels/gramSchmidtOrthogonalization.okl b/src/elliptic/linearSolver/kernels/gramSchmidtOrthogonalization.okl similarity index 100% rename from src/elliptic/kernels/gramSchmidtOrthogonalization.okl rename to src/elliptic/linearSolver/kernels/gramSchmidtOrthogonalization.okl diff --git a/src/elliptic/kernels/updatePGMRESSolution.c b/src/elliptic/linearSolver/kernels/updatePGMRESSolution.c similarity index 100% rename from src/elliptic/kernels/updatePGMRESSolution.c rename to src/elliptic/linearSolver/kernels/updatePGMRESSolution.c diff --git a/src/elliptic/kernels/updatePGMRESSolution.okl b/src/elliptic/linearSolver/kernels/updatePGMRESSolution.okl similarity index 100% rename from src/elliptic/kernels/updatePGMRESSolution.okl rename to src/elliptic/linearSolver/kernels/updatePGMRESSolution.okl diff --git a/src/elliptic/registerEllipticKernels.cpp b/src/elliptic/registerEllipticKernels.cpp index 052cc5bc5..f951beb72 100644 --- a/src/elliptic/registerEllipticKernels.cpp +++ b/src/elliptic/registerEllipticKernels.cpp @@ -7,7 +7,7 @@ namespace { void registerGMRESKernels(const std::string §ion, int Nfields) { - const std::string oklpath = getenv("NEKRS_KERNEL_DIR") + std::string("/elliptic/"); + const std::string oklpath = getenv("NEKRS_KERNEL_DIR") + std::string("/elliptic/linearSolver/"); std::string fileName; const bool serial = platform->serial; @@ -32,7 +32,7 @@ void registerGMRESKernels(const std::string §ion, int Nfields) void registerCombinedPCGKernels(const std::string §ion, int Nfields) { - const std::string oklpath = getenv("NEKRS_KERNEL_DIR") + std::string("/elliptic/"); + const std::string oklpath = getenv("NEKRS_KERNEL_DIR") + 
std::string("/elliptic/linearSolver/"); std::string fileName; const bool serial = platform->serial; @@ -125,18 +125,19 @@ void registerEllipticKernels(std::string section, int poissonEquation) occa::properties properties = platform->kernelInfo; kernelName = "fusedCopyDfloatToPfloat"; - fileName = oklpath + kernelName + extension; + fileName = oklpath + kernelName + fileNameExtension; platform->kernelRequests.add(kernelName, fileName, properties); properties["defines/p_Nfields"] = Nfields; kernelName = "ellipticBlockUpdatePCG"; - fileName = oklpath + "ellipticBlockUpdatePCG" + extension; + fileName = oklpath + "/linearSolver/" + "ellipticBlockUpdatePCG" + fileNameExtension; platform->kernelRequests.add(sectionIdentifier + kernelName, fileName, properties); kernelName = "multiScaledAddwOffset"; fileName = oklpath + kernelName + extension; platform->kernelRequests.add(sectionIdentifier + kernelName, fileName, properties); + kernelName = "accumulate"; fileName = oklpath + kernelName + extension; platform->kernelRequests.add(sectionIdentifier + kernelName, fileName, properties); diff --git a/src/mesh/mesh.h b/src/mesh/mesh.h index ea54ca80d..5738f24f3 100644 --- a/src/mesh/mesh.h +++ b/src/mesh/mesh.h @@ -56,7 +56,7 @@ struct mesh_t { std::vector minDistance(const std::vector &bID, std::string type, int maxIter = 10000); occa::memory intpMatrix(std::vector M); - void interpolate(const occa::memory& o_z, mesh_t *meshC, occa::memory& o_zC, bool uniform = false); + void interpolate(const occa::memory& o_z, mesh_t *meshC, occa::memory& o_zC, bool uniform = false, dlong nel = 0); void move(); void update(); diff --git a/src/mesh/meshDistance.cpp b/src/mesh/meshDistance.cpp index 3d155c43e..11db664c6 100644 --- a/src/mesh/meshDistance.cpp +++ b/src/mesh/meshDistance.cpp @@ -87,7 +87,7 @@ cheapDist(mesh_t *mesh, int nbID, const occa::memory &o_bID, dlong offsetFld, bo o_dist); } - auto o_changed = platform->o_memPool.reserve(mesh->Nlocal); + auto o_changed = platform->deviceMemoryPool.reserve(mesh->Nlocal); for (int iter = 0; iter < maxIter; ++iter) { mesh->distanceKernel(mesh->Nelements, @@ -162,7 +162,7 @@ occa::memory mesh_t::minDistance(int nbID, const occa::memory &o_bID, std::strin std::vector mesh_t::distance(const std::vector &bID, dlong offsetFld, std::string type, int maxIter) { - auto o_bid = platform->o_memPool.reserve(bID.size()); + auto o_bid = platform->deviceMemoryPool.reserve(bID.size()); o_bid.copyFrom(bID.data()); auto o_dist = this->distance(bID.size(), o_bid, offsetFld, type, maxIter); @@ -178,7 +178,7 @@ mesh_t::distance(const std::vector &bID, dlong offsetFld, std::string typ std::vector mesh_t::minDistance(const std::vector &bID, std::string type, int maxIter) { - auto o_bid = platform->o_memPool.reserve(bID.size()); + auto o_bid = platform->deviceMemoryPool.reserve(bID.size()); o_bid.copyFrom(bID.data()); auto o_dist = this->minDistance(bID.size(), o_bid, type, maxIter); diff --git a/src/mesh/meshGeometricFactorsHex3D.cpp b/src/mesh/meshGeometricFactorsHex3D.cpp index 01ba5de80..7ecb5f0fa 100644 --- a/src/mesh/meshGeometricFactorsHex3D.cpp +++ b/src/mesh/meshGeometricFactorsHex3D.cpp @@ -32,7 +32,7 @@ void mesh_t::geometricFactors() { - auto o_J = platform->o_memPool.reserve(Nlocal * sizeof(dfloat)); + auto o_J = platform->deviceMemoryPool.reserve(Nlocal * sizeof(dfloat)); geometricFactorsKernel(Nelements, o_D, o_gllw, o_x, o_y, o_z, o_LMM, o_vgeo, o_ggeo, o_J); diff --git a/src/mesh/meshIntp.cpp b/src/mesh/meshIntp.cpp index 046a6445a..b310719ef 100644 --- 
a/src/mesh/meshIntp.cpp +++ b/src/mesh/meshIntp.cpp @@ -32,8 +32,10 @@ occa::memory mesh_t::intpMatrix(std::vector M) return o_J[M.size() - 1]; } -void mesh_t::interpolate(const occa::memory& o_z, mesh_t *mesh, occa::memory& o_zM, bool uniform) +void mesh_t::interpolate(const occa::memory& o_z, mesh_t *mesh, occa::memory& o_zM, bool uniform, dlong nel_) { + auto nel = (nel_ > 0) ? nel_ : this->Nelements; + if (uniform) { auto M = [&]() { @@ -46,13 +48,12 @@ void mesh_t::interpolate(const occa::memory& o_z, mesh_t *mesh, occa::memory& o_ return r; }(); - this->intpKernel[mesh->N](this->Nelements, intpMatrix(M), o_z, o_zM); + this->intpKernel[mesh->N](nel, intpMatrix(M), o_z, o_zM); return; } std::vector M(mesh->Nq); for(int i = 0; i < M.size(); i++) M[i] = mesh->r[i]; - const dlong nel = std::min(this->Nelements, mesh->Nelements); this->intpKernel[mesh->N](nel, intpMatrix(M), o_z, o_zM); } diff --git a/src/mesh/meshSurface.cpp b/src/mesh/meshSurface.cpp index 323f244da..0d759892a 100644 --- a/src/mesh/meshSurface.cpp +++ b/src/mesh/meshSurface.cpp @@ -111,7 +111,7 @@ std::vector mesh_t::surfaceAreaNormalMultiplyIntegrate(dlong fieldOffset occa::memory mesh_t::surfaceAreaMultiply(int nbID, const occa::memory &o_bID, const occa::memory &o_fld) { - auto o_out = platform->o_memPool.reserve(this->Nlocal); + auto o_out = platform->deviceMemoryPool.reserve(this->Nlocal); static occa::kernel kernel; if (!kernel.isInitialized()) { diff --git a/src/mesh/planarAvg.cpp b/src/mesh/planarAvg.cpp index a65aef1f4..1129a9531 100644 --- a/src/mesh/planarAvg.cpp +++ b/src/mesh/planarAvg.cpp @@ -126,7 +126,7 @@ void fusedPlanarAvg(mesh_t *mesh, } const auto Nlocal = nflds * mesh->Nq * elemDir; - auto o_scratch = platform->o_memPool.reserve(Nlocal); + auto o_scratch = platform->deviceMemoryPool.reserve(Nlocal); if (o_locToGlobE.byte_size() == 0) { std::vector globalElement(mesh->Nelements, 0); @@ -158,7 +158,7 @@ void fusedPlanarAvg(mesh_t *mesh, o_avg, o_scratch); - platform->comm.allreduce(o_scratch, Nlocal, comm_t::type::dfloat, comm_t::op::sum, platform->comm.mpiComm); + platform->comm.allreduce(o_scratch, Nlocal, comm_t::op::sum, platform->comm.mpiComm); scatterPlanarValuesKernel(mesh->Nelements, nflds, diff --git a/src/nekInterface/nekInterfaceAdapter.cpp b/src/nekInterface/nekInterfaceAdapter.cpp index 1abb710d5..7a9291fbd 100644 --- a/src/nekInterface/nekInterfaceAdapter.cpp +++ b/src/nekInterface/nekInterfaceAdapter.cpp @@ -19,9 +19,32 @@ static void (*userbc_ptr)(void); static void (*useric_ptr)(void); static void (*userqtl_ptr)(void); static void (*usrsetvert_ptr)(void); -static void (*nek_outfld_ptr)(char *, double*, int*, int *, int *, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int); +static void (*nek_outfld_ptr)(char *, + double *, + int *, + int *, + int *, + double *, + double *, + double *, + double *, + double *, + double *, + double *, + double *, + double *, + int *, + int); static void (*nek_openfld_ptr)(char *, double *, double *, int); -static void (*nek_readfld_ptr)(double *, double *, double *, double *, double *, double *, double *, double *, double *); +static void (*nek_readfld_ptr)(double *, + double *, + double *, + double *, + double *, + double *, + double *, + double *, + double *); static void (*nek_uic_ptr)(int *); static void (*nek_end_ptr)(void); static void (*nek_restart_ptr)(char *, int *); @@ -59,7 +82,7 @@ static void (*nek_meshmetrics_ptr)(void); static int (*nek_gllnid_ptr)(int *); static int (*nek_gllel_ptr)(int *); -static 
std::map ptrListData; +static std::map ptrListData; void noop_func(void) {} @@ -103,22 +126,32 @@ long long int localElementIdToGlobal(int _id) return static_cast(gid - 1); } -fldData openFld(const std::string& filename, std::vector& _availableVariables) +fldData openFld(const std::string &filename, std::vector &_availableVariables) { - auto fname = const_cast(filename.c_str()); + auto fname = const_cast(filename.c_str()); double time_; double p0th_; - (*nek_openfld_ptr) (fname, &time_, &p0th_, static_cast(filename.size())); + (*nek_openfld_ptr)(fname, &time_, &p0th_, static_cast(filename.size())); - if (*ptr("getxr")) _availableVariables.push_back("mesh"); - if (*ptr("getur")) _availableVariables.push_back("velocity"); - if (*ptr("getpr")) _availableVariables.push_back("pressure"); - if (*ptr("gettr")) _availableVariables.push_back("temperature"); + if (*ptr("getxr")) { + _availableVariables.push_back("mesh"); + } + if (*ptr("getur")) { + _availableVariables.push_back("velocity"); + } + if (*ptr("getpr")) { + _availableVariables.push_back("pressure"); + } + if (*ptr("gettr")) { + _availableVariables.push_back("temperature"); + } const auto nsr = *ptr("npsr"); - for (int i = 0; i < nsr; i++) _availableVariables.push_back("scalar" + scalarDigitStr(i)); + for (int i = 0; i < nsr; i++) { + _availableVariables.push_back("scalar" + scalarDigitStr(i)); + } fldData data; data.time = time_; @@ -127,7 +160,7 @@ fldData openFld(const std::string& filename, std::vector& _availabl return data; } -void readFld(fldData& data) +void readFld(fldData &data) { const auto nxyz = nekData.nx1 * nekData.nx1 * nekData.nx1; const auto Nlocal = nekData.nelt * nxyz; @@ -135,50 +168,50 @@ void readFld(fldData& data) occa::memory xm, ym, zm; if (*ptr("getxr")) { - xm = platform->memPool.reserve(Nlocal); - ym = platform->memPool.reserve(Nlocal); - zm = platform->memPool.reserve(Nlocal); + xm = platform->memoryPool.reserve(Nlocal); + ym = platform->memoryPool.reserve(Nlocal); + zm = platform->memoryPool.reserve(Nlocal); } occa::memory vx, vy, vz; if (*ptr("getur")) { - vx = platform->memPool.reserve(Nlocal); - vy = platform->memPool.reserve(Nlocal); - vz = platform->memPool.reserve(Nlocal); + vx = platform->memoryPool.reserve(Nlocal); + vy = platform->memoryPool.reserve(Nlocal); + vz = platform->memoryPool.reserve(Nlocal); } occa::memory pr; if (*ptr("getpr")) { - pr = platform->memPool.reserve(Nlocal); + pr = platform->memoryPool.reserve(Nlocal); } occa::memory t; if (*ptr("gettr")) { - t = platform->memPool.reserve(Nlocal); + t = platform->memoryPool.reserve(Nlocal); } occa::memory s; const auto nsr = *ptr("npsr"); if (nsr) { - s = platform->memPool.reserve(nekFieldOffset * nsr); + s = platform->memoryPool.reserve(nekFieldOffset * nsr); } - (*nek_readfld_ptr) - ( - static_cast(xm.ptr()), static_cast(ym.ptr()), static_cast(zm.ptr()), - static_cast(vx.ptr()), static_cast(vy.ptr()), static_cast(vz.ptr()), - static_cast(pr.ptr()), - static_cast(t.ptr()), - static_cast(s.ptr()) - ); - - auto populate = [&](const std::vector& fields, std::vector& o_u) - { + (*nek_readfld_ptr)(static_cast(xm.ptr()), + static_cast(ym.ptr()), + static_cast(zm.ptr()), + static_cast(vx.ptr()), + static_cast(vy.ptr()), + static_cast(vz.ptr()), + static_cast(pr.ptr()), + static_cast(t.ptr()), + static_cast(s.ptr())); + + auto populate = [&](const std::vector &fields, std::vector &o_u) { auto o_tmpDouble = platform->device.malloc(Nlocal); o_u.resize(fields.size()); for (int i = 0; i < fields.size(); i++) { o_tmpDouble.copyFrom(fields[i]); - o_u[i] 
= platform->o_memPool.reserve(Nlocal); + o_u[i] = platform->deviceMemoryPool.reserve(Nlocal); platform->copyDoubleToDfloatKernel(Nlocal, o_tmpDouble, o_u[i]); } }; @@ -203,42 +236,43 @@ void readFld(fldData& data) populate(fields, data.o_t); } - for(int i = 0; i < nsr; i++) { - std::vector fields = {s.slice(i*nekFieldOffset, Nlocal)}; + for (int i = 0; i < nsr; i++) { + std::vector fields = {s.slice(i * nekFieldOffset, Nlocal)}; std::vector o_Si; data.o_s.push_back(o_Si); populate(fields, data.o_s.at(i)); } } -void writeFld(const std::string& filename, - const fldData& data, +void writeFld(const std::string &filename, + const fldData &data, bool FP64, - const std::vector& elementMask, - int Nout, + const std::vector &elementMask, + int Nout, bool uniform) { int step = 0; const auto nxyz = nekData.nx1 * nekData.nx1 * nekData.nx1; const auto Nlocal = nekData.nelt * nxyz; - const auto& time = data.time; - const auto& p0th = data.p0th; + const auto &time = data.time; + const auto &p0th = data.p0th; - const auto& o_x = data.o_x; - const auto& o_u = data.o_u; - const auto& o_p = data.o_p; - const auto& o_t = data.o_t; - const auto& o_s = data.o_s; + const auto &o_x = data.o_x; + const auto &o_u = data.o_u; + const auto &o_p = data.o_p; + const auto &o_t = data.o_t; + const auto &o_s = data.o_s; - auto copyField = [&](occa::memory o_fldIn, double* fldOut, std::string tag) - { + auto copyField = [&](occa::memory o_fldIn, double *fldOut, std::string tag) { nekrsCheck(o_fldIn.size() < Nlocal, platform->comm.mpiComm, EXIT_FAILURE, "%s%s%s\n", - "outfld: ",tag.c_str()," is too short on T-mesh!"); - auto o_tmpDouble = platform->o_memPool.reserve(Nlocal); + "outfld: ", + tag.c_str(), + " is too short on T-mesh!"); + auto o_tmpDouble = platform->deviceMemoryPool.reserve(Nlocal); if (o_fldIn.dtype() == occa::dtype::get()) { platform->copyDfloatToDoubleKernel(Nlocal, o_fldIn, o_tmpDouble); o_tmpDouble.copyTo(fldOut, Nlocal); @@ -247,7 +281,7 @@ void writeFld(const std::string& filename, } }; - std::vector xm; + std::vector xm; std::vector ym; std::vector zm; if (o_x.size()) { @@ -305,8 +339,8 @@ void writeFld(const std::string& filename, const auto nekFieldOffset = static_cast(nekData.lelt) * nxyz; ps.resize(o_s.size() * nekFieldOffset, 0); for (int is = 0; is < o_s.size(); is++) { - auto& o_Si = o_s[is][0]; - copyField(o_Si, ps.data() + nps*nekFieldOffset, "o_S[" + scalarDigitStr(is) + "]"); + auto &o_Si = o_s[is][0]; + copyField(o_Si, ps.data() + nps * nekFieldOffset, "o_S[" + scalarDigitStr(is) + "]"); nps++; } } @@ -319,52 +353,58 @@ void writeFld(const std::string& filename, *(nekData.p0th) = p0th; std::vector fldWriteFlag; - fldWriteFlag.push_back(xm.size() ? 1 : 0); + fldWriteFlag.push_back(xm.size() ? 1 : 0); fldWriteFlag.push_back(vx.size() ? 1 : 0); fldWriteFlag.push_back(pr.size() ? 1 : 0); fldWriteFlag.push_back(temp.size() ? 1 : 0); - for (int is = 0; is < nps; is++) fldWriteFlag.push_back(1); + for (int is = 0; is < nps; is++) { + fldWriteFlag.push_back(1); + } - auto& p63 = nekData.param[62]; + auto &p63 = nekData.param[62]; const auto p63_s = p63; p63 = (FP64) ? 1 : 0; int nxo = Nout + 1; int ifreg = uniform; auto nek_out_mask = ptr("out_mask"); - for(int i = 0; i < nekData.lelt; i++) nek_out_mask[i] = 1; + for (int i = 0; i < nekData.lelt; i++) { + nek_out_mask[i] = 1; + } // filter elements int filterEnabled = elementMask.size() ? 
1 : 0; MPI_Allreduce(MPI_IN_PLACE, &filterEnabled, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); - if(filterEnabled) { - for(int i = 0; i < nekData.lelt; i++) nek_out_mask[i] = 0; - for(auto& entry : elementMask) { + if (filterEnabled) { + for (int i = 0; i < nekData.lelt; i++) { + nek_out_mask[i] = 0; + } + for (auto &entry : elementMask) { nek_out_mask[entry] = 1; } } - (*nek_outfld_ptr)(const_cast(filename.c_str()), - const_cast(&time), - fldWriteFlag.data(), + (*nek_outfld_ptr)(const_cast(filename.c_str()), + const_cast(&time), + fldWriteFlag.data(), &nxo, - &ifreg, - xm.data(), - ym.data(), - zm.data(), - vx.data(), - vy.data(), - vz.data(), - pr.data(), - temp.data(), - ps.data(), - &nps, + &ifreg, + xm.data(), + ym.data(), + zm.data(), + vx.data(), + vy.data(), + vz.data(), + pr.data(), + temp.data(), + ps.data(), + &nps, filename.size()); // filter reset - for(int i = 0; i < nekData.lelt; i++) { + for (int i = 0; i < nekData.lelt; i++) { nek_out_mask[i] = 1; - } + } *(nekData.p0th) = p0th_s; *(nekData.istep) = step_s; @@ -374,6 +414,9 @@ void writeFld(const std::string& filename, void getIC(int ifield) { + if (!useric_ptr) { + return; + } (*nek_uic_ptr)(&ifield); } @@ -388,12 +431,16 @@ void restartFromFile(const std::string &str_in) str.erase(std::remove_if(str.begin(), str.end(), ::isspace), str.end()); auto pos = str.find('+'); - if (pos == std::string::npos) pos = str.length(); + if (pos == std::string::npos) { + pos = str.length(); + } auto fileName = str.substr(0, pos); std::string options; - if (pos != std::string::npos) options = str.substr(pos); + if (pos != std::string::npos) { + options = str.substr(pos); + } upperCase(options); std::replace_copy(options.begin(), options.end(), options.begin(), '+', ' '); @@ -424,7 +471,7 @@ void xm1N(dfloat *_x, dfloat *_y, dfloat *_z, int N, dlong Nelements) _y[i] = nekData.ym1[i]; _z[i] = nekData.zm1[i]; } - return; + return; } std::vector x(Np); @@ -455,6 +502,10 @@ void setics(void) void userchk(void) { + if (!userchk_ptr) { + return; + } + if (rank == 0) { printf("calling nek_userchk ...\n"); } @@ -492,7 +543,7 @@ void set_usr_handles(const char *session_in, int verbose) // check if we need to append an underscore auto us = [handle] { - auto fptr = (void (*)(void)) dlsym(handle, "usrdat_"); + auto fptr = (void (*)(void))dlsym(handle, "usrdat_"); if (handle) { return "_"; } else { @@ -516,19 +567,48 @@ void set_usr_handles(const char *session_in, int verbose) nek_bootstrap_ptr = (void (*)(int *, char *, char *, char *, int, int, int))dlsym(handle, fname("nekf_bootstrap")); check_error(dlerror()); - nek_setup_ptr = - (void (*)(int *, int *, int *, int *, int *, int *, int *, int *, int *, double *, double *, double *, double *, double *, int *)) - dlsym(handle, fname("nekf_setup")); + nek_setup_ptr = (void (*)(int *, + int *, + int *, + int *, + int *, + int *, + int *, + int *, + int *, + double *, + double *, + double *, + double *, + double *, + int *))dlsym(handle, fname("nekf_setup")); check_error(dlerror()); nek_uic_ptr = (void (*)(int *))dlsym(handle, fname("nekf_uic")); check_error(dlerror()); nek_end_ptr = (void (*)(void))dlsym(handle, fname("nekf_end")); check_error(dlerror()); - nek_outfld_ptr = (void (*)(char *, double *, int *, int *, int *, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int))dlsym(handle, fname("nekf_outfld")); + nek_outfld_ptr = (void (*)(char *, + double *, + int *, + int *, + int *, + double *, + double *, + double *, + double *, + double *, + double *, + 
double *, + double *, + double *, + int *, + int))dlsym(handle, fname("nekf_outfld")); check_error(dlerror()); nek_openfld_ptr = (void (*)(char *, double *, double *, int))dlsym(handle, fname("nekf_openfld")); check_error(dlerror()); - nek_readfld_ptr = (void (*)(double *, double *, double *, double *, double *, double *, double *, double *, double *))dlsym(handle, fname("nekf_readfld")); + nek_readfld_ptr = + (void (*)(double *, double *, double *, double *, double *, double *, double *, double *, double *)) + dlsym(handle, fname("nekf_readfld")); check_error(dlerror()); nek_restart_ptr = (void (*)(char *, int *))dlsym(handle, fname("nekf_restart")); @@ -566,19 +646,16 @@ void set_usr_handles(const char *session_in, int verbose) nek_gllel_ptr = (int (*)(int *))dlsym(handle, fname("gllel")); check_error(dlerror()); - #define postfix(x) x##_ptr #define load_or_noop(s) \ -do { \ -postfix(s) = (void (*)(void))dlsym(handle, fname(#s)); \ -if (!(postfix(s))) { \ -postfix(s) = noop_func; \ -if (verbose) \ -printf("Setting function " #s " to noop_func.\n"); \ -} else if (verbose && rank == 0) { \ -printf("Loading " #s "\n"); \ -} \ -} while (0) + do { \ + postfix(s) = (void (*)(void))dlsym(handle, fname(#s)); \ + if (!(postfix(s))) { \ + postfix(s) = noop_func; \ + } else if (verbose && rank == 0) { \ + printf("Loading " #s "\n"); \ + } \ + } while (0) load_or_noop(uservp); load_or_noop(userf); @@ -613,7 +690,7 @@ void mkSIZE(int lx1, const int verbose = options.compareArgs("VERBOSE", "TRUE") ? 1 : 0; // Read and generate the new size file. - sprintf(line, "%s/core/SIZE.template", nek5000_dir.c_str()); + snprintf(line, lineSize, "%s/core/SIZE.template", nek5000_dir.c_str()); FILE *fp = fopen(line, "r"); nekrsCheck(!fp, MPI_COMM_SELF, EXIT_FAILURE, "Cannot open %s!\n", line); @@ -632,37 +709,37 @@ void mkSIZE(int lx1, int count = 0; while (fgets(line, lineSize, fp) != NULL) { if (strstr(line, "parameter (lx1=") != NULL) { - sprintf(line, " parameter (lx1=%d)\n", lx1); + snprintf(line, lineSize, " parameter (lx1=%d)\n", lx1); } else if (strstr(line, "parameter (lxd=") != NULL) { - sprintf(line, " parameter (lxd=%d)\n", lxd); + snprintf(line, lineSize, " parameter (lxd=%d)\n", lxd); } else if (strstr(line, "parameter (lelt=") != NULL) { - sprintf(line, " parameter (lelt=%d)\n", lelt); + snprintf(line, lineSize, " parameter (lelt=%d)\n", lelt); } else if (strstr(line, "parameter (lelg=") != NULL) { - sprintf(line, " parameter (lelg=%d)\n", lelg); + snprintf(line, lineSize, " parameter (lelg=%d)\n", lelg); } else if (strstr(line, "parameter (ldim=") != NULL) { - sprintf(line, " parameter (ldim=%d)\n", ldim); + snprintf(line, lineSize, " parameter (ldim=%d)\n", ldim); } else if (strstr(line, "parameter (lpmin=") != NULL) { - sprintf(line, " parameter (lpmin=%d)\n", lpmin); + snprintf(line, lineSize, " parameter (lpmin=%d)\n", lpmin); } else if (strstr(line, "parameter (ldimt=") != NULL) { - sprintf(line, " parameter (ldimt=%d)\n", ldimt); + snprintf(line, lineSize, " parameter (ldimt=%d)\n", ldimt); } else if (strstr(line, "parameter (mxprev=") != NULL) { - sprintf(line, " parameter (mxprev=%d)\n", 1); + snprintf(line, lineSize, " parameter (mxprev=%d)\n", 1); } else if (strstr(line, "parameter (lgmres=") != NULL) { - sprintf(line, " parameter (lgmres=%d)\n", 1); + snprintf(line, lineSize, " parameter (lgmres=%d)\n", 1); } else if (strstr(line, "parameter (lxo=") != NULL) { - sprintf(line, " parameter (lxo=%d)\n", lx1+4); + snprintf(line, lineSize, " parameter (lxo=%d)\n", lx1 + 4); } else if 
(strstr(line, "parameter (lorder=") != NULL) { - sprintf(line, " parameter (lorder=%d)\n", 1); + snprintf(line, lineSize, " parameter (lorder=%d)\n", 1); } else if (strstr(line, "parameter (lhis=") != NULL) { - sprintf(line, " parameter (lhis=%d)\n", 1); + snprintf(line, lineSize, " parameter (lhis=%d)\n", 1); } else if (strstr(line, "parameter (lelr=") != NULL) { - sprintf(line, " parameter (lelr=%d)\n", std::min(128 * lelt, lelg)); + snprintf(line, lineSize, " parameter (lelr=%d)\n", std::min(128 * lelt, lelg)); } else if (strstr(line, "parameter (lx1m=") != NULL) { - sprintf(line, " parameter (lx1m=%d)\n", lx1m); + snprintf(line, lineSize, " parameter (lx1m=%d)\n", lx1m); } else if (strstr(line, "parameter (nsessmax=") != NULL) { - sprintf(line, " parameter (nsessmax=%d)\n", nsessmax); + snprintf(line, lineSize, " parameter (nsessmax=%d)\n", nsessmax); } else if (strstr(line, "parameter (maxobj=") != NULL) { - sprintf(line, " parameter (maxobj=%d)\n", nMaxObj); + snprintf(line, lineSize, " parameter (maxobj=%d)\n", nMaxObj); } strcpy(sizeFile + count, line); @@ -819,26 +896,28 @@ void buildNekInterface(int ldimt, int N, int np, setupAide &options) out_args = ""; } - char buf[4096]; - sprintf(buf, - "cd %s" - " && cp -f %s/makefile.template makefile" - " && make %s" - "S=%s " - "OPT_INCDIR=\"%s\" " - "CASENAME=%s " - "CASEDIR=%s " - "-f %s/Makefile lib usr libnekInterface " - "%s", - cache_dir.c_str(), - nek5000_dir.c_str(), - make_args.c_str(), - nek5000_dir.c_str(), - include_dirs.c_str(), - casename.c_str(), - cache_dir.c_str(), - nekInterface_dir.c_str(), - out_args.c_str()); + const int bufSize = 4096; + char buf[bufSize]; + snprintf(buf, + bufSize, + "cd %s" + " && cp -f %s/makefile.template makefile" + " && make %s" + "S=%s " + "OPT_INCDIR=\"%s\" " + "CASENAME=%s " + "CASEDIR=%s " + "-f %s/Makefile lib usr libnekInterface " + "%s", + cache_dir.c_str(), + nek5000_dir.c_str(), + make_args.c_str(), + nek5000_dir.c_str(), + include_dirs.c_str(), + casename.c_str(), + cache_dir.c_str(), + nekInterface_dir.c_str(), + out_args.c_str()); if (verbose && rank == 0) { printf("\n%s\n", buf); @@ -866,7 +945,10 @@ void buildNekInterface(int ldimt, int N, int np, setupAide &options) } // buildRank if (platform->cacheBcast) { - fileBcast(libFile, fs::path(platform->tmpDir) / fs::path("nek5000"), platform->comm.mpiComm, platform->verbose); + fileBcast(libFile, + fs::path(platform->tmpDir) / fs::path("nek5000"), + platform->comm.mpiComm, + platform->verbose); } return 0; @@ -940,7 +1022,6 @@ void bootstrap() printf("done\n"); fflush(stdout); } - } } @@ -1004,17 +1085,15 @@ int setup(int numberActiveFields) re2::nelg(options->getArgs("MESH FILE"), nelgt, nelgv, platform->comm.mpiComm); const int cht = (nelgt > nelgv) && nscal; - - auto boundaryIDMap = [&](bool vMesh = false) - { + auto boundaryIDMap = [&](bool vMesh = false) { const std::string prefix = (cht && vMesh) ? 
"MESHV " : "MESH "; std::vector list; options->getArgs(prefix + "BOUNDARY ID MAP", list, ","); - + std::vector map; - for(auto& entry : list) { - map.push_back(std::stoi(entry)); + for (auto &entry : list) { + map.push_back(std::stoi(entry)); } return map; }; @@ -1070,14 +1149,14 @@ int setup(int numberActiveFields) nekData.xm1 = ptr("xm1"); nekData.ym1 = ptr("ym1"); nekData.zm1 = ptr("zm1"); - nekData.xc = ptr("xc"); - nekData.yc = ptr("yc"); - nekData.zc = ptr("zc"); + nekData.xc = ptr("xc"); + nekData.yc = ptr("yc"); + nekData.zc = ptr("zc"); nekData.unx = ptr("unx"); nekData.uny = ptr("uny"); nekData.unz = ptr("unz"); - + nekData.cbc = ptr("cbc"); nekData.boundaryID = ptr("boundaryID"); nekData.boundaryIDt = ptr("boundaryIDt"); @@ -1090,7 +1169,9 @@ int setup(int numberActiveFields) gen_bcmap(); auto flow = true; - if (platform->options.compareArgs("VELOCITY SOLVER", "NONE")) flow = false; + if (platform->options.compareArgs("VELOCITY SOLVER", "NONE")) { + flow = false; + } if (flow) { if (rank == 0) { @@ -1155,11 +1236,7 @@ int setup(int numberActiveFields) { hlong NelementsV = nekData.nelv; MPI_Allreduce(MPI_IN_PLACE, &NelementsV, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - nekrsCheck(NelementsV != nelgv, - MPI_COMM_SELF, - EXIT_FAILURE, - "%s\n", - "Invalid element partitioning"); + nekrsCheck(NelementsV != nelgv, MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "Invalid element partitioning"); if (cht) { hlong NelementsT = nekData.nelt; @@ -1222,7 +1299,7 @@ void printMeshMetrics() (*nek_meshmetrics_ptr)(); } -const std::map& ptrList() +const std::map &ptrList() { return ptrListData; } @@ -1237,17 +1314,19 @@ void nekf_registerptr(char *id, void *val, int *nameLen) auto name = std::string(id, *nameLen); // id comes from Fortran and is not null terminated auto entry = ptrListData.find(name); auto entryFound = (entry != ptrListData.end()); - nekrsCheck(entryFound && entry->second != val, MPI_COMM_SELF, EXIT_FAILURE, - "%s exists already but is pointing to a different memory address\n", name.c_str()); + nekrsCheck(entryFound && entry->second != val, + MPI_COMM_SELF, + EXIT_FAILURE, + "%s exists already but is pointing to a different memory address\n", + name.c_str()); if (!entryFound) { const auto [it, success] = ptrListData.insert(std::make_pair(name, val)); nekrsCheck(!success, MPI_COMM_SELF, EXIT_FAILURE, "Adding %s failed\n", name.c_str()); } } -void nekf_registerptr_(char *id, void* val, int *nameLen) +void nekf_registerptr_(char *id, void *val, int *nameLen) { nekf_registerptr(id, val, nameLen); } - } diff --git a/src/nrs/advectionSubCycling.cpp b/src/nrs/advectionSubCycling.cpp index dd349e0a3..48ebf47b9 100644 --- a/src/nrs/advectionSubCycling.cpp +++ b/src/nrs/advectionSubCycling.cpp @@ -166,16 +166,16 @@ static void rk44(int nFields, const bool movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); - occa::memory o_u1 = platform->o_memPool.reserve(nFields * fieldOffset); + occa::memory o_u1 = platform->deviceMemoryPool.reserve(nFields * fieldOffset); o_u1.copyFrom(o_u0); std::vector o_rhs(4); - o_rhs[0] = platform->o_memPool.reserve(nFields * fieldOffset); - o_rhs[1] = platform->o_memPool.reserve(nFields * fieldOffset); - o_rhs[2] = platform->o_memPool.reserve(nFields * fieldOffset); - o_rhs[3] = platform->o_memPool.reserve(nFields * fieldOffset); + o_rhs[0] = platform->deviceMemoryPool.reserve(nFields * fieldOffset); + o_rhs[1] = platform->deviceMemoryPool.reserve(nFields * fieldOffset); + o_rhs[2] = platform->deviceMemoryPool.reserve(nFields * fieldOffset); + 
o_rhs[3] = platform->deviceMemoryPool.reserve(nFields * fieldOffset); - occa::memory o_LMMe = (movingMesh) ? platform->o_memPool.reserve(fieldOffset) : nullptr; + occa::memory o_LMMe = (movingMesh) ? platform->deviceMemoryPool.reserve(fieldOffset) : nullptr; for (int rk = 0; rk < nRK; ++rk) { auto extC = extCoeffs(nEXT, time, tstage, sdt, dt, nodes, rk); @@ -243,7 +243,7 @@ occa::memory advectionSubcyclingRK(mesh_t *_meshT, nStagesSum3Kernel = platform->kernelRequests.load("core-nStagesSum3"); subCycleRKKernel = platform->kernelRequests.load("nrs-subCycleRK"); - occa::memory o_u0 = platform->o_memPool.reserve(nFields * fieldOffset); + occa::memory o_u0 = platform->deviceMemoryPool.reserve(nFields * fieldOffset); platform->linAlg->fill(o_u0.length(), 0.0, o_u0); for (int torder = nEXT - 1; torder >= 0; torder--) { diff --git a/src/nrs/bdry/applyDirichlet.cpp b/src/nrs/bdry/applyDirichlet.cpp index d9ab2c472..2bb010237 100644 --- a/src/nrs/bdry/applyDirichlet.cpp +++ b/src/nrs/bdry/applyDirichlet.cpp @@ -4,12 +4,17 @@ // lower than any other possible Dirichlet value static constexpr dfloat TINY = -1e30; -void createZeroNormalMask(nrs_t *nrs, mesh_t *mesh, const occa::memory &o_EToB, const occa::memory& o_EToBV, occa::memory &o_mask) +void createZeroNormalMask(nrs_t *nrs, + mesh_t *mesh, + const occa::memory &o_EToB, + const occa::memory &o_EToBV, + occa::memory &o_mask) { nrs->initializeZeroNormalMaskKernel(mesh->Nlocal, nrs->fieldOffset, o_EToBV, o_mask); // normal xyz + count - occa::memory o_avgNormal = platform->o_memPool.reserve((nrs->NVfields+1) * nrs->fieldOffset); + occa::memory o_avgNormal = + platform->deviceMemoryPool.reserve((nrs->NVfields + 1) * nrs->fieldOffset); int bcType = ellipticBcType::ZERO_NORMAL; nrs->averageNormalBcTypeKernel(mesh->Nelements, @@ -20,15 +25,15 @@ void createZeroNormalMask(nrs_t *nrs, mesh_t *mesh, const occa::memory &o_EToB, o_EToB, o_avgNormal); - oogs::startFinish(o_avgNormal, nrs->NVfields+1, nrs->fieldOffset, ogsDfloat, ogsAdd, mesh->oogs); + oogs::startFinish(o_avgNormal, nrs->NVfields + 1, nrs->fieldOffset, ogsDfloat, ogsAdd, mesh->oogs); nrs->fixZeroNormalMaskKernel(mesh->Nelements, - nrs->fieldOffset, - mesh->o_sgeo, - mesh->o_vmapM, - o_EToB, - o_avgNormal, - o_mask); + nrs->fieldOffset, + mesh->o_sgeo, + mesh->o_vmapM, + o_EToB, + o_avgNormal, + o_mask); oogs::startFinish(o_mask, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, mesh->oogs); } @@ -41,8 +46,9 @@ void applyZeroNormalMask(nrs_t *nrs, const occa::memory &o_mask, occa::memory &o_x) { - if (Nelements == 0) + if (Nelements == 0) { return; + } nrs->applyZeroNormalMaskKernel(Nelements, nrs->fieldOffset, @@ -54,7 +60,11 @@ void applyZeroNormalMask(nrs_t *nrs, o_x); } -void applyZeroNormalMask(nrs_t *nrs, mesh_t *mesh, const occa::memory &o_EToB, const occa::memory &o_mask, occa::memory &o_x) +void applyZeroNormalMask(nrs_t *nrs, + mesh_t *mesh, + const occa::memory &o_EToB, + const occa::memory &o_mask, + occa::memory &o_x) { nrs->applyZeroNormalMaskKernel(mesh->Nelements, nrs->fieldOffset, @@ -66,7 +76,7 @@ void applyZeroNormalMask(nrs_t *nrs, mesh_t *mesh, const occa::memory &o_EToB, c o_x); } -void applyDirichletVelocity(nrs_t *nrs, double time, occa::memory& o_U,occa::memory& o_Ue,occa::memory& o_P) +void applyDirichletVelocity(nrs_t *nrs, double time, occa::memory &o_U, occa::memory &o_Ue, occa::memory &o_P) { if (bcMap::unalignedMixedBoundary("velocity")) { applyZeroNormalMask(nrs, nrs->mesh, nrs->uvwSolver->o_EToB(), nrs->o_zeroNormalMaskVelocity, o_U); @@ -77,7 +87,7 @@ void 
applyDirichletVelocity(nrs_t *nrs, double time, occa::memory& o_U,occa::mem auto mesh = nrs->mesh; - occa::memory o_tmp = platform->o_memPool.reserve((nrs->NVfields+1) * nrs->fieldOffset); + occa::memory o_tmp = platform->deviceMemoryPool.reserve((nrs->NVfields + 1) * nrs->fieldOffset); platform->linAlg->fill((1 + nrs->NVfields) * nrs->fieldOffset, TINY, o_tmp); for (int sweep = 0; sweep < 2; sweep++) { @@ -127,50 +137,49 @@ void applyDirichletVelocity(nrs_t *nrs, double time, occa::memory& o_U,occa::mem if (nrs->pSolver->Nmasked()) { auto o_dirichlet = o_tmp.slice(0, nrs->fieldOffset); - nrs->maskCopyKernel(nrs->pSolver->Nmasked(), - 0, - 0, - nrs->pSolver->o_maskIds(), - o_dirichlet, - o_P); + nrs->maskCopyKernel(nrs->pSolver->Nmasked(), 0, 0, nrs->pSolver->o_maskIds(), o_dirichlet, o_P); } auto o_UDirichlet = o_tmp.slice(nrs->fieldOffset); if (nrs->uvwSolver) { if (nrs->uvwSolver->Nmasked()) { nrs->maskCopy2Kernel(nrs->uvwSolver->Nmasked(), - 0 * nrs->fieldOffset, - 0 * nrs->fieldOffset, - nrs->uvwSolver->o_maskIds(), - o_UDirichlet, - o_U, o_Ue); + 0 * nrs->fieldOffset, + 0 * nrs->fieldOffset, + nrs->uvwSolver->o_maskIds(), + o_UDirichlet, + o_U, + o_Ue); } } else { int cnt = 0; - for(auto& solver : {nrs->uSolver, nrs->vSolver, nrs->wSolver}) { + for (auto &solver : {nrs->uSolver, nrs->vSolver, nrs->wSolver}) { if (solver->Nmasked()) { nrs->maskCopy2Kernel(solver->Nmasked(), cnt * nrs->fieldOffset, cnt * nrs->fieldOffset, solver->o_maskIds(), o_UDirichlet, - o_U, o_Ue); + o_U, + o_Ue); } cnt++; } } } -void applyDirichletScalars(nrs_t *nrs, double time, occa::memory& o_S, occa::memory& o_Se) +void applyDirichletScalars(nrs_t *nrs, double time, occa::memory &o_S, occa::memory &o_Se) { cds_t *cds = nrs->cds; const auto neknekFieldOffset = nrs->neknek ? nrs->neknek->fieldOffset() : 0; for (int is = 0; is < cds->NSfields; is++) { - if (!cds->compute[is]) + if (!cds->compute[is]) { continue; - if (cds->cvodeSolve[is]) + } + if (cds->cvodeSolve[is]) { continue; + } mesh_t *mesh = cds->mesh[0]; oogs_t *gsh = cds->gshT; if (is) { @@ -181,7 +190,7 @@ void applyDirichletScalars(nrs_t *nrs, double time, occa::memory& o_S, occa::mem auto o_diff_i = cds->o_diff + cds->fieldOffsetScan[is]; auto o_rho_i = cds->o_rho + cds->fieldOffsetScan[is]; - occa::memory o_SiDirichlet = platform->o_memPool.reserve(cds->fieldOffset[is]); + occa::memory o_SiDirichlet = platform->deviceMemoryPool.reserve(cds->fieldOffset[is]); platform->linAlg->fill(cds->fieldOffset[is], TINY, o_SiDirichlet); for (int sweep = 0; sweep < 2; sweep++) { @@ -208,40 +217,41 @@ void applyDirichletScalars(nrs_t *nrs, double time, occa::memory& o_S, occa::mem *(cds->o_usrwrk), o_SiDirichlet); - oogs::startFinish(o_SiDirichlet, + oogs::startFinish(o_SiDirichlet, 1, - cds->fieldOffset[is], + cds->fieldOffset[is], ogsDfloat, - (sweep == 0) ? ogsMax : ogsMin, + (sweep == 0) ? 
ogsMax : ogsMin, gsh); } - occa::memory o_Si = - o_S.slice(cds->fieldOffsetScan[is], cds->fieldOffset[is]); - - if(o_Se.isInitialized()){ - occa::memory o_Si_e = - o_Se.slice(cds->fieldOffsetScan[is], cds->fieldOffset[is]); - - if (cds->solver[is]->Nmasked()) + occa::memory o_Si = o_S.slice(cds->fieldOffsetScan[is], cds->fieldOffset[is]); + + if (o_Se.isInitialized()) { + occa::memory o_Si_e = o_Se.slice(cds->fieldOffsetScan[is], cds->fieldOffset[is]); + + if (cds->solver[is]->Nmasked()) { cds->maskCopy2Kernel(cds->solver[is]->Nmasked(), - 0, - 0, - cds->solver[is]->o_maskIds(), - o_SiDirichlet, - o_Si, o_Si_e); + 0, + 0, + cds->solver[is]->o_maskIds(), + o_SiDirichlet, + o_Si, + o_Si_e); + } } else { - if (cds->solver[is]->Nmasked()) + if (cds->solver[is]->Nmasked()) { cds->maskCopyKernel(cds->solver[is]->Nmasked(), 0, 0, cds->solver[is]->o_maskIds(), o_SiDirichlet, o_Si); + } } } } -void applyDirichletMesh(nrs_t *nrs, double time, occa::memory& o_UM, occa::memory& o_UMe, occa::memory& o_U) +void applyDirichletMesh(nrs_t *nrs, double time, occa::memory &o_UM, occa::memory &o_UMe, occa::memory &o_U) { auto mesh = (nrs->cht) ? nrs->cds->mesh[0] : nrs->mesh; if (bcMap::unalignedMixedBoundary("mesh")) { @@ -249,14 +259,14 @@ void applyDirichletMesh(nrs_t *nrs, double time, occa::memory& o_UM, occa::memor applyZeroNormalMask(nrs, mesh, nrs->meshSolver->o_EToB(), nrs->o_zeroNormalMaskMeshVelocity, o_UMe); } - occa::memory o_UDirichlet = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + occa::memory o_UDirichlet = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); platform->linAlg->fill(nrs->NVfields * nrs->fieldOffset, TINY, o_UDirichlet); for (int sweep = 0; sweep < 2; sweep++) { mesh->velocityDirichletKernel(mesh->Nelements, nrs->fieldOffset, time, - (int) bcMap::useDerivedMeshBoundaryConditions(), + (int)bcMap::useDerivedMeshBoundaryConditions(), mesh->o_sgeo, nrs->o_zeroNormalMaskMeshVelocity, mesh->o_x, @@ -279,11 +289,13 @@ void applyDirichletMesh(nrs_t *nrs, double time, occa::memory& o_UM, occa::memor nrs->gshMesh); } - if (nrs->meshSolver->Nmasked()) + if (nrs->meshSolver->Nmasked()) { nrs->maskCopy2Kernel(nrs->meshSolver->Nmasked(), - 0 * nrs->fieldOffset, - 0 * nrs->fieldOffset, - nrs->meshSolver->o_maskIds(), - o_UDirichlet, - o_UM, o_UMe); + 0 * nrs->fieldOffset, + 0 * nrs->fieldOffset, + nrs->meshSolver->o_maskIds(), + o_UDirichlet, + o_UM, + o_UMe); + } } diff --git a/src/nrs/cds/cvode/cbGMRES.cpp b/src/nrs/cds/cvode/cbGMRES.cpp index 015b117ff..a35fd6796 100644 --- a/src/nrs/cds/cvode/cbGMRES.cpp +++ b/src/nrs/cds/cvode/cbGMRES.cpp @@ -181,11 +181,11 @@ static int CGSI(realtype **h, int k, int p, realtype *new_vk_norm, occa::memory } #define cbGMRESFinish(lastFlag) \ -{ \ -o_s2Inv.free(); \ -o_V.free(); \ -return lastFlag; \ -} + { \ + o_s2Inv.free(); \ + o_V.free(); \ + return lastFlag; \ + } } // namespace @@ -277,8 +277,8 @@ int cbGMRESSolve(SUNLinearSolver S, N_Vector x, N_Vector b, realtype delta) firstTime = 0; } - o_s2Inv = platform->o_memPool.reserve(N); - o_V = platform->o_memPool.reserve((l_max + 1) * static_cast(N)); + o_s2Inv = platform->deviceMemoryPool.reserve(N); + o_V = platform->deviceMemoryPool.reserve((l_max + 1) * static_cast(N)); /* Initialize counters and convergence flag */ *nli = 0; diff --git a/src/nrs/cds/solve.cpp b/src/nrs/cds/solve.cpp index d02e0d6f4..26998aedb 100644 --- a/src/nrs/cds/solve.cpp +++ b/src/nrs/cds/solve.cpp @@ -15,7 +15,7 @@ void cds_t::solve(double time, int stage) platform->timer.tic("scalar rhs", 
1); - auto o_rhs = platform->o_memPool.reserve(this->fieldOffset[is]); + auto o_rhs = platform->deviceMemoryPool.reserve(this->fieldOffset[is]); o_rhs.copyFrom(this->o_JwF, this->fieldOffset[is], 0, this->fieldOffsetScan[is]); this->neumannBCKernel(mesh->Nelements, @@ -44,15 +44,13 @@ void cds_t::solve(double time, int stage) const auto o_diff_i = this->o_diff.slice(this->fieldOffsetScan[is], mesh->Nlocal); const auto o_lambda0 = o_diff_i; - const auto o_lambda1 = [&]() - { + const auto o_lambda1 = [&]() { const auto o_rho_i = this->o_rho.slice(this->fieldOffsetScan[is], mesh->Nlocal); - auto o_l = platform->o_memPool.reserve(mesh->Nlocal); + auto o_l = platform->deviceMemoryPool.reserve(mesh->Nlocal); if (this->userImplicitLinearTerm) { auto o_implicitLT = this->userImplicitLinearTerm(time, is); - if(o_implicitLT.isInitialized()) { - platform->linAlg - ->axpbyz(mesh->Nlocal, *this->g0 / this->dt[0], o_rho_i, 1.0, o_implicitLT, o_l); + if (o_implicitLT.isInitialized()) { + platform->linAlg->axpbyz(mesh->Nlocal, *this->g0 / this->dt[0], o_rho_i, 1.0, o_implicitLT, o_l); } else { platform->linAlg->axpby(mesh->Nlocal, *this->g0 / this->dt[0], o_rho_i, 0.0, o_l); } @@ -63,7 +61,7 @@ void cds_t::solve(double time, int stage) }(); auto o_Si = [&]() { - auto o_S0 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_S0 = platform->deviceMemoryPool.reserve(mesh->Nlocal); if (platform->options.compareArgs("SCALAR" + sid + " INITIAL GUESS", "EXTRAPOLATION") && stage == 1) { o_S0.copyFrom(this->o_Se, o_S0.size(), 0, this->fieldOffsetScan[is]); } else { diff --git a/src/nrs/cfl.cpp b/src/nrs/cfl.cpp index 317876325..38176998a 100644 --- a/src/nrs/cfl.cpp +++ b/src/nrs/cfl.cpp @@ -32,7 +32,7 @@ void setup(nrs_t *nrs) firstTime = false; } -} +} // namespace dfloat nrs_t::computeCFL() { @@ -41,20 +41,15 @@ dfloat nrs_t::computeCFL() dfloat nrs_t::computeCFL(dfloat dt) { - if (firstTime) setup(this); + if (firstTime) { + setup(this); + } - auto o_cfl = platform->o_memPool.reserve(mesh->Nelements); + auto o_cfl = platform->deviceMemoryPool.reserve(mesh->Nelements); - this->cflKernel(mesh->Nelements, - dt, - mesh->o_vgeo, - o_dx, - this->fieldOffset, - this->o_U, - mesh->o_U, - o_cfl); + this->cflKernel(mesh->Nelements, dt, mesh->o_vgeo, o_dx, this->fieldOffset, this->o_U, mesh->o_U, o_cfl); - auto scratch = (dfloat *) h_scratch.ptr(); + auto scratch = (dfloat *)h_scratch.ptr(); o_cfl.copyTo(scratch); dfloat cfl = 0; diff --git a/src/nrs/constantFlowRate.cpp b/src/nrs/constantFlowRate.cpp index 048ed01d8..aeda56325 100644 --- a/src/nrs/constantFlowRate.cpp +++ b/src/nrs/constantFlowRate.cpp @@ -49,10 +49,10 @@ void compute(nrs_t *nrs, double time) double flops = 0.0; platform->timer.tic("pressure rhs", 1); - auto o_Prhs = platform->o_memPool.reserve(nrs->fieldOffset); - auto o_lambda0 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_Prhs = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + auto o_lambda0 = platform->deviceMemoryPool.reserve(mesh->Nlocal); { - occa::memory o_gradPCoeff = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + occa::memory o_gradPCoeff = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); platform->linAlg->adyz(mesh->Nlocal, 1.0, nrs->o_rho, o_lambda0); @@ -86,7 +86,7 @@ void compute(nrs_t *nrs, double time) // solve homogenous Stokes problem platform->timer.tic("velocity rhs", 1); - occa::memory o_RhsVel = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + occa::memory o_RhsVel = 
platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); { nrs->gradientVolumeKernel(mesh->Nelements, mesh->o_vgeo, @@ -101,7 +101,7 @@ void compute(nrs_t *nrs, double time) platform->linAlg->scaleMany(mesh->Nlocal, nrs->NVfields, nrs->fieldOffset, -1.0, o_RhsVel); - occa::memory o_JwF = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + occa::memory o_JwF = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); o_JwF.copyFrom(mesh->o_LMM, mesh->Nlocal, 0 * nrs->fieldOffset, 0); o_JwF.copyFrom(mesh->o_LMM, mesh->Nlocal, 1 * nrs->fieldOffset, 0); o_JwF.copyFrom(mesh->o_LMM, mesh->Nlocal, 2 * nrs->fieldOffset, 0); @@ -118,7 +118,7 @@ void compute(nrs_t *nrs, double time) o_lambda0.free(); o_lambda0 = nrs->o_mue; - auto o_lambda1 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_lambda1 = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->axpby(mesh->Nlocal, nrs->g0 / nrs->dt[0], nrs->o_rho, 0.0, o_lambda1); if (nrs->uvwSolver) { @@ -147,7 +147,7 @@ bool checkIfRecompute(nrs_t *nrs, int tstep) bool adjustFlowRate = false; // did the properties change? - occa::memory o_propDelta = platform->o_memPool.reserve(nPropertyFields * nrs->fieldOffset); + occa::memory o_propDelta = platform->deviceMemoryPool.reserve(nPropertyFields * nrs->fieldOffset); platform->linAlg->axpbyzMany(mesh->Nlocal, nPropertyFields, nrs->fieldOffset, @@ -280,21 +280,21 @@ bool nrs_t::adjustFlowRate(int tstep, double time) platform->options.getArgs("CONSTANT FLOW TO BID", toBID); occa::memory o_centroid = - platform->o_memPool.reserve(this->NVfields * mesh->Nelements * mesh->Nfaces); + platform->deviceMemoryPool.reserve(this->NVfields * mesh->Nelements * mesh->Nfaces); platform->linAlg->fill(mesh->Nelements * mesh->Nfaces * 3, 0.0, o_centroid); - occa::memory o_counts = platform->o_memPool.reserve(mesh->Nelements * mesh->Nfaces); + occa::memory o_counts = platform->deviceMemoryPool.reserve(mesh->Nelements * mesh->Nfaces); platform->linAlg->fill(mesh->Nelements * mesh->Nfaces, 0.0, o_counts); this->computeFaceCentroidKernel(mesh->Nelements, - fromBID, - mesh->o_EToB, - mesh->o_vmapM, - mesh->o_x, - mesh->o_y, - mesh->o_z, - o_centroid, - o_counts); + fromBID, + mesh->o_EToB, + mesh->o_vmapM, + mesh->o_x, + mesh->o_y, + mesh->o_z, + o_centroid, + o_counts); flops += 3 * mesh->Nlocal; dfloat NfacesContrib = @@ -319,14 +319,14 @@ bool nrs_t::adjustFlowRate(int tstep, double time) platform->linAlg->fill(mesh->Nelements * mesh->Nfaces * 3, 0.0, o_centroid); platform->linAlg->fill(mesh->Nelements * mesh->Nfaces, 0.0, o_counts); this->computeFaceCentroidKernel(mesh->Nelements, - toBID, - mesh->o_EToB, - mesh->o_vmapM, - mesh->o_x, - mesh->o_y, - mesh->o_z, - o_centroid, - o_counts); + toBID, + mesh->o_EToB, + mesh->o_vmapM, + mesh->o_x, + mesh->o_y, + mesh->o_z, + o_centroid, + o_counts); flops += 3 * mesh->Nlocal; @@ -366,10 +366,12 @@ bool nrs_t::adjustFlowRate(int tstep, double time) std::cout << "recomputing base flow rate\n"; } - auto getSolverData = [](elliptic *solver) - { + auto getSolverData = [](elliptic *solver) { if (solver) { - std::tuple val(solver->Niter(), solver->initialResidual(), solver->initialGuessResidual(), solver->finalResidual()); + std::tuple val(solver->Niter(), + solver->initialResidual(), + solver->initialGuessResidual(), + solver->finalResidual()); return val; } else { std::tuple val(0, 0, 0, 0); @@ -386,8 +388,7 @@ bool nrs_t::adjustFlowRate(int tstep, double time) compute(this, time); // restore norms + update iteration count - auto 
setSolverData = [](elliptic *solver, int Niter, dfloat res00Norm, dfloat res0Norm, dfloat resNorm) - { + auto setSolverData = [](elliptic *solver, int Niter, dfloat res00Norm, dfloat res0Norm, dfloat resNorm) { solver->Niter(solver->Niter() + Niter); solver->initialResidual(res00Norm); solver->initialGuessResidual(res0Norm); @@ -404,15 +405,15 @@ bool nrs_t::adjustFlowRate(int tstep, double time) setSolverData(this->pSolver, NiterP, res00NormP, res0NormP, resNormP); } - occa::memory o_currentFlowRate = platform->o_memPool.reserve(this->fieldOffset); + occa::memory o_currentFlowRate = platform->deviceMemoryPool.reserve(this->fieldOffset); this->computeFieldDotNormalKernel(mesh->Nlocal, - this->fieldOffset, - flowDirection[0], - flowDirection[1], - flowDirection[2], - this->o_U, - o_currentFlowRate); + this->fieldOffset, + flowDirection[0], + flowDirection[1], + flowDirection[2], + this->o_U, + o_currentFlowRate); flops += 5 * mesh->Nlocal; @@ -423,14 +424,14 @@ bool nrs_t::adjustFlowRate(int tstep, double time) platform->linAlg->sum(mesh->Nlocal, o_currentFlowRate, platform->comm.mpiComm) / lengthScale; if (recomputeBaseFlowRate) { - occa::memory o_baseFlowRate = platform->o_memPool.reserve(this->fieldOffset); + occa::memory o_baseFlowRate = platform->deviceMemoryPool.reserve(this->fieldOffset); this->computeFieldDotNormalKernel(mesh->Nlocal, - this->fieldOffset, - flowDirection[0], - flowDirection[1], - flowDirection[2], - this->o_Uc, - o_baseFlowRate); + this->fieldOffset, + flowDirection[0], + flowDirection[1], + flowDirection[2], + this->o_Uc, + o_baseFlowRate); flops += 5 * mesh->Nlocal; // scale by mass matrix @@ -449,19 +450,24 @@ bool nrs_t::adjustFlowRate(int tstep, double time) constantFlowScale = deltaFlowRate / baseFlowRate; // add corrections - platform->linAlg - ->axpbyMany(mesh->Nlocal, this->NVfields, this->fieldOffset, constantFlowScale, this->o_Uc, 1.0, this->o_U); + platform->linAlg->axpbyMany(mesh->Nlocal, + this->NVfields, + this->fieldOffset, + constantFlowScale, + this->o_Uc, + 1.0, + this->o_U); platform->linAlg->axpby(mesh->Nlocal, constantFlowScale, this->o_Pc, 1.0, this->o_P); // compute flow rate after correction as diagnostic this->computeFieldDotNormalKernel(mesh->Nlocal, - this->fieldOffset, - flowDirection[0], - flowDirection[1], - flowDirection[2], - this->o_U, - o_currentFlowRate); + this->fieldOffset, + flowDirection[0], + flowDirection[1], + flowDirection[2], + this->o_U, + o_currentFlowRate); flops += 5 * mesh->Nlocal; @@ -480,5 +486,3 @@ dfloat nrs_t::flowRatescaleFactor() { return constantFlowScale; } - - diff --git a/src/nrs/neknek/fixCoupledSurfaceFlux.cpp b/src/nrs/neknek/fixCoupledSurfaceFlux.cpp index 6bbfce619..24b40f9f0 100644 --- a/src/nrs/neknek/fixCoupledSurfaceFlux.cpp +++ b/src/nrs/neknek/fixCoupledSurfaceFlux.cpp @@ -37,11 +37,11 @@ void neknek_t::fixCoupledSurfaceFlux(occa::memory o_U) isCalled = true; } if (hasOutlet) { - return; + return; } constexpr int nReduction = 2; - auto o_reduction = platform->o_memPool.reserve(nReduction * mesh->Nelements); + auto o_reduction = platform->deviceMemoryPool.reserve(nReduction * mesh->Nelements); this->computeFluxKernel(mesh->Nelements, nrs->fieldOffset, @@ -72,8 +72,7 @@ void neknek_t::fixCoupledSurfaceFlux(occa::memory o_U) } if (platform->verbose && platform->comm.mpiRank == 0) { - printf("neknek::fixCoupledSurfaceFlux flux = %11.4e, area = %11.4e, gamma = %11.4e\n", - flux, area , gamma); + printf("neknek::fixCoupledSurfaceFlux flux = %11.4e, area = %11.4e, gamma = %11.4e\n", flux, area, gamma); 
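// A minimal, self-contained sketch of the scratch-buffer lifecycle that the hunks in this
// patch rely on after the rename from platform->o_memPool / platform->memPool to
// platform->deviceMemoryPool / platform->memoryPool: reserve a host buffer, size the device
// buffer from it, and free both before leaving the routine (as with h_tmpReductions /
// o_tmpReductions in the PCG hunk). ScratchPool and Buffer are hypothetical stand-ins, not
// the occa/nekRS API.
#include <cstddef>
#include <vector>

struct Buffer {
  std::vector<double> data;
  std::size_t size() const { return data.size(); }
  void free() { data.clear(); data.shrink_to_fit(); }
};

struct ScratchPool {
  // reserve() hands out a scratch buffer holding n entries
  Buffer reserve(std::size_t n) { return Buffer{std::vector<double>(n)}; }
};

int main() {
  ScratchPool memoryPool;        // host-side pool stand-in
  ScratchPool deviceMemoryPool;  // device-side pool stand-in

  const std::size_t Nlocal = 1024;
  auto h_tmp = memoryPool.reserve(Nlocal);              // host scratch
  auto o_tmp = deviceMemoryPool.reserve(h_tmp.size());  // device scratch sized from the host buffer

  // ... kernel / reduction work would use o_tmp here ...

  o_tmp.free();  // release before returning, mirroring o_tmpReductions.free()
  h_tmp.free();
  return 0;
}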
} this->fixSurfaceFluxKernel(mesh->Nelements, diff --git a/src/nrs/neknek/multirateNekNek.cpp b/src/nrs/neknek/multirateNekNek.cpp index 63ee0d883..edbecd811 100644 --- a/src/nrs/neknek/multirateNekNek.cpp +++ b/src/nrs/neknek/multirateNekNek.cpp @@ -16,7 +16,7 @@ void neknek_t::exchangeTimes(double time) this->recomputePartition = true; } - auto o_timeFld = platform->o_memPool.reserve((maxOrd + 1) * nrs->fieldOffset); + auto o_timeFld = platform->deviceMemoryPool.reserve((maxOrd + 1) * nrs->fieldOffset); for (int s = 0; s <= maxOrd; ++s) { auto o_timeSlice = o_timeFld.slice(s * nrs->fieldOffset, nrs->fieldOffset); platform->linAlg->fill(nrs->fieldOffset, time, o_timeSlice); diff --git a/src/nrs/neknek/neknek.cpp b/src/nrs/neknek/neknek.cpp index 6d552228d..0c3bcc099 100644 --- a/src/nrs/neknek/neknek.cpp +++ b/src/nrs/neknek/neknek.cpp @@ -1,7 +1,6 @@ #include #include "platform.hpp" #include "bcMap.hpp" -#include "findpts.hpp" #include "neknek.hpp" #include "nrs.hpp" #include "nekInterfaceAdapter.hpp" @@ -75,14 +74,11 @@ void neknek_t::updateInterpPoints() return; } - auto neknek = nrs->neknek; - const dlong nsessions = this->nsessions_; - const dlong sessionID = this->sessionID_; - auto mesh = (nrs->cht) ? nrs->cds->mesh[0] : nrs->mesh; this->interpolator.reset(); - this->interpolator = std::make_shared(mesh, platform->comm.mpiCommParent, true, intBIDs); + this->interpolator = + std::make_shared(mesh, platform->comm.mpiCommParent, true, intBIDs); this->interpolator->setTimerName("neknek_t::"); // neknekX[:] = mesh->x[pointMap[:]] @@ -103,13 +99,13 @@ void neknek_t::updateInterpPoints() void neknek_t::findIntPoints() { - const dlong nsessions = this->nsessions_; const dlong sessionID = this->sessionID_; auto mesh = (nrs->cht) ? nrs->cds->mesh[0] : nrs->mesh; this->interpolator.reset(); - this->interpolator = std::make_shared(mesh, platform->comm.mpiCommParent, true, intBIDs); + this->interpolator = + std::make_shared(mesh, platform->comm.mpiCommParent, true, intBIDs); this->interpolator->setTimerName("neknek_t::"); // int points are the same for all neknek fields @@ -182,8 +178,6 @@ void neknek_t::setup() dlong globalRank; MPI_Comm_rank(platform->comm.mpiCommParent, &globalRank); - auto mesh = (nrs->cht) ? nrs->cds->mesh[0] : nrs->mesh; - const int nsessions = this->nsessions_; if (platform->comm.mpiRank == 0) { printf("initializing neknek with %d sessions\n", nsessions); @@ -212,6 +206,8 @@ void neknek_t::setup() this->fields = [&]() { std::vector list; for (auto &&field : nrsFieldsToSolve(platform->options)) { + auto mesh = (field == "scalar00") ? 
nrs->cds->mesh[0] : nrs->mesh; + int intFound = 0; for (dlong e = 0; e < mesh->Nelements; ++e) { for (dlong f = 0; f < mesh->Nfaces; ++f) { @@ -263,8 +259,8 @@ void neknek_t::setup() this->o_scalarIndices_ = platform->device.malloc(nrs->Nscalar, scalarIndices.data()); for (int bID = 1; bID <= bcMap::size(this->fields[0]); ++bID) { - if (isIntBc(bcMap::id(bID, this->fields[0]), this->fields[0])) { - intBIDs.push_back(bID); + if (isIntBc(bcMap::id(bID, this->fields[0]), this->fields[0])) { + intBIDs.push_back(bID); } } @@ -376,10 +372,10 @@ occa::memory neknek_t::partitionOfUnity() auto o_dist = pointInterp.distanceINT(); - auto o_sess = platform->o_memPool.reserve(nrs->fieldOffset); - auto o_sumDist = platform->o_memPool.reserve(nrs->fieldOffset); - auto o_found = platform->o_memPool.reserve(nrs->fieldOffset); - auto o_interpDist = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_sess = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + auto o_sumDist = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + auto o_found = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + auto o_interpDist = platform->deviceMemoryPool.reserve(nrs->fieldOffset); o_sumDist.copyFrom(o_dist, mesh->Nlocal); std::vector found(mesh->Nlocal); @@ -398,7 +394,7 @@ occa::memory neknek_t::partitionOfUnity() auto &data = pointInterp.data(); for (int n = 0; n < mesh->Nlocal; ++n) { - found[n] = (data.code[n] == findpts::CODE_NOT_FOUND) ? 0.0 : 1.0; + found[n] = (data.code[n] == pointInterpolation_t::CODE_NOT_FOUND) ? 0.0 : 1.0; } o_found.copyFrom(found.data()); @@ -440,7 +436,6 @@ void neknek_t::lag() void neknek_t::extrapolate(int tstep) { - auto *mesh = nrs->mesh; int extOrder = std::min(tstep, this->nEXT_); int bdfOrder = std::min(tstep, nrs->nBDF); nek::extCoeff(this->coeffEXT.data(), nrs->dt, extOrder, bdfOrder); @@ -517,7 +512,7 @@ void neknek_t::exchange(bool allTimeStates, bool lagState) if (this->Nscalar_) { auto o_S = nrs->cds->o_S; if (this->Nscalar_ != nrs->Nscalar) { - o_S = platform->o_memPool.reserve(nStates * this->Nscalar_ * nrs->fieldOffset); + o_S = platform->deviceMemoryPool.reserve(nStates * this->Nscalar_ * nrs->fieldOffset); this->mapScalarKernel(nrs->cds->mesh[0]->Nlocal, nrs->Nscalar, nrs->fieldOffset, diff --git a/src/nrs/nrs.cpp b/src/nrs/nrs.cpp index 740336d42..58f89ef39 100644 --- a/src/nrs/nrs.cpp +++ b/src/nrs/nrs.cpp @@ -16,7 +16,7 @@ static void computeDivUErr(nrs_t *nrs, dfloat &divUErrVolAvg, dfloat &divUErrL2) { auto mesh = nrs->mesh; - auto o_divErr = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_divErr = platform->deviceMemoryPool.reserve(nrs->fieldOffset); nrs->divergenceVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, nrs->o_U, o_divErr); @@ -222,7 +222,7 @@ static void setupEllipticSolvers(nrs_t *nrs) auto o_rho_i = cds->o_rho.slice(cds->fieldOffsetScan[is], mesh->Nlocal); auto o_lambda0 = cds->o_diff.slice(cds->fieldOffsetScan[is], mesh->Nlocal); - auto o_lambda1 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_lambda1 = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->axpby(mesh->Nlocal, *cds->g0 / cds->dt[0], o_rho_i, 0.0, o_lambda1); cds->solver[is] = new elliptic("scalar" + sid, mesh, nrs->fieldOffset, EToB, o_lambda0, o_lambda1); @@ -255,7 +255,7 @@ static void setupEllipticSolvers(nrs_t *nrs) } auto o_lambda0 = nrs->o_mue; - auto o_lambda1 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_lambda1 = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->axpby(mesh->Nlocal, nrs->g0 / 
nrs->dt[0], nrs->o_rho, 0.0, o_lambda1); auto EToBx = createEToB("x-velocity", mesh); @@ -305,7 +305,7 @@ static void setupEllipticSolvers(nrs_t *nrs) printf("================ ELLIPTIC SETUP PRESSURE ================\n"); } - auto o_lambda0 = platform->o_memPool.reserve(mesh->Nlocal); + auto o_lambda0 = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->adyz(mesh->Nlocal, 1.0, nrs->o_rho, o_lambda0); auto EToB = createEToB("pressure", mesh); @@ -314,7 +314,6 @@ static void setupEllipticSolvers(nrs_t *nrs) if (nrs->cds) { nrs->cds->dpdt = nrs->pSolver->nullSpace(); } - } if (!platform->options.compareArgs("MESH SOLVER", "NONE")) { @@ -546,7 +545,6 @@ nrs_t::nrs_t() platform->options.getArgs("MESH DIMENSION", this->NVfields); platform->options.getArgs("ELEMENT TYPE", this->elementType); - checkpointWriter = iofldFactory::create(); } void nrs_t::init() @@ -562,8 +560,7 @@ void nrs_t::init() } } - this->cht = [&] () - { + this->cht = [&]() { int nelgt, nelgv; const std::string meshFile = platform->options.getArgs("MESH FILE"); re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); @@ -574,20 +571,15 @@ void nrs_t::init() "%s\n", "Conjugate heat transfer not supported in a moving mesh!"); - nekrsCheck(nelgt != nelgv && !platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE"), - platform->comm.mpiComm, - EXIT_FAILURE, - "%s\n", - "Conjugate heat transfer requires a temperature field!"); - return (nelgt > nelgv) ? 1 : 0; }(); - nekrsCheck(platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE") && - (!cht && !platform->options.compareArgs("LOWMACH", "TRUE")), - platform->comm.mpiComm, EXIT_FAILURE, + nekrsCheck((cht || platform->options.compareArgs("LOWMACH", "TRUE")) && + !platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE"), + platform->comm.mpiComm, + EXIT_FAILURE, "%s\n", - "TEMPERATURE field requires conjugate heat transfer or lowMach"); + "conjugate heat transfer or lowMach requires a TEMPERATURE field"); platform->options.getArgs("SUBCYCLING STEPS", this->Nsubsteps); @@ -616,14 +608,15 @@ void nrs_t::init() nek::setup(numberActiveFields()); - auto getMesh = [&]() - { + auto getMesh = [&]() { auto [meshT, meshV] = createMesh(platform->comm.mpiComm, N, cubN, this->cht, platform->kernelInfo); - if (!cht) meshV = meshT; - + if (!cht) { + meshV = meshT; + } + auto offset = meshV->Np * (meshV->Nelements); offset = std::max(offset, meshT->Np * (meshT->Nelements)); - + auto cubOffset = offset; if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { cubOffset = std::max(cubOffset, meshV->Nelements * meshV->cubNp); @@ -631,7 +624,7 @@ void nrs_t::init() offset = alignStride(offset); cubOffset = alignStride(cubOffset); - + this->fieldOffset = offset; this->cubatureOffset = cubOffset; @@ -642,6 +635,7 @@ void nrs_t::init() }(); this->mesh = getMesh.second; + this->meshV = this->mesh; auto meshT = getMesh.first; auto verifyBC = [&]() { @@ -789,7 +783,6 @@ void nrs_t::init() assignKernels(this); - if (this->Nscalar) { cdsConfig_t cfg; @@ -853,88 +846,91 @@ void nrs_t::init() } } - printMeshMetrics(meshT); - if (mesh != meshT) printMeshMetrics(mesh); + if (mesh != meshT) { + printMeshMetrics(mesh); + } setupEllipticSolvers(this); } -void nrs_t::restartFromFile(const std::string& restartStr) +void nrs_t::restartFromFile(const std::string &restartStr) { auto options = serializeString(restartStr, '+'); const auto fileName = options[0]; options.erase(options.begin()); if (platform->comm.mpiRank == 0) { - if (options.size()) std::cout << "restart 
options: "; - for (const auto& element : options) std::cout << element << " "; + if (options.size()) { + std::cout << "restart options: "; + } + for (const auto &element : options) { + std::cout << element << " "; + } std::cout << std::endl; } - auto requestedStep = [&]() - { - auto it = std::find_if(options.begin(), options.end(), - [](const std::string& s) - { - return s.find("step") != std::string::npos; - } - ); + auto requestedStep = [&]() { + auto it = std::find_if(options.begin(), options.end(), [](const std::string &s) { + return s.find("step") != std::string::npos; + }); std::string val; - if (it != options.end()) { + if (it != options.end()) { val = serializeString(*it, '=').at(1); options.erase(it); } return (val.empty()) ? -1 : std::stoi(val); }(); - - auto requestedTime = [&]() - { - auto it = std::find_if(options.begin(), options.end(), - [](const std::string& s) - { - return s.find("time") != std::string::npos; - } - ); + auto requestedTime = [&]() { + auto it = std::find_if(options.begin(), options.end(), [](const std::string &s) { + return s.find("time") != std::string::npos; + }); std::string val; - if (it != options.end()) { + if (it != options.end()) { val = serializeString(*it, '=').at(1); options.erase(it); } return val; }(); - const auto requestedFields = [&]() - { + auto pointInterpolation = [&]() { + auto it = std::find_if(options.begin(), options.end(), [](const std::string &s) { + return s.find("int") != std::string::npos; + }); + + auto found = false; + if (it != options.end()) { + found = true; + options.erase(it); + } + return found; + }(); + + const auto requestedFields = [&]() { std::vector flds; - for (const auto& entry : {"x", "u", "p", "t", "s"}) { - auto it = std::find_if(options.begin(), options.end(), - [entry](const std::string& s) - { - std::string ss = s; - lowerCase(ss); - return ss.find(entry) != std::string::npos; - } - ); + for (const auto &entry : {"x", "u", "p", "t", "s"}) { + auto it = std::find_if(options.begin(), options.end(), [entry](const std::string &s) { + std::string ss = s; + lowerCase(ss); + return ss.find(entry) != std::string::npos; + }); if (it != options.end()) { std::string s = *it; lowerCase(s); - std::cout << "requested field: " << s << std::endl; + std::cout << "requested field: " << s << std::endl; flds.push_back(s); } } return flds; }(); - - auto fileNameEndsWithBp = [&]() - { + auto fileNameEndsWithBp = [&]() { const std::string suffix = ".bp"; if (fileName.size() >= suffix.size()) { - return fileName.compare(fileName.size() - suffix.size(), suffix.size(), suffix) == 0; + return fileName.compare(fileName.size() - suffix.size(), suffix.size(), suffix) == 0; } return false; }(); @@ -942,29 +938,29 @@ void nrs_t::restartFromFile(const std::string& restartStr) iofld->open((cht) ? 
cds->mesh[0] : mesh, iofld::mode::read, fileName, requestedStep); const auto avaiableFields = iofld->availableVariables(); - if (platform->comm.mpiRank == 0 && platform->verbose) { - for(const auto& entry : avaiableFields) { + if (platform->comm.mpiRank == 0 && platform->verbose) { + for (const auto &entry : avaiableFields) { std::cout << " found variable " << entry << std::endl; - } + } } double time = -1; iofld->addVariable("time", time); if (platform->options.compareArgs("LOWMACH", "TRUE")) { iofld->addVariable("p0th", p0th[0]); - } + } - auto checkOption = [&](const std::string& name) - { - if (requestedFields.size() == 0) return true; // nothing specfied -> assign all + auto checkOption = [&](const std::string &name) { + if (requestedFields.size() == 0) { + return true; // nothing specfied -> assign all + } if (std::find(requestedFields.begin(), requestedFields.end(), name) != requestedFields.end()) { - return true; + return true; } return false; - }; + }; - auto isAvailable = [&](const std::string& name) - { + auto isAvailable = [&](const std::string &name) { return std::find(avaiableFields.begin(), avaiableFields.end(), name) != avaiableFields.end(); }; @@ -979,9 +975,9 @@ void nrs_t::restartFromFile(const std::string& restartStr) if (checkOption("u") && isAvailable("velocity")) { std::vector o_iofldU; - o_iofldU.push_back(o_U.slice(0*fieldOffset, mesh->Nlocal)); - o_iofldU.push_back(o_U.slice(1*fieldOffset, mesh->Nlocal)); - o_iofldU.push_back(o_U.slice(2*fieldOffset, mesh->Nlocal)); + o_iofldU.push_back(o_U.slice(0 * fieldOffset, mesh->Nlocal)); + o_iofldU.push_back(o_U.slice(1 * fieldOffset, mesh->Nlocal)); + o_iofldU.push_back(o_U.slice(2 * fieldOffset, mesh->Nlocal)); iofld->addVariable("velocity", o_iofldU); } @@ -991,7 +987,7 @@ void nrs_t::restartFromFile(const std::string& restartStr) } if (Nscalar) { - std::vector o_iofldT; + std::vector o_iofldT; if (checkOption("t") && isAvailable("temperature")) { auto mesh = (cht) ? cds->mesh[0] : this->mesh; o_iofldT.push_back(cds->o_S.slice(0, mesh->Nlocal)); @@ -999,32 +995,45 @@ void nrs_t::restartFromFile(const std::string& restartStr) } const auto scalarStart = (o_iofldT.size()) ? 1 : 0; - for(int i = scalarStart; i < Nscalar; i++) { + for (int i = scalarStart; i < Nscalar; i++) { const auto sid = scalarDigitStr(i - scalarStart); if (checkOption("s" + sid) && isAvailable("scalar" + sid)) { - auto o_Si = cds->o_S.slice(cds->fieldOffsetScan[i], mesh->Nlocal); + auto o_Si = cds->o_S.slice(cds->fieldOffsetScan[i], mesh->Nlocal); std::vector o_iofldSi = {o_Si}; iofld->addVariable("scalar" + sid, o_iofldSi); } } } + if (pointInterpolation) { + iofld->readAttribute("interpolate", "true"); + } + iofld->process(); + iofld->close(); platform->options.setArgs("START TIME", (requestedTime.size()) ? requestedTime : to_string_f(time)); -} +} void nrs_t::setIC() { + getICFromNek(); + if (!platform->options.getArgs("RESTART FILE NAME").empty()) { restartFromFile(platform->options.getArgs("RESTART FILE NAME")); } - if (platform->comm.mpiRank == 0) std::cout << "calling UDF_Setup ... \n" << std::flush; + if (platform->comm.mpiRank == 0) { + std::cout << "calling UDF_Setup ... 
\n" << std::flush; + } udf.setup(); - if (platform->comm.mpiRank == 0) std::cout << "done\n" << std::flush; + if (platform->comm.mpiRank == 0) { + std::cout << "done\n" << std::flush; + } - if (cht) cds->mesh[0]->update(); + if (cht) { + cds->mesh[0]->update(); + } mesh->update(); auto projC0 = [&](oogs_t *gsh, mesh_t *mesh, int nFields, dlong fieldOffset, occa::memory &o_in) { @@ -1051,12 +1060,13 @@ void nrs_t::setIC() double startTime; platform->options.getArgs("START TIME", startTime); - copyToNek(startTime, 0, true); // ensure both codes are in sync + copyToNek(startTime, 0, true); // ensure both codes are in sync nekrsCheck(platform->options.compareArgs("LOWMACH", "TRUE") && p0th[0] <= 1e-6, platform->comm.mpiComm, EXIT_FAILURE, - "Unreasonable p0th value %g!", p0th[0]); + "Unreasonable p0th value %g!", + p0th[0]); } void nrs_t::printRunStat(int step) @@ -1323,9 +1333,13 @@ void nrs_t::printRunStat(int step) platform->timer.printStatEntry(" dotp multi ", "dotpMulti", "DEVICE:MAX", tElapsedTimeSolve); - if (platform->comm.mpiRank == 0) std::cout << std::endl; + if (platform->comm.mpiRank == 0) { + std::cout << std::endl; + } platform->device.printMemoryUsage(platform->comm.mpiComm); - if (platform->comm.mpiRank == 0) std::cout << std::endl; + if (platform->comm.mpiRank == 0) { + std::cout << std::endl; + } std::cout.unsetf(std::ios::scientific); std::cout.precision(outPrecisionSave); @@ -1398,7 +1412,7 @@ void nrs_t::makeNLT(double time, int tstep, occa::memory &o_Usubcycling) if (this->Nsubsteps) { o_Usubcycling = this->advectionSubcycling(std::min(tstep, this->nEXT), time); } else { - auto o_adv = platform->o_memPool.reserve(this->NVfields * this->fieldOffset); + auto o_adv = platform->deviceMemoryPool.reserve(this->NVfields * this->fieldOffset); if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { this->strongAdvectionCubatureVolumeKernel(mesh->Nelements, @@ -1506,7 +1520,9 @@ void nrs_t::printStepInfo(double time, int tstep, bool printStepInfo, bool print if (printStepInfo) { printf("step= %d t= %.8e dt=%.1e C= %.3f", tstep, time, this->dt[0], cfl); - if (!printTimers) std::cout << std::endl; + if (!printTimers) { + std::cout << std::endl; + } } if (printTimers) { @@ -1531,14 +1547,20 @@ void nrs_t::printStepInfo(double time, int tstep, bool printStepInfo, bool print void nrs_t::writeCheckpoint(double t, int step, bool enforceOutXYZ, bool enforceFP64, int N_, bool uniform) { - const auto outXYZ = (enforceOutXYZ) ? true : platform->options.compareArgs("CHECKPOINT OUTPUT MESH", "TRUE"); + if (!checkpointWriter) { + checkpointWriter = iofldFactory::create(); + } + + const auto outXYZ = + (enforceOutXYZ) ? true : platform->options.compareArgs("CHECKPOINT OUTPUT MESH", "TRUE"); if (!checkpointWriter->isInitialized()) { auto visMesh = (cht) ? 
cds->mesh[0] : mesh; checkpointWriter->open(visMesh, iofld::mode::write, platform->options.getArgs("CASENAME")); - if (platform->options.compareArgs("LOWMACH", "TRUE")) + if (platform->options.compareArgs("LOWMACH", "TRUE")) { checkpointWriter->addVariable("p0th", p0th[0]); + } if (platform->options.compareArgs("VELOCITY CHECKPOINTING", "TRUE")) { std::vector o_V; @@ -1547,7 +1569,7 @@ void nrs_t::writeCheckpoint(double t, int step, bool enforceOutXYZ, bool enforce } checkpointWriter->addVariable("velocity", o_V); } - + if (platform->options.compareArgs("PRESSURE CHECKPOINTING", "TRUE")) { auto o_p = std::vector{o_P.slice(0, visMesh->Nlocal)}; checkpointWriter->addVariable("pressure", o_p); @@ -1567,23 +1589,24 @@ void nrs_t::writeCheckpoint(double t, int step, bool enforceOutXYZ, bool enforce } } - const auto Nfld = [&] () - { + const auto Nfld = [&]() { int N; platform->options.getArgs("POLYNOMIAL DEGREE", N); - return (N_) ? N_ : N; + return (N_) ? N_ : N; }(); checkpointWriter->writeAttribute("polynomialOrder", std::to_string(Nfld)); auto FP64 = platform->options.compareArgs("CHECKPOINT PRECISION", "FP64"); - if (enforceFP64) FP64 = true; + if (enforceFP64) { + FP64 = true; + } checkpointWriter->writeAttribute("precision", (FP64) ? "64" : "32"); checkpointWriter->writeAttribute("uniform", (uniform) ? "true" : "false"); checkpointWriter->writeAttribute("outputMesh", (outXYZ) ? "true" : "false"); checkpointWriter->addVariable("time", t); - for (const auto& entry : userCheckpointFields) { + for (const auto &entry : userCheckpointFields) { checkpointWriter->addVariable(entry.first, entry.second); } @@ -1638,8 +1661,7 @@ void nrs_t::copyToNek(double time, bool updateMesh_) *(nekData.time) = time; *(nekData.p0th) = p0th[0]; - auto updateMesh = [&]() - { + auto updateMesh = [&]() { auto mesh = (cht) ? cds->mesh[0] : this->mesh; auto [x, y, z] = mesh->xyzHost(); @@ -1648,11 +1670,11 @@ void nrs_t::copyToNek(double time, bool updateMesh_) nekData.ym1[i] = y[i]; nekData.zm1[i] = z[i]; } - nek::recomputeGeometry(); + nek::recomputeGeometry(); }; { - auto U = platform->memPool.reserve(mesh->dim * fieldOffset); + auto U = platform->memoryPool.reserve(mesh->dim * fieldOffset); o_U.copyTo(U, U.size()); auto vx = U.ptr() + 0 * fieldOffset; auto vy = U.ptr() + 1 * fieldOffset; @@ -1667,7 +1689,7 @@ void nrs_t::copyToNek(double time, bool updateMesh_) if (platform->options.compareArgs("MOVING MESH", "TRUE")) { auto mesh = (cht) ? 
cds->mesh[0] : this->mesh; - auto U = platform->memPool.reserve(mesh->dim * fieldOffset); + auto U = platform->memoryPool.reserve(mesh->dim * fieldOffset); mesh->o_U.copyTo(U, U.size()); auto wx = U.ptr() + 0 * fieldOffset; auto wy = U.ptr() + 1 * fieldOffset; @@ -1680,12 +1702,14 @@ void nrs_t::copyToNek(double time, bool updateMesh_) updateMesh_ = true; } - if (updateMesh_) updateMesh(); + if (updateMesh_) { + updateMesh(); + } { - auto P = platform->memPool.reserve(mesh->Nlocal); + auto P = platform->memoryPool.reserve(mesh->Nlocal); o_P.copyTo(P, P.size()); - auto Pptr = P.ptr(); + auto Pptr = P.ptr(); for (int i = 0; i < mesh->Nlocal; i++) { nekData.pr[i] = Pptr[i]; } @@ -1696,10 +1720,10 @@ void nrs_t::copyToNek(double time, bool updateMesh_) for (int is = 0; is < Nscalar; is++) { auto mesh = cds->mesh[is]; - auto S = platform->memPool.reserve(mesh->Nlocal); + auto S = platform->memoryPool.reserve(mesh->Nlocal); cds->o_S.copyTo(S, S.size(), 0, cds->fieldOffsetScan[is]); - auto Sptr = S.ptr(); + auto Sptr = S.ptr(); auto Ti = nekData.t + is * nekFieldOffset; for (int i = 0; i < mesh->Nlocal; i++) { Ti[i] = Sptr[i]; @@ -1725,7 +1749,7 @@ void nrs_t::copyFromNek(double &time) p0th[0] = *(nekData.p0th); { - auto U = platform->memPool.reserve(mesh->dim * fieldOffset); + auto U = platform->memoryPool.reserve(mesh->dim * fieldOffset); auto vx = U.ptr() + 0 * fieldOffset; auto vy = U.ptr() + 1 * fieldOffset; auto vz = U.ptr() + 2 * fieldOffset; @@ -1740,7 +1764,7 @@ void nrs_t::copyFromNek(double &time) if (platform->options.compareArgs("MOVING MESH", "TRUE")) { auto mesh = (cht) ? cds->mesh[0] : this->mesh; - auto U = platform->memPool.reserve(mesh->dim * fieldOffset); + auto U = platform->memoryPool.reserve(mesh->dim * fieldOffset); auto wx = U.ptr() + 0 * fieldOffset; auto wy = U.ptr() + 1 * fieldOffset; auto wz = U.ptr() + 2 * fieldOffset; @@ -1753,8 +1777,8 @@ void nrs_t::copyFromNek(double &time) } { - auto P = platform->memPool.reserve(o_P.size()); - auto Pptr = P.ptr(); + auto P = platform->memoryPool.reserve(o_P.size()); + auto Pptr = P.ptr(); for (int i = 0; i < mesh->Nlocal; i++) { Pptr[i] = nekData.pr[i]; } @@ -1767,9 +1791,9 @@ void nrs_t::copyFromNek(double &time) auto mesh = cds->mesh[is]; auto Ti = nekData.t + is * nekFieldOffset; - auto S = platform->memPool.reserve(mesh->Nlocal); + auto S = platform->memoryPool.reserve(mesh->Nlocal); - auto Sptr = S.ptr(); + auto Sptr = S.ptr(); for (int i = 0; i < mesh->Nlocal; i++) { Sptr[i] = Ti[i]; } @@ -1778,3 +1802,8 @@ void nrs_t::copyFromNek(double &time) } } +void nrs_t::getICFromNek() +{ + nek::getIC(); + copyFromNek(); +} diff --git a/src/nrs/nrs.hpp b/src/nrs/nrs.hpp index 8343b0520..2314d424f 100644 --- a/src/nrs/nrs.hpp +++ b/src/nrs/nrs.hpp @@ -46,7 +46,8 @@ class nrs_t : public solver_t int elementType = HEXAHEDRA; - mesh_t* mesh = nullptr; + mesh_t *mesh = nullptr; + mesh_t *meshV = nullptr; elliptic *uSolver = nullptr; elliptic *vSolver = nullptr; @@ -254,6 +255,7 @@ class nrs_t : public solver_t void copyFromNek(double &time); void copyFromNek(); + void getICFromNek(); private: void initInnerStep(double time, dfloat dt, int tstep); diff --git a/src/nrs/plugins/RANSktau.cpp b/src/nrs/plugins/RANSktau.cpp index b1862acb3..86485ce31 100644 --- a/src/nrs/plugins/RANSktau.cpp +++ b/src/nrs/plugins/RANSktau.cpp @@ -50,8 +50,12 @@ static dfloat coeff[] = { occa::memory implicitK(double time, int scalarIdx) { - if (scalarIdx == kFieldIndex) return o_implicitKtau.slice(0 * nrs->fieldOffset, nrs->fieldOffset); - if (scalarIdx == 
kFieldIndex + 1) return o_implicitKtau.slice(1 * nrs->fieldOffset, nrs->fieldOffset); + if (scalarIdx == kFieldIndex) { + return o_implicitKtau.slice(0 * nrs->fieldOffset, nrs->fieldOffset); + } + if (scalarIdx == kFieldIndex + 1) { + return o_implicitKtau.slice(1 * nrs->fieldOffset, nrs->fieldOffset); + } return o_NULL; } @@ -178,8 +182,8 @@ void RANSktau::updateSourceTerms() auto mesh = nrs->mesh; cds_t *cds = nrs->cds; - occa::memory o_OiOjSk = platform->o_memPool.reserve(nrs->fieldOffset); - occa::memory o_SijMag2 = platform->o_memPool.reserve(nrs->fieldOffset); + occa::memory o_OiOjSk = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + occa::memory o_SijMag2 = platform->deviceMemoryPool.reserve(nrs->fieldOffset); occa::memory o_FS = cds->o_NLT + cds->fieldOffsetScan[kFieldIndex]; @@ -223,15 +227,20 @@ void RANSktau::setup(int ifld) const std::string sid = scalarDigitStr(kFieldIndex + i); nekrsCheck(!platform->options.getArgs("SCALAR" + sid + " DIFFUSIVITY").empty() || - !platform->options.getArgs("SCALAR" + sid + " DENSITY").empty(), - platform->comm.mpiComm, EXIT_FAILURE, "%s\n", "illegal property specificition for k/tau in par!"); + !platform->options.getArgs("SCALAR" + sid + " DENSITY").empty(), + platform->comm.mpiComm, + EXIT_FAILURE, + "%s\n", + "illegal property specificition for k/tau in par!"); } auto cds = nrs->cds; - auto mesh = nrs->mesh; - nekrsCheck(cds->NSfields < kFieldIndex+1, platform->comm.mpiComm, EXIT_FAILURE, - "%s\n", "number of scalar fields too low!"); + nekrsCheck(cds->NSfields < kFieldIndex + 1, + platform->comm.mpiComm, + EXIT_FAILURE, + "%s\n", + "number of scalar fields too low!"); o_k = cds->o_S + cds->fieldOffsetScan[kFieldIndex]; o_tau = cds->o_S + cds->fieldOffsetScan[kFieldIndex + 1]; diff --git a/src/nrs/plugins/lowMach.cpp b/src/nrs/plugins/lowMach.cpp index ee7e3bac3..4ecca2634 100644 --- a/src/nrs/plugins/lowMach.cpp +++ b/src/nrs/plugins/lowMach.cpp @@ -32,8 +32,7 @@ static bool setupCalled = false; void lowMach::buildKernel(occa::properties kernelInfo) { - auto buildKernel = [&kernelInfo](const std::string& kernelName) - { + auto buildKernel = [&kernelInfo](const std::string &kernelName) { const auto path = getenv("NEKRS_KERNEL_DIR") + std::string("/nrs/plugins/"); const auto fileName = path + "lowMach.okl"; const auto reqName = "lowMach::"; @@ -41,18 +40,18 @@ void lowMach::buildKernel(occa::properties kernelInfo) platform->kernelRequests.add(reqName, fileName, kernelInfo); return occa::kernel(); } else { - buildKernelCalled = 1; + buildKernelCalled = 1; return platform->kernelRequests.load(reqName, kernelName); } }; - qtlKernel= buildKernel("qtlHex3D"); + qtlKernel = buildKernel("qtlHex3D"); p0thHelperKernel = buildKernel("p0thHelper"); platform->options.setArgs("LOWMACH", "TRUE"); } -void lowMach::setup(dfloat alpha_, const occa::memory& o_beta_, const occa::memory& o_kappa_) +void lowMach::setup(dfloat alpha_, const occa::memory &o_beta_, const occa::memory &o_kappa_) { static bool isInitialized = false; if (isInitialized) { @@ -60,14 +59,14 @@ void lowMach::setup(dfloat alpha_, const occa::memory& o_beta_, const occa::memo } isInitialized = true; - _nrs = dynamic_cast(platform->solver);; + _nrs = dynamic_cast(platform->solver); + ; alpha0 = alpha_; _nrs->alpha0Ref = alpha0; o_beta = o_beta_; o_kappa = o_kappa_; - auto mesh = _nrs->mesh; int err = 1; if (platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE")) { err = 0; @@ -112,7 +111,7 @@ void lowMach::qThermalSingleComponent(double time) rhsCVODE = 
cds->cvode->isRhsEvaluation(); } - auto o_gradT = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + auto o_gradT = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->gradientVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, cds->o_S, o_gradT); double flopsGrad = 6 * mesh->Np * mesh->Nq + 18 * mesh->Np; @@ -122,7 +121,7 @@ void lowMach::qThermalSingleComponent(double time) platform->linAlg->axmyVector(mesh->Nlocal, nrs->fieldOffset, 0, 1.0, nrs->mesh->o_invLMM, o_gradT); - auto o_src = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_src = platform->deviceMemoryPool.reserve(nrs->fieldOffset); platform->linAlg->fill(mesh->Nlocal, 0.0, o_src); if (cds->userSource) { platform->timer.tic(scope + "udfSEqnSource", 1); @@ -168,13 +167,13 @@ void lowMach::qThermalSingleComponent(double time) "computing p0th and dp0thdt using CVODE is not supported!"); const auto termQ = [&]() { - auto o_tmp = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_tmp = platform->deviceMemoryPool.reserve(nrs->fieldOffset); linAlg->axmyz(mesh->Nlocal, 1.0, mesh->o_LMM, o_div, o_tmp); return linAlg->sum(mesh->Nlocal, o_tmp, platform->comm.mpiComm); }(); - auto o_tmp1 = platform->o_memPool.reserve(nrs->fieldOffset); - auto o_tmp2 = platform->o_memPool.reserve(nrs->fieldOffset); + auto o_tmp1 = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + auto o_tmp2 = platform->deviceMemoryPool.reserve(nrs->fieldOffset); p0thHelperKernel(mesh->Nlocal, alpha0, nrs->p0th[0], diff --git a/src/nrs/plugins/lpm.cpp b/src/nrs/plugins/lpm.cpp index 925def42f..1a0c36673 100644 --- a/src/nrs/plugins/lpm.cpp +++ b/src/nrs/plugins/lpm.cpp @@ -16,7 +16,7 @@ lpm_t::lpm_t(dfloat bb_tol_, dfloat newton_tol_) bb_tol(bb_tol_), newton_tol(newton_tol_), interp(std::make_unique(nrs->mesh, platform->comm.mpiComm, - nrs->mesh->Nlocal, nrs->mesh->Nlocal, bb_tol, newton_tol)) + true, std::vector{}, bb_tol, newton_tol)) { nekrsCheck(!kernelsRegistered_, platform->comm.mpiComm, @@ -503,7 +503,7 @@ void lpm_t::integrate(double tf) interp.reset(); interp = std::make_unique(nrs->mesh, platform->comm.mpiComm, - nrs->mesh->Nlocal, nrs->mesh->Nlocal, bb_tol, newton_tol); + true, std::vector{}, bb_tol, newton_tol); } // set extrapolated state to t^n (copy from laggedInterpFields) @@ -1717,8 +1717,6 @@ void lpm_t::writeFld() ++out_step; MPI_Comm mpi_comm = platform->comm.mpiComm; - int mpi_rank = platform->comm.mpiRank; - int mpi_size = platform->comm.mpiCommSize; long long int globalNPartOutput = nPartOutput; diff --git a/src/nrs/plugins/velRecycling.cpp b/src/nrs/plugins/velRecycling.cpp index 2c72371f4..309a517f5 100644 --- a/src/nrs/plugins/velRecycling.cpp +++ b/src/nrs/plugins/velRecycling.cpp @@ -26,7 +26,6 @@ dfloat area; static bool buildKernelCalled = false; static bool setupCalled = false; -int Nblock; } // namespace static void _setup(occa::memory &o_wrk_, const int bID_, const dfloat wbar_) diff --git a/src/nrs/postProcessing/Qcriterion.cpp b/src/nrs/postProcessing/Qcriterion.cpp index 10ec7d538..9dd41f59f 100644 --- a/src/nrs/postProcessing/Qcriterion.cpp +++ b/src/nrs/postProcessing/Qcriterion.cpp @@ -20,14 +20,14 @@ void nrs_t::Qcriterion(occa::memory &o_Q) occa::memory nrs_t::Qcriterion(const occa::memory &o_U) { - auto o_Q = platform->o_memPool.reserve(mesh->Nlocal); + auto o_Q = platform->deviceMemoryPool.reserve(mesh->Nlocal); Qcriterion(o_U, o_Q); return o_Q; } occa::memory nrs_t::Qcriterion() { - auto o_Q = platform->o_memPool.reserve(mesh->Nlocal); + auto o_Q = 
platform->deviceMemoryPool.reserve(mesh->Nlocal); Qcriterion(this->o_U, o_Q); return o_Q; } diff --git a/src/nrs/postProcessing/aeroForces.cpp b/src/nrs/postProcessing/aeroForces.cpp index b6e441d3e..d84685d92 100644 --- a/src/nrs/postProcessing/aeroForces.cpp +++ b/src/nrs/postProcessing/aeroForces.cpp @@ -20,7 +20,7 @@ AeroForce *nrs_t::aeroForces(int nbID, const occa::memory &o_bID, const occa::me o_rho = this->o_rho; } - auto o_forces = platform->o_memPool.reserve(2 * mesh->dim * mesh->Nelements); + auto o_forces = platform->deviceMemoryPool.reserve(2 * mesh->dim * mesh->Nelements); static occa::kernel kernel; if (!kernel.isInitialized()) { kernel = platform->kernelRequests.load("nrs-aeroForces"); diff --git a/src/nrs/postProcessing/strainRotationRate.cpp b/src/nrs/postProcessing/strainRotationRate.cpp index d5847712e..5135355ae 100644 --- a/src/nrs/postProcessing/strainRotationRate.cpp +++ b/src/nrs/postProcessing/strainRotationRate.cpp @@ -1,12 +1,12 @@ #include "nrs.hpp" -static occa::memory _strainRotationRate(nrs_t* nrs, bool rotationRate, const occa::memory &o_U, bool smooth) +static occa::memory _strainRotationRate(nrs_t *nrs, bool rotationRate, const occa::memory &o_U, bool smooth) { auto mesh = nrs->mesh; const int nFields = (rotationRate) ? 2 * nrs->NVfields + nrs->NVfields : 2 * nrs->NVfields; - auto o_SO = platform->o_memPool.reserve(nFields * nrs->fieldOffset); + auto o_SO = platform->deviceMemoryPool.reserve(nFields * nrs->fieldOffset); nrs->SijOijKernel(mesh->Nelements, nrs->fieldOffset, diff --git a/src/nrs/timeStepper.cpp b/src/nrs/timeStepper.cpp index 9f11a78d9..f342cd2a5 100644 --- a/src/nrs/timeStepper.cpp +++ b/src/nrs/timeStepper.cpp @@ -270,14 +270,14 @@ static occa::memory meshSolve(nrs_t *nrs, double time, int iter) auto mesh = (nrs->cht) ? 
nrs->cds->mesh[0] : nrs->mesh; linAlg_t *linAlg = platform->linAlg; - auto o_rhs = platform->o_memPool.reserve(mesh->dim * nrs->fieldOffset); + auto o_rhs = platform->deviceMemoryPool.reserve(mesh->dim * nrs->fieldOffset); platform->linAlg->fill(mesh->dim * nrs->fieldOffset, 0, o_rhs); platform->timer.tic("meshSolve", 1); auto o_lambda0 = nrs->o_meshMue; - auto o_U = platform->o_memPool.reserve(mesh->dim * nrs->fieldOffset); + auto o_U = platform->deviceMemoryPool.reserve(mesh->dim * nrs->fieldOffset); if (platform->options.compareArgs("MESH INITIAL GUESS", "EXTRAPOLATION") && iter == 1) { o_U.copyFrom(mesh->o_Ue); } else { @@ -417,7 +417,7 @@ void nrs_t::initInnerStep(double time, dfloat _dt, int tstep) if (this->flow) { platform->timer.tic("makef", 1); - platform->linAlg->fill(this->fieldOffset * this->NVfields, 0.0, this->o_NLT); + platform->linAlg->fill(this->fieldOffset * this->NVfields, 0.0, this->o_NLT); if (this->userVelocitySource) { platform->timer.tic("udfUEqnSource", 1); diff --git a/src/nrs/tombo.cpp b/src/nrs/tombo.cpp index 6a0c85fd9..5ce7a605c 100644 --- a/src/nrs/tombo.cpp +++ b/src/nrs/tombo.cpp @@ -9,16 +9,14 @@ occa::memory pressureSolve(nrs_t *nrs, double time, int stage) double flopCount = 0.0; platform->timer.tic("pressure rhs", 1); - const auto o_lambda0 = [&]() - { - auto o_lambda0 = platform->o_memPool.reserve(mesh->Nlocal); + const auto o_lambda0 = [&]() { + auto o_lambda0 = platform->deviceMemoryPool.reserve(mesh->Nlocal); platform->linAlg->adyz(mesh->Nlocal, 1.0, nrs->o_rho, o_lambda0); return o_lambda0; }(); - const auto o_stressTerm = [&]() - { - auto o_curl = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + const auto o_stressTerm = [&]() { + auto o_curl = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->curlKernel(mesh->Nelements, 1, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, nrs->o_Ue, o_curl); flopCount += static_cast(mesh->Nelements) * (18 * mesh->Np * mesh->Nq + 36 * mesh->Np); @@ -28,27 +26,26 @@ occa::memory pressureSolve(nrs_t *nrs, double time, int stage) platform->linAlg->axmyVector(mesh->Nlocal, nrs->fieldOffset, 0, 1.0, nrs->mesh->o_invLMM, o_curl); flopCount += mesh->Nlocal; - auto o_stressTerm = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + auto o_stressTerm = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->curlKernel(mesh->Nelements, 1, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, o_curl, o_stressTerm); flopCount += static_cast(mesh->Nelements) * (18 * mesh->Np * mesh->Nq + 36 * mesh->Np); if (platform->options.compareArgs("VELOCITY STRESSFORMULATION", "TRUE")) { nrs->pressureStressKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_D, - nrs->fieldOffset, - nrs->o_mue, - nrs->o_Ue, - nrs->o_div, - o_stressTerm); + mesh->o_vgeo, + mesh->o_D, + nrs->fieldOffset, + nrs->o_mue, + nrs->o_Ue, + nrs->o_div, + o_stressTerm); flopCount += static_cast(mesh->Nelements) * (18 * mesh->Nq * mesh->Np + 100 * mesh->Np); } return o_stressTerm; }(); - const auto o_rhs = [&]() - { - auto o_gradDiv = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + const auto o_rhs = [&]() { + auto o_gradDiv = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->gradientVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, @@ -57,8 +54,8 @@ occa::memory pressureSolve(nrs_t *nrs, double time, int stage) o_gradDiv); flopCount += static_cast(mesh->Nelements) * (6 * mesh->Np * mesh->Nq + 18 * mesh->Np); - auto o_rhs = platform->o_memPool.reserve(nrs->NVfields * 
nrs->fieldOffset); - + auto o_rhs = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); + if (platform->options.compareArgs("PRESSURE VISCOUS TERMS", "TRUE")) { nrs->pressureRhsKernel(mesh->Nlocal, nrs->fieldOffset, @@ -79,16 +76,15 @@ occa::memory pressureSolve(nrs_t *nrs, double time, int stage) return o_rhs; }(); - const auto o_pRhs = [&]() - { - auto o_pRhs = platform->o_memPool.reserve(nrs->fieldOffset); - + const auto o_pRhs = [&]() { + auto o_pRhs = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + nrs->wDivergenceVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, o_rhs, o_pRhs); flopCount += static_cast(mesh->Nelements) * (6 * mesh->Np * mesh->Nq + 18 * mesh->Np); - + nrs->pressureAddQtlKernel(mesh->Nlocal, mesh->o_LMM, nrs->g0 * 1 / nrs->dt[0], nrs->o_div, o_pRhs); flopCount += 3 * mesh->Nlocal; - + nrs->divergenceSurfaceKernel(mesh->Nelements, mesh->o_sgeo, mesh->o_vmapM, @@ -106,7 +102,7 @@ occa::memory pressureSolve(nrs_t *nrs, double time, int stage) platform->timer.toc("pressure rhs"); platform->flopCounter->add("pressure RHS", flopCount); - occa::memory o_P = platform->o_memPool.reserve(mesh->Nlocal); + occa::memory o_P = platform->deviceMemoryPool.reserve(mesh->Nlocal); o_P.copyFrom(nrs->o_P); nrs->pSolver->solve(o_lambda0, o_NULL, o_pRhs, o_P); @@ -133,19 +129,16 @@ occa::memory velocitySolve(nrs_t *nrs, double time, int stage) double flopCount = 0.0; platform->timer.tic("velocity rhs", 1); - const auto o_gradMueDiv = [&]() - { - dfloat scale = 1./3; - if (platform->options.compareArgs("VELOCITY STRESSFORMULATION", "TRUE")) scale = -2*scale; + const auto o_gradMueDiv = [&]() { + dfloat scale = 1. / 3; + if (platform->options.compareArgs("VELOCITY STRESSFORMULATION", "TRUE")) { + scale = -2 * scale; + } - auto o_mueDiv = platform->o_memPool.reserve(nrs->fieldOffset); - platform->linAlg->axmyz(mesh->Nlocal, - scale, - nrs->o_mue, - nrs->o_div, - o_mueDiv); + auto o_mueDiv = platform->deviceMemoryPool.reserve(nrs->fieldOffset); + platform->linAlg->axmyz(mesh->Nlocal, scale, nrs->o_mue, nrs->o_div, o_mueDiv); - auto o_gradMueDiv = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + auto o_gradMueDiv = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->gradientVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, @@ -157,19 +150,23 @@ occa::memory velocitySolve(nrs_t *nrs, double time, int stage) return o_gradMueDiv; }(); - const auto o_gradP = [&]() - { - occa::memory o_gradP = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + const auto o_gradP = [&]() { + occa::memory o_gradP = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); nrs->wgradientVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, nrs->o_P, o_gradP); flopCount += static_cast(mesh->Nelements) * 18 * (mesh->Np * mesh->Nq + mesh->Np); return o_gradP; }(); - const auto o_rhs = [&]() - { - auto o_rhs = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); - nrs->velocityRhsKernel(mesh->Nlocal, nrs->fieldOffset, nrs->o_rho, nrs->o_JwF, o_gradMueDiv, o_gradP, o_rhs); + const auto o_rhs = [&]() { + auto o_rhs = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); + nrs->velocityRhsKernel(mesh->Nlocal, + nrs->fieldOffset, + nrs->o_rho, + nrs->o_JwF, + o_gradMueDiv, + o_gradP, + o_rhs); flopCount += 9 * mesh->Nlocal; nrs->velocityNeumannBCKernel(mesh->Nelements, @@ -187,7 +184,7 @@ occa::memory velocitySolve(nrs_t *nrs, double time, int stage) nrs->o_usrwrk, 
nrs->o_Ue, o_rhs); - + flopCount += static_cast(mesh->Nelements) * (3 * mesh->Np + 36 * mesh->Nq * mesh->Nq); return o_rhs; @@ -197,26 +194,23 @@ occa::memory velocitySolve(nrs_t *nrs, double time, int stage) platform->flopCounter->add("velocity RHS", flopCount); const auto o_lambda0 = nrs->o_mue; - const auto o_lambda1 = [&]() - { - auto o_lambda1 = platform->o_memPool.reserve(mesh->Nlocal); + const auto o_lambda1 = [&]() { + auto o_lambda1 = platform->deviceMemoryPool.reserve(mesh->Nlocal); if (nrs->userVelocityImplicitLinearTerm) { auto o_implicitLT = nrs->userVelocityImplicitLinearTerm(time); - platform->linAlg - ->axpbyz(mesh->Nlocal, nrs->g0 / nrs->dt[0], nrs->o_rho, 1.0, o_implicitLT, o_lambda1); + platform->linAlg->axpbyz(mesh->Nlocal, nrs->g0 / nrs->dt[0], nrs->o_rho, 1.0, o_implicitLT, o_lambda1); } else { platform->linAlg->axpby(mesh->Nlocal, nrs->g0 / nrs->dt[0], nrs->o_rho, 0.0, o_lambda1); } return o_lambda1; }(); - const auto o_U = [&]() - { - auto o_U = platform->o_memPool.reserve(nrs->NVfields * nrs->fieldOffset); + const auto o_U = [&]() { + auto o_U = platform->deviceMemoryPool.reserve(nrs->NVfields * nrs->fieldOffset); o_U.copyFrom(platform->options.compareArgs("VELOCITY INITIAL GUESS", "EXTRAPOLATION") && stage == 1 - ? nrs->o_Ue - : nrs->o_U); - + ? nrs->o_Ue + : nrs->o_U); + if (nrs->uvwSolver) { nrs->uvwSolver->solve(o_lambda0, o_lambda1, o_rhs, o_U); } else { diff --git a/src/plugins/nekAscent.hpp b/src/plugins/nekAscent.hpp index 7b8f2bbb2..3ade8af71 100644 --- a/src/plugins/nekAscent.hpp +++ b/src/plugins/nekAscent.hpp @@ -57,9 +57,9 @@ void initializeAscent() conduit::utils::set_error_handler(errHandler); ascent_opts["mpi_comm"] = MPI_Comm_c2f(comm); - //ascent_opts["runtime/vtkm/backend"] = "serial"; - // ascent_opts["exceptions"] = "forward"; - // ascent_opts["messages"] = "verbose"; + // ascent_opts["runtime/vtkm/backend"] = "serial"; + // ascent_opts["exceptions"] = "forward"; + // ascent_opts["messages"] = "verbose"; mAscent.open(ascent_opts); @@ -160,7 +160,7 @@ void updateFieldData() auto data = [&]() { occa::memory o_fldOut; if (interpolate || uniform) { - auto o_tmp = platform->o_memPool.reserve(mesh_vis->Nlocal); + auto o_tmp = platform->deviceMemoryPool.reserve(mesh_vis->Nlocal); if (uniform) { mesh_fld->interpolate(o_fldIn.at(idim), mesh_vis, o_tmp, true); } else { @@ -178,13 +178,13 @@ void updateFieldData() } else { occa::memory o_tmp; if (stageThroughHost) { - o_tmp = platform->memPool.reserve(mesh_vis->Nlocal); + o_tmp = platform->memoryPool.reserve(mesh_vis->Nlocal); auto ptr = o_tmp.ptr(); for (int i = 0; i < o_tmp.size(); i++) { ptr[i] = 0.0; } } else { - o_tmp = platform->o_memPool.reserve(mesh_vis->Nlocal); + o_tmp = platform->deviceMemoryPool.reserve(mesh_vis->Nlocal); platform->linAlg->fill(o_tmp.size(), 0.0, o_tmp); } o_tmp.copyFrom(o_fldIn.at(idim), o_fldIn.at(idim).size()); @@ -234,9 +234,7 @@ void updateFieldData() } // namespace -void addVariable(const std::string &name, - mesh_t *mesh_fld, - const std::vector> &fld) +void addVariable(const std::string &name, mesh_t *mesh_fld, const std::vector> &fld) { std::vector fld_; for (const auto &entry : fld) { @@ -255,11 +253,10 @@ void clearData() } void setup(mesh_t *mesh_, - const std::string &actionFile, - int Nin_ = 0, - bool uniform_ = false, - bool stageThroughHost_ = false - bool async_ = false) + const std::string &actionFile, + int Nin_ = 0, + bool uniform_ = false, + bool stageThroughHost_ = false bool async_ = false) { mesh_in = mesh_; const int Nin = (Nin_) ? 
Nin_ : mesh_in->N; @@ -270,7 +267,9 @@ void setup(mesh_t *mesh_, stageThroughHost = false; } - if (stageThroughHost) async = true; + if (stageThroughHost) { + async = true; + } if (async) { int provided; @@ -346,7 +345,7 @@ void setup(mesh_t *mesh_, std::vector etov(Nverts); occa::memory o_etov; - if (stageThroughHost) { + if (stageThroughHost) { o_etov = platform->device.mallocHost(etov.size()); } else { o_etov = platform->device.malloc(etov.size()); diff --git a/src/plugins/tavg.cpp b/src/plugins/tavg.cpp index 8eacee9c7..00935a124 100644 --- a/src/plugins/tavg.cpp +++ b/src/plugins/tavg.cpp @@ -6,8 +6,6 @@ // private members namespace { -ogs_t *ogs; - dlong fieldOffset; std::vector< std::vector> > userFieldList; diff --git a/src/pointInterpolation/findpts/findpts.cpp b/src/pointInterpolation/findpts/findpts.cpp index 6aead3d99..9e0a27214 100644 --- a/src/pointInterpolation/findpts/findpts.cpp +++ b/src/pointInterpolation/findpts/findpts.cpp @@ -54,8 +54,8 @@ struct hash_data_3 { }; struct findpts_dummy_ms_data { - unsigned int *nsid; - double *distfint; + unsigned int *nsid; + double *distfint; }; struct findpts_data_3 { @@ -64,7 +64,7 @@ struct findpts_data_3 { struct hash_data_3 hash; struct array savpt; struct findpts_dummy_ms_data fdms; - uint fevsetup; + uint fevsetup; }; auto *gslibFindptsSetup(MPI_Comm mpi_comm, @@ -145,98 +145,6 @@ auto *gslibFindptsSetup(MPI_Comm mpi_comm, namespace findpts { -namespace -{ - -namespace pool -{ -static occa::memory o_scratch; -static occa::memory h_out; -static occa::memory h_r; -static occa::memory h_el; -static occa::memory h_dist2; -static occa::memory h_code; - -static dfloat *out; -static dfloat *r; -static dlong *el; -static dfloat *dist2; -static dlong *code; - -static void manageBuffers(dlong pn, dlong outputOffset, dlong nOutputFields) -{ - if (pn == 0) { - return; - } - - dlong Nbytes = 0; - Nbytes += pn * sizeof(dlong); // code - Nbytes += pn * sizeof(dlong); // element - Nbytes += pn * sizeof(dlong); // elsid - Nbytes += pn * sizeof(dlong); // session - Nbytes += pn * sizeof(dfloat); // dist2 - Nbytes += pn * sizeof(dfloat); // disti - Nbytes += dim * pn * sizeof(dfloat); // r,s,t data - Nbytes += dim * pn * sizeof(dfloat); // x,y,z coordinates - Nbytes += nOutputFields * outputOffset * sizeof(dfloat); // output buffer - - if (Nbytes > pool::o_scratch.byte_size()) { - if (pool::o_scratch.byte_size()) { - pool::o_scratch.free(); - } - void *buffer = std::calloc(Nbytes, 1); - pool::o_scratch = platform->device.malloc(Nbytes); - pool::o_scratch.copyFrom(buffer); - std::free(buffer); - } - - const auto NbytesR = dim * pn * sizeof(dfloat); - if (NbytesR > pool::h_r.size()) { - if (pool::h_r.size()) { - pool::h_r.free(); - } - pool::h_r = platform->device.mallocHost(NbytesR); - pool::r = (dfloat *)pool::h_r.ptr(); - } - - const auto NbytesEl = pn * sizeof(dlong); - if (NbytesEl > pool::h_el.size()) { - if (pool::h_el.size()) { - pool::h_el.free(); - } - pool::h_el = platform->device.mallocHost(NbytesEl); - pool::el = (dlong *)pool::h_el.ptr(); - } - - const auto NbytesCode = pn * sizeof(dlong); - if (NbytesCode > pool::h_code.size()) { - if (pool::h_code.size()) { - pool::h_code.free(); - } - pool::h_code = platform->device.mallocHost(NbytesCode); - pool::code = (dlong *)pool::h_code.ptr(); - } - - const auto NbytesDist2 = pn * sizeof(dfloat); - if (NbytesDist2 > pool::h_dist2.size()) { - if (pool::h_dist2.size()) { - pool::h_dist2.free(); - } - pool::h_dist2 = platform->device.mallocHost(NbytesDist2); - pool::dist2 = (dfloat 
*)pool::h_dist2.ptr(); - } - - const auto NbytesOut = nOutputFields * outputOffset * sizeof(dfloat); - if (NbytesOut > pool::h_out.size() && NbytesOut > 0) { - if (pool::h_out.size()) { - pool::h_out.free(); - } - pool::h_out = platform->device.mallocHost(NbytesOut); - pool::out = (dfloat *)pool::h_out.ptr(); - } -} -} // namespace pool -} // namespace void findpts_t::findptsLocal(int *const code, int *const el, @@ -268,45 +176,24 @@ void findpts_t::findptsLocal(int *const code, return; } - pool::manageBuffers(pn, 0, 0); - - dlong byteOffset = 0; - - occa::memory o_code = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_el = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_elsid = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_sess = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_r = pool::o_scratch + byteOffset; - byteOffset += dim * sizeof(dfloat) * pn; - - occa::memory o_dist2 = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; - - occa::memory o_disti = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; - - occa::memory o_xint = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; + auto o_code = platform->deviceMemoryPool.reserve(pn); + auto o_el = platform->deviceMemoryPool.reserve(pn); + auto o_elsid = platform->deviceMemoryPool.reserve(pn); + auto o_r = platform->deviceMemoryPool.reserve(dim * pn); + auto o_dist2 = platform->deviceMemoryPool.reserve(pn); + auto o_disti = platform->deviceMemoryPool.reserve(pn); - occa::memory o_yint = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; + auto o_sess = platform->deviceMemoryPool.reserve(pn); + auto o_xint = platform->deviceMemoryPool.reserve(pn); + auto o_yint = platform->deviceMemoryPool.reserve(pn); + auto o_zint = platform->deviceMemoryPool.reserve(pn); - occa::memory o_zint = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; + o_xint.copyFrom(x); + o_yint.copyFrom(y); + o_zint.copyFrom(z); - o_xint.copyFrom(x, sizeof(dfloat) * pn); - o_yint.copyFrom(y, sizeof(dfloat) * pn); - o_zint.copyFrom(z, sizeof(dfloat) * pn); if (useMultiSessionSupport) { - o_sess.copyFrom(sess, sizeof(dlong) * pn); + o_sess.copyFrom(sess); } if (timerLevel != TimerLevel::None) { @@ -347,13 +234,14 @@ void findpts_t::findptsLocal(int *const code, } if (pn > 0) { - o_code.copyTo(code, sizeof(dlong) * pn); - o_el.copyTo(el, sizeof(dlong) * pn); - o_elsid.copyTo(elsid, sizeof(dlong) * pn); - o_r.copyTo(r, dim * sizeof(dfloat) * pn); - o_dist2.copyTo(dist2, sizeof(dfloat) * pn); - o_disti.copyTo(disti, sizeof(dfloat) * pn); + o_code.copyTo(code); + o_el.copyTo(el); + o_elsid.copyTo(elsid); + o_r.copyTo(r); + o_dist2.copyTo(dist2); + o_disti.copyTo(disti); } + if (timerLevel == TimerLevel::Detailed) { platform->timer.toc(timerName + "findptsLocal"); } @@ -388,27 +276,12 @@ void findpts_t::findptsLocal(int *const code, return; } - pool::manageBuffers(pn, 0, 0); - - dlong byteOffset = 0; - - occa::memory o_code = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_el = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_elsid = pool::o_scratch + byteOffset; - byteOffset += sizeof(dlong) * pn; - - occa::memory o_r = pool::o_scratch + byteOffset; - byteOffset += dim * sizeof(dfloat) * pn; - - occa::memory o_dist2 = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * 
pn; - - occa::memory o_disti = pool::o_scratch + byteOffset; - byteOffset += sizeof(dfloat) * pn; + auto o_code = platform->deviceMemoryPool.reserve(pn); + auto o_el = platform->deviceMemoryPool.reserve(pn); + auto o_elsid = platform->deviceMemoryPool.reserve(pn); + auto o_r = platform->deviceMemoryPool.reserve(dim * pn); + auto o_dist2 = platform->deviceMemoryPool.reserve(pn); + auto o_disti = platform->deviceMemoryPool.reserve(pn); if (timerLevel != TimerLevel::None) { platform->timer.tic(timerName + "findptsLocal::localKernel"); @@ -444,452 +317,238 @@ void findpts_t::findptsLocal(int *const code, o_r, o_dist2, o_disti); + if (timerLevel != TimerLevel::None) { platform->timer.toc(timerName + "findptsLocal::localKernel"); } if (pn > 0) { - o_code.copyTo(code, sizeof(dlong) * pn); - o_el.copyTo(el, sizeof(dlong) * pn); - o_elsid.copyTo(elsid, sizeof(dlong) * pn); - o_r.copyTo(r, dim * sizeof(dfloat) * pn); - o_dist2.copyTo(dist2, sizeof(dfloat) * pn); - o_disti.copyTo(disti, sizeof(dfloat) * pn); + o_code.copyTo(code); + o_el.copyTo(el); + o_elsid.copyTo(elsid); + o_r.copyTo(r); + o_dist2.copyTo(dist2); + o_disti.copyTo(disti); } + if (timerLevel == TimerLevel::Detailed) { platform->timer.toc(timerName + "findptsLocal"); } } template -void findpts_t::findptsLocalEvalInternal(OutputType *opt, - const evalSrcPt_t *spt, - const int pn, - const int nFields, - const int inputOffset, - const int outputOffset, - const occa::memory &o_in) +void findpts_t::findptsEvalImpl(occa::memory &o_out, + dlong findPtsDataOffset, + data_t *findPtsData, + const dlong npt, + const int nFields, + const dlong inputOffset, + const dlong outputOffset, + const occa::memory &o_in, + hashData_t &hash, + crystal &cr) { if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsLocalEvalInternal"); - } - - if (pn == 0) { - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsLocalEvalInternal"); - } - - // provide 0-time tic/toc to allow global reduction later in timer reporting - if (timerLevel != TimerLevel::None) { - platform->timer.tic(timerName + "findptsLocalEvalInternal::localEvalKernel"); - platform->timer.toc(timerName + "findptsLocalEvalInternal::localEvalKernel"); - } - return; + platform->timer.tic(timerName + "findptsEvalImpl"); } - pool::manageBuffers(pn, outputOffset, nFields); - - dlong byteOffset = 0; - - auto o_out = pool::o_scratch; - byteOffset += nFields * pn * sizeof(dfloat); - - auto o_r = pool::o_scratch + byteOffset; - byteOffset += dim * pn * sizeof(dfloat); - - auto o_el = pool::o_scratch + byteOffset; - byteOffset += pn * sizeof(dlong); - - // pack host buffers - for (int point = 0; point < pn; ++point) { - for (int component = 0; component < dim; ++component) { - pool::r[dim * point + component] = spt[point].r[component]; - } - pool::el[point] = spt[point].el; + static std::vector out; + if (out.size() < nFields * outputOffset) { + constexpr int growthFactor = 2; + out.resize(growthFactor * nFields * outputOffset); } - o_r.copyFrom(pool::r, dim * pn * sizeof(dfloat)); - o_el.copyFrom(pool::el, pn * sizeof(dlong)); - - if (timerLevel != TimerLevel::None) { - platform->timer.tic(timerName + "findptsLocalEvalInternal::localEvalKernel"); - } - this->localEvalKernel(pn, nFields, inputOffset, outputOffset, o_el, o_r, o_in, o_out); + // evaluate local points if (timerLevel != TimerLevel::None) { - platform->timer.toc(timerName + "findptsLocalEvalInternal::localEvalKernel"); + platform->timer.tic(timerName + "findptsEvalImpl::eval local 
points"); } + if (npt > 0) { + platform->device.occaDevice().setStream(localEvalStream); - o_out.copyTo(pool::out, nFields * outputOffset * sizeof(dfloat)); + this->localEvalMaskKernel(npt, + nFields, + inputOffset, + outputOffset, + this->rank, + this->o_proc + findPtsDataOffset, + this->o_code + findPtsDataOffset, + this->o_el + findPtsDataOffset, + this->o_r + dim * findPtsDataOffset, + o_in, + o_out); - // unpack buffer - for (int point = 0; point < pn; ++point) { - for (int field = 0; field < nFields; ++field) { - opt[point].out[field] = pool::out[point + field * outputOffset]; - } - } - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsLocalEvalInternal"); - } -} + o_out.copyTo(out.data(), nFields * outputOffset, 0, "async: true"); + platform->device.occaDevice().setStream(defaultStream); -template -void findpts_t::findptsEvalImpl(occa::memory &o_out, - const int *const code_base, - const int *const proc_base, - const int *const el_base, - const dfloat *const r_base, - const int npt, - const int nFields, - const int inputOffset, - const int outputOffset, - const occa::memory &o_in, - hashData_t &hash, - crystal &cr) -{ - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl"); } - static std::vector out_base; - - if (out_base.size() < nFields * outputOffset) { - constexpr int growthFactor = 2; - out_base.resize(growthFactor * nFields * outputOffset); + if (timerLevel != TimerLevel::None) { + platform->timer.toc(timerName + "findptsEvalImpl::eval local points"); } - struct array src, outpt; - /* copy user data, weed out unfound points, send out */ + // transfer non-local (found on a remote rank) points if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data"); + platform->timer.tic(timerName + "findptsEvalImpl::copy data to target"); } - // avoid doing SoA -> AoS conversion for points that remain on the same rank - { - int index; - const int *code = code_base, *proc = proc_base, *el = el_base; - const dfloat *r = r_base; - int numSend = 0; + if (findPtsData->updateCache) { + const int *code = findPtsData->code_base + findPtsDataOffset; + const int *proc = findPtsData->proc_base + findPtsDataOffset; + const int *el = findPtsData->el_base + findPtsDataOffset; + const dfloat *r = findPtsData->r_base + dim * findPtsDataOffset; - for (index = 0; index < npt; ++index) { - numSend += (code_base[index] != CODE_NOT_FOUND && proc_base[index] != this->rank); + int numSend = 0; + for (int index = 0; index < npt; ++index) { + numSend += (code[index] != CODE_NOT_FOUND && proc[index] != this->rank); } if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data::pack sarray_transfer buf"); + platform->timer.tic(timerName + "findptsEvalImpl::copy data to target::sarray_transfer"); } - evalSrcPt_t *pt; - array_init(evalSrcPt_t, &src, numSend); - pt = (evalSrcPt_t *)src.ptr; - - int ctr = 0; + static struct array src; + array_reserve(evalSrcPt_t, &src, numSend); + auto spt = (evalSrcPt_t *)src.ptr; - for (index = 0; index < npt; ++index) { - if (*code != CODE_NOT_FOUND && *proc != this->rank) { + int cnt = 0; + for (int index = 0; index < npt; ++index) { + if (code[index] != CODE_NOT_FOUND && proc[index] != this->rank) { for (int d = 0; d < dim; ++d) { - pt->r[d] = r[d]; + spt[cnt].r[d] = r[index * dim + d]; } - pt->index = index; - pt->proc = *proc; - pt->el = *el; - ++pt; - } - r += dim; - code++; - proc++; - el++; - } + spt[cnt].index = 
index; + spt[cnt].proc = proc[index]; + spt[cnt].el = el[index]; - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy data::pack sarray_transfer buf"); + cnt++; + } } + src.n = cnt; - src.n = pt - (evalSrcPt_t *)src.ptr; - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data::sarray_transfer"); - } sarray_transfer(evalSrcPt_t, &src, proc, 1, &cr); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy data::sarray_transfer"); - } - } - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy data"); - platform->timer.tic(timerName + "findptsEvalImpl::eval points"); - } - - /* evaluate points, send back */ - { if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::eval points::do allocation"); - } - int n = src.n; - const evalSrcPt_t *spt; - OutputType *opt; - array_init(OutputType, &outpt, n); - outpt.n = n; - spt = (evalSrcPt_t *)src.ptr; - opt = (OutputType *)outpt.ptr; - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points::do allocation"); + platform->timer.toc(timerName + "findptsEvalImpl::copy data to target::sarray_transfer"); } - auto timerNameSave = timerName; - timerName = timerName + "findptsEvalImpl::eval points::"; - findptsLocalEvalInternal(opt, spt, src.n, nFields, inputOffset, src.n, o_in); - timerName = timerNameSave; + // update cache + { + const auto n = src.n; + findPtsData->cache.index.resize(n); + findPtsData->cache.proc.resize(n); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::eval points::copy index and proc"); - } - spt = (evalSrcPt_t *)src.ptr; - opt = (OutputType *)outpt.ptr; - for (; n; --n, ++spt, ++opt) { - opt->index = spt->index; - opt->proc = spt->proc; - } - array_free(&src); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points::copy index and proc"); - platform->timer.tic(timerName + "findptsEvalImpl::eval points::sarray_transfer"); - } - sarray_transfer(OutputType, &outpt, proc, 1, &cr); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points::sarray_transfer"); - } - } + findPtsData->cache.o_el.free(); + findPtsData->cache.o_r.free(); + findPtsData->cache.o_el = platform->deviceMemoryPool.reserve(n); + findPtsData->cache.o_r = platform->deviceMemoryPool.reserve(dim * n); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points"); - platform->timer.tic(timerName + "findptsEvalImpl::copy results"); - } + auto r = platform->memoryPool.reserve(findPtsData->cache.o_r.size()); + auto rPtr = r.ptr(); + auto el = platform->memoryPool.reserve(findPtsData->cache.o_el.size()); + auto elPtr = el.ptr(); - /* copy results to user data */ - { - int n = outpt.n; - OutputType *opt = (OutputType *)outpt.ptr; - - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy results::unpack sarray_transfer buf"); - } - for (; n; --n, ++opt) { - for (int field = 0; field < nFields; ++field) { - out_base[opt->index + outputOffset * field] = opt->out[field]; + auto spt = (evalSrcPt_t *)src.ptr; + for (int i = 0; i < n; i++) { + for (int d = 0; d < dim; ++d) { + rPtr[dim * i + d] = spt[i].r[d]; + } + elPtr[i] = spt[i].el; + findPtsData->cache.index[i] = 
spt[i].index; + findPtsData->cache.proc[i] = spt[i].proc; } - } - if (outputOffset) { - o_out.copyFrom(out_base.data(), nFields * outputOffset * sizeof(dfloat)); - } - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy results::unpack sarray_transfer buf"); - } + findPtsData->cache.o_r.copyFrom(r); + findPtsData->cache.o_el.copyFrom(el); + findPtsData->updateCache = false; - // launch local eval kernel on all points that can be evaluated on the current rank - if (timerLevel != TimerLevel::None) { - platform->timer.tic(timerName + "findptsEvalImpl::copy results::localEvalKernel"); - } - if (npt > 0) { - this->localEvalMaskKernel(npt, - nFields, - inputOffset, - outputOffset, - this->rank, - this->o_proc, - this->o_code, - this->o_el, - this->o_r, - o_in, - o_out); } - if (timerLevel != TimerLevel::None) { - platform->timer.toc(timerName + "findptsEvalImpl::copy results::localEvalKernel"); - } - - array_free(&outpt); - } - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy results"); - platform->timer.toc(timerName + "findptsEvalImpl"); } -} -template -void findpts_t::findptsEvalImpl(dfloat *out, - const int *const code_base, - const int *const proc_base, - const int *const el_base, - const dfloat *const r_base, - const int npt, - const int nFields, - const int inputOffset, - const int outputOffset, - const dfloat *const in, - hashData_t &hash, - crystal &cr) -{ if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl"); + platform->timer.toc(timerName + "findptsEvalImpl::copy data to target"); } - static occa::memory o_in; - static occa::memory o_out; - - { - const auto Nbytes = inputOffset * nFields * sizeof(dfloat); - if (o_in.byte_size() < Nbytes) { - if (o_in.byte_size()) { - o_in.free(); - } - constexpr int growthFactor = 2; - o_in = platform->device.malloc(growthFactor * Nbytes); - } - o_in.copyFrom(in, Nbytes); - } - - { - const auto Nbytes = outputOffset * nFields * sizeof(dfloat); - if (o_out.byte_size() < Nbytes) { - if (o_out.byte_size()) { - o_out.free(); - } - constexpr int growthFactor = 2; - o_out = platform->device.malloc(growthFactor * Nbytes); - } - } - - struct array src, outpt; - /* copy user data, weed out unfound or local points, send out */ + // evaluate non-local points if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data"); + platform->timer.tic(timerName + "findptsEvalImpl::eval non-local points"); } + + static struct array outpt; { - int index; - const int *code = code_base, *proc = proc_base, *el = el_base; - const dfloat *r = r_base; + const dlong n = findPtsData->cache.index.size(); + array_reserve(OutputType, &outpt, n); + outpt.n = n; - int numSend = 0; + auto o_tmp = platform->deviceMemoryPool.reserve(nFields * n); + const dlong offset = o_tmp.size() / nFields; - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data::count_num_send"); + if (timerLevel != TimerLevel::None) { + platform->timer.tic(timerName + "findptsEvalImpl::eval non-local points::kernel"); } - for (index = 0; index < npt; ++index) { - numSend += (code_base[index] != CODE_NOT_FOUND && proc_base[index] != this->rank); + + if (n) { + this->localEvalKernel(n, + nFields, + inputOffset, + offset, + findPtsData->cache.o_el, + findPtsData->cache.o_r, + o_in, + o_tmp); } - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy 
data::count_num_send"); + + if (timerLevel != TimerLevel::None) { + platform->timer.toc(timerName + "findptsEvalImpl::eval non-local points::kernel"); } + + auto tmp = platform->memoryPool.reserve(o_tmp.size()); + o_tmp.copyTo(tmp); - evalSrcPt_t *pt; - array_init(evalSrcPt_t, &src, numSend); - pt = (evalSrcPt_t *)src.ptr; - - int ctr = 0; - - for (index = 0; index < npt; ++index) { - if (*code != CODE_NOT_FOUND && *proc != this->rank) { - for (int d = 0; d < dim; ++d) { - pt->r[d] = r[d]; - } - pt->index = index; - pt->proc = *proc; - pt->el = *el; - ++pt; + auto opt = (OutputType *)outpt.ptr; + auto tmpPtr = tmp.ptr(); + for (dlong i = 0; i < n; i++) { + for (int field = 0; field < nFields; ++field) { + opt[i].out[field] = tmpPtr[i + field * offset]; } - r += dim; - code++; - proc++; - el++; + opt[i].index = findPtsData->cache.index[i]; + opt[i].proc = findPtsData->cache.proc[i]; } - src.n = pt - (evalSrcPt_t *)src.ptr; if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::copy data::sarray_transfer"); + platform->timer.tic(timerName + "findptsEvalImpl::eval non-local points::sarray_transfer"); } - sarray_transfer(evalSrcPt_t, &src, proc, 1, &cr); + + sarray_transfer(OutputType, &outpt, proc, 1, &cr); + if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy data::sarray_transfer"); + platform->timer.toc(timerName + "findptsEvalImpl::eval non-local points::sarray_transfer"); } } if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::copy data"); - platform->timer.tic(timerName + "findptsEvalImpl::eval points"); - } - - /* evaluate points, send back */ - { - int n = src.n; - const evalSrcPt_t *spt; - OutputType *opt; - array_init(OutputType, &outpt, n); - outpt.n = n; - spt = (evalSrcPt_t *)src.ptr; - opt = (OutputType *)outpt.ptr; - - auto timerNameSave = timerName; - timerName = timerName + "findptsEvalImpl::eval points::"; - findptsLocalEvalInternal(opt, spt, src.n, nFields, inputOffset, src.n, o_in); - timerName = timerNameSave; - - spt = (evalSrcPt_t *)src.ptr; - opt = (OutputType *)outpt.ptr; - for (; n; --n, ++spt, ++opt) { - opt->index = spt->index; - opt->proc = spt->proc; - } - array_free(&src); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.tic(timerName + "findptsEvalImpl::eval points::sarray_transfer"); - } - sarray_transfer(OutputType, &outpt, proc, 1, &cr); - if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points::sarray_transfer"); - } + platform->timer.toc(timerName + "findptsEvalImpl::eval non-local points"); } + // copy to user buffer if (timerLevel == TimerLevel::Detailed) { - platform->timer.toc(timerName + "findptsEvalImpl::eval points"); platform->timer.tic(timerName + "findptsEvalImpl::copy results"); } - /* copy results to user data */ { - int n = outpt.n; - OutputType *opt = (OutputType *)outpt.ptr; - - // launch local eval kernel on all points that can be evaluated on the current rank - if (timerLevel != TimerLevel::None) { - platform->timer.tic(timerName + "findptsEvalImpl::localEvalKernel"); - } - if (npt > 0) { - this->localEvalMaskKernel(npt, - nFields, - inputOffset, - outputOffset, - this->rank, - this->o_proc, - this->o_code, - this->o_el, - this->o_r, - o_in, - o_out); - } - if (timerLevel != TimerLevel::None) { - platform->timer.toc(timerName + "findptsEvalImpl::localEvalKernel"); - } + platform->device.occaDevice().setStream(localEvalStream); + 
platform->device.finish(); + platform->device.occaDevice().setStream(defaultStream); - if (outputOffset) { - o_out.copyTo(out, nFields * outputOffset * sizeof(dfloat)); - } - - for (; n; --n, ++opt) { + auto opt = (OutputType *)outpt.ptr; + for (int i = 0; i < outpt.n; i++) { for (int field = 0; field < nFields; ++field) { - out[opt->index + outputOffset * field] = opt->out[field]; + out[opt[i].index + outputOffset * field] = opt[i].out[field]; } } - array_free(&outpt); + if (outputOffset) { + o_out.copyFrom(out.data(), nFields * outputOffset); + } } + if (timerLevel == TimerLevel::Detailed) { platform->timer.toc(timerName + "findptsEvalImpl::copy results"); platform->timer.toc(timerName + "findptsEvalImpl"); @@ -963,8 +622,12 @@ findpts_t::findpts_t(MPI_Comm comm, const int n[dim] = {Nq, Nq, Nq}; const int ms[dim] = {m, m, m}; - if (platform->options.compareArgs("ENABLE FINDPTS DETAILED TIMER", "TRUE")) + defaultStream = platform->device.occaDevice().getStream(); + localEvalStream = platform->device.occaDevice().createStream(); + + if (platform->options.compareArgs("ENABLE FINDPTS DETAILED TIMER", "TRUE")) { this->timerLevel = TimerLevel::Detailed; + } this->_findptsData = gslibFindptsSetup(comm, elx, @@ -979,7 +642,7 @@ findpts_t::findpts_t(MPI_Comm comm, sessionId_, distfint); - auto *findptsData = (findpts_data_3 *)this->_findptsData; + auto findptsData = (findpts_data_3 *)this->_findptsData; this->comm = comm; MPI_Comm_rank(comm, &this->rank); @@ -989,18 +652,18 @@ findpts_t::findpts_t(MPI_Comm comm, this->cr = &findptsData->cr; if (x != nullptr) { - this->o_x = platform->device.malloc(Nlocal * sizeof(dfloat)); - this->o_y = platform->device.malloc(Nlocal * sizeof(dfloat)); - this->o_z = platform->device.malloc(Nlocal * sizeof(dfloat)); + this->o_x = platform->device.malloc(Nlocal); + this->o_y = platform->device.malloc(Nlocal); + this->o_z = platform->device.malloc(Nlocal); if (useMultiSessionSupport) { - this->o_distfint = platform->device.malloc(Nlocal * sizeof(dfloat)); + this->o_distfint = platform->device.malloc(Nlocal); } - this->o_x.copyFrom(x, Nlocal * sizeof(dfloat)); - this->o_y.copyFrom(y, Nlocal * sizeof(dfloat)); - this->o_z.copyFrom(z, Nlocal * sizeof(dfloat)); + this->o_x.copyFrom(x, Nlocal); + this->o_y.copyFrom(y, Nlocal); + this->o_z.copyFrom(z, Nlocal); if (useMultiSessionSupport) { - this->o_distfint.copyFrom(distfint, Nlocal * sizeof(dfloat)); + this->o_distfint.copyFrom(distfint, Nlocal); } std::vector c(dim * Nelements, 0.0); std::vector A(dim * dim * Nelements, 0.0); @@ -1027,15 +690,15 @@ findpts_t::findpts_t(MPI_Comm comm, } } - this->o_c = platform->device.malloc(c.size() * sizeof(dfloat)); - this->o_A = platform->device.malloc(A.size() * sizeof(dfloat)); - this->o_min = platform->device.malloc(minBound.size() * sizeof(dfloat)); - this->o_max = platform->device.malloc(maxBound.size() * sizeof(dfloat)); + this->o_c = platform->device.malloc(c.size()); + this->o_A = platform->device.malloc(A.size()); + this->o_min = platform->device.malloc(minBound.size()); + this->o_max = platform->device.malloc(maxBound.size()); - this->o_c.copyFrom(c.data(), c.size() * sizeof(dfloat)); - this->o_A.copyFrom(A.data(), A.size() * sizeof(dfloat)); - this->o_min.copyFrom(minBound.data(), minBound.size() * sizeof(dfloat)); - this->o_max.copyFrom(maxBound.data(), maxBound.size() * sizeof(dfloat)); + this->o_c.copyFrom(c.data(), c.size()); + this->o_A.copyFrom(A.data(), A.size()); + this->o_min.copyFrom(minBound.data(), minBound.size()); + this->o_max.copyFrom(maxBound.data(), 
maxBound.size()); } auto hash = findptsData->local.hd; @@ -1046,21 +709,23 @@ findpts_t::findpts_t(MPI_Comm comm, hashFac[d] = hash.fac[d]; } this->hash_n = hash.hash_n; - this->o_hashMin = platform->device.malloc(dim * sizeof(dfloat)); - this->o_hashFac = platform->device.malloc(dim * sizeof(dfloat)); - this->o_hashMin.copyFrom(hashMin, dim * sizeof(dfloat)); - this->o_hashFac.copyFrom(hashFac, dim * sizeof(dfloat)); - - this->localEvalKernel = platform->kernelRequests.load("findptsLocalEval"); - this->localEvalMaskKernel = platform->kernelRequests.load("findptsLocalEvalMask"); - this->localKernel = platform->kernelRequests.load("findptsLocal"); - - this->o_wtend_x = platform->device.malloc(6 * Nq * sizeof(dfloat)); - this->o_wtend_y = platform->device.malloc(6 * Nq * sizeof(dfloat)); - this->o_wtend_z = platform->device.malloc(6 * Nq * sizeof(dfloat)); - this->o_wtend_x.copyFrom(findptsData->local.fed.wtend[0], 6 * Nq * sizeof(dfloat)); - this->o_wtend_y.copyFrom(findptsData->local.fed.wtend[1], 6 * Nq * sizeof(dfloat)); - this->o_wtend_z.copyFrom(findptsData->local.fed.wtend[2], 6 * Nq * sizeof(dfloat)); + this->o_hashMin = platform->device.malloc(dim); + this->o_hashFac = platform->device.malloc(dim); + this->o_hashMin.copyFrom(hashMin, dim); + this->o_hashFac.copyFrom(hashFac, dim); + + std::string orderSuffix = "_" + std::to_string(Nq - 1); + + this->localEvalKernel = platform->kernelRequests.load("findptsLocalEval" + orderSuffix); + this->localEvalMaskKernel = platform->kernelRequests.load("findptsLocalEvalMask" + orderSuffix); + this->localKernel = platform->kernelRequests.load("findptsLocal" + orderSuffix); + + this->o_wtend_x = platform->device.malloc(6 * Nq); + this->o_wtend_y = platform->device.malloc(6 * Nq); + this->o_wtend_z = platform->device.malloc(6 * Nq); + this->o_wtend_x.copyFrom(findptsData->local.fed.wtend[0], 6 * Nq); + this->o_wtend_y.copyFrom(findptsData->local.fed.wtend[1], 6 * Nq); + this->o_wtend_z.copyFrom(findptsData->local.fed.wtend[2], 6 * Nq); const auto hd_d_size = getHashSize(findptsData, Nelements, local_hash_size); @@ -1068,8 +733,8 @@ findpts_t::findpts_t(MPI_Comm comm, for (dlong i = 0; i < hd_d_size; ++i) { offsets[i] = findptsData->local.hd.offset[i]; } - this->o_offset = platform->device.malloc(offsets.size() * sizeof(dlong)); - this->o_offset.copyFrom(offsets.data(), offsets.size() * sizeof(dlong)); + this->o_offset = platform->device.malloc(offsets.size()); + this->o_offset.copyFrom(offsets.data(), offsets.size()); } findpts_t::~findpts_t() @@ -1087,11 +752,6 @@ static slong lfloor(dfloat x) return floor(x); } -static slong lceil(dfloat x) -{ - return ceil(x); -} - static ulong hash_index_aux(dfloat low, dfloat fac, ulong n, dfloat x) { const slong i = lfloor((x - low) * fac); @@ -1128,18 +788,13 @@ void findpts_t::find(data_t *const findPtsData, } void findpts_t::find(data_t *const findPtsData, - const occa::memory &o_xintIn, - const occa::memory &o_yintIn, - const occa::memory &o_zintIn, - const occa::memory &o_sessIn, + const occa::memory &o_xint, + const occa::memory &o_yint, + const occa::memory &o_zint, + const occa::memory &o_session, const dlong sessionIdMatch, const dlong npt) { - const auto o_xint = o_xintIn.isInitialized() ? o_xintIn.cast(occa::dtype::byte) : o_xintIn; - const auto o_yint = o_yintIn.isInitialized() ? o_yintIn.cast(occa::dtype::byte) : o_yintIn; - const auto o_zint = o_zintIn.isInitialized() ? o_zintIn.cast(occa::dtype::byte) : o_zintIn; - const auto o_session = o_sessIn.isInitialized() ? 
o_sessIn.cast(occa::dtype::byte) : o_sessIn; - if (timerLevel != TimerLevel::None) { platform->timer.tic(timerName + "find"); } @@ -1174,11 +829,11 @@ void findpts_t::find(data_t *const findPtsData, platform->timer.tic(timerName + "find::initial copy op"); if (npt) { - o_xint.copyTo(x_base.data(), npt * sizeof(dfloat)); - o_yint.copyTo(y_base.data(), npt * sizeof(dfloat)); - o_zint.copyTo(z_base.data(), npt * sizeof(dfloat)); + o_xint.copyTo(x_base.data(), npt); + o_yint.copyTo(y_base.data(), npt); + o_zint.copyTo(z_base.data(), npt); if (useMultiSessionSupport) { - o_session.copyTo(session.data(), npt * sizeof(dlong)); + o_session.copyTo(session.data(), npt); } else { std::fill(session.begin(), session.end(), 0); } @@ -1218,7 +873,6 @@ void findpts_t::find(data_t *const findPtsData, platform->timer.tic(timerName + "find::unfound"); } { - int index; int *code = code_base, *proc = proc_base; const dfloat *xp[dim]; struct srcPt_t *pt; @@ -1232,7 +886,7 @@ void findpts_t::find(data_t *const findPtsData, dfloat x[dim]; - for (index = 0; index < npt; ++index) { + for (int index = 0; index < npt; ++index) { for (int d = 0; d < dim; ++d) { x[d] = *xp[d]; } @@ -1489,8 +1143,7 @@ void findpts_t::find(data_t *const findPtsData, cselsid = elsid[index]; csproc = proc_base[index]; csel = el_base[index]; - unsigned d; - for (d = 0; d < findpts::dim; ++d) { + for (int d = 0; d < findpts::dim; ++d) { csr[d] = r_base[d]; } ioriginator = 0; @@ -1507,8 +1160,7 @@ void findpts_t::find(data_t *const findPtsData, cselsid = opt->elsid; csproc = opt->proc; csel = opt->el; - unsigned d; - for (d = 0; d < findpts::dim; ++d) { + for (int d = 0; d < findpts::dim; ++d) { csr[d] = opt->r[d]; } } @@ -1522,8 +1174,7 @@ void findpts_t::find(data_t *const findPtsData, aselsid = cselsid; asproc = csproc; asel = csel; - unsigned d; - for (d = 0; d < findpts::dim; ++d) { + for (int d = 0; d < findpts::dim; ++d) { asr[d] = csr[d]; } } @@ -1568,23 +1219,23 @@ void findpts_t::find(data_t *const findPtsData, } if (npt) { - if (o_code.byte_size() < npt * sizeof(dlong)) { - if (o_code.byte_size()) { + if (o_code.size() < npt) { + if (o_code.size()) { o_code.free(); o_el.free(); o_r.free(); o_proc.free(); } - o_code = platform->device.malloc(npt * sizeof(dlong)); - o_el = platform->device.malloc(npt * sizeof(dlong)); - o_r = platform->device.malloc(npt * dim * sizeof(dfloat)); - o_proc = platform->device.malloc(npt * sizeof(dlong)); + o_code = platform->device.malloc(npt); + o_el = platform->device.malloc(npt); + o_r = platform->device.malloc(npt * dim); + o_proc = platform->device.malloc(npt); } - o_code.copyFrom(code_base, npt * sizeof(dlong)); - o_el.copyFrom(el_base, npt * sizeof(dlong)); - o_r.copyFrom(r_base, npt * dim * sizeof(dfloat)); - o_proc.copyFrom(proc_base, npt * sizeof(dlong)); + o_code.copyFrom(code_base, npt); + o_el.copyFrom(el_base, npt); + o_r.copyFrom(r_base, npt * dim); + o_proc.copyFrom(proc_base, npt); } if (timerLevel == TimerLevel::Detailed) { platform->timer.toc(timerName + "find::copy to device"); @@ -1613,10 +1264,10 @@ void findpts_t::find(data_t *const findPtsData, { occa::memory o_xint, o_yint, o_zint, o_session; if (npt > 0) { - o_xint = platform->o_memPool.reserve(npt); - o_yint = platform->o_memPool.reserve(npt); - o_zint = platform->o_memPool.reserve(npt); - o_session = platform->o_memPool.reserve(npt); + o_xint = platform->deviceMemoryPool.reserve(npt); + o_yint = platform->deviceMemoryPool.reserve(npt); + o_zint = platform->deviceMemoryPool.reserve(npt); + o_session = 
platform->deviceMemoryPool.reserve(npt); } o_xint.copyFrom(x_base, npt); @@ -1638,25 +1289,18 @@ void findpts_t::find(data_t *const findPtsData, void findpts_t::eval(const dlong npt, const occa::memory &o_in, data_t *findPtsData, occa::memory &o_out) { - this->eval(npt, 1, 0, npt, o_in, findPtsData, o_out); -} - -void findpts_t::eval(const dlong npt, dfloat *in, data_t *findPtsData, dfloat *out) -{ - this->eval(npt, 1, 0, npt, in, findPtsData, out); + this->eval(npt, 0, 1, 0, npt, o_in, findPtsData, o_out); } void findpts_t::eval(const dlong npt, + const dlong findPtsOffset, const dlong nFields, const dlong inputOffset, const dlong outputOffset, - const occa::memory &o_fldIn, + const occa::memory &o_in, data_t *findPtsData, - occa::memory &o_fldOut) + occa::memory &o_out) { - auto o_in = o_fldIn.isInitialized() ? o_fldIn.cast(occa::dtype::byte) : o_fldIn; - auto o_out = o_fldOut.isInitialized() ? o_fldOut.cast(occa::dtype::byte) : o_fldOut; - if (timerLevel != TimerLevel::None) { platform->timer.tic(timerName + "eval"); } @@ -1670,10 +1314,8 @@ void findpts_t::eval(const dlong npt, return; } findptsEvalImpl>(o_out, - findPtsData->code_base, - findPtsData->proc_base, - findPtsData->el_base, - findPtsData->r_base, + findPtsOffset, + findPtsData, npt, nFields, inputOffset, @@ -1684,55 +1326,7 @@ void findpts_t::eval(const dlong npt, }); nekrsCheck(nFields < 1 || nFields > findpts_t::maxFields, - platform->comm.mpiComm, - EXIT_FAILURE, - "Error: nFields = %d is not supported. nFields must be between 1 and %d.", - nFields, - findpts_t::maxFields); - - timerName = timerNameSave; - - if (timerLevel != TimerLevel::None) { - platform->timer.toc(timerName + "eval"); - } -} - -void findpts_t::eval(const dlong npt, - const dlong nFields, - const dlong inputOffset, - const dlong outputOffset, - dfloat *in, - data_t *findPtsData, - dfloat *out) -{ - if (timerLevel != TimerLevel::None) { - platform->timer.tic(timerName + "eval"); - } - - const auto timerNameSave = timerName; - timerName = timerName + "eval::"; - auto fieldSizesTuple = n_tuple{}; - tuple_for_each(fieldSizesTuple, [&](auto T) { - if (nFields != decltype(T)::value) { - return; - } - - findptsEvalImpl>(out, - findPtsData->code_base, - findPtsData->proc_base, - findPtsData->el_base, - findPtsData->r_base, - npt, - nFields, - inputOffset, - outputOffset, - in, - *this->hash, - *this->cr); - }); - - nekrsCheck(nFields < 1 || nFields > findpts_t::maxFields, - platform->comm.mpiComm, + MPI_COMM_SELF, EXIT_FAILURE, "Error: nFields = %d is not supported. 
nFields must be between 1 and %d.", nFields, @@ -1750,29 +1344,4 @@ crystal *findpts_t::crystalRouter() return this->cr; } -void findpts_t::o_update(data_t &data) -{ - auto npt = data.code.size(); - if (npt == 0) { - return; - } - if (o_code.byte_size() < npt * sizeof(dlong)) { - if (o_code.byte_size()) { - o_code.free(); - o_el.free(); - o_r.free(); - o_proc.free(); - } - o_code = platform->device.malloc(npt * sizeof(dlong)); - o_el = platform->device.malloc(npt * sizeof(dlong)); - o_r = platform->device.malloc(npt * dim * sizeof(dfloat)); - o_proc = platform->device.malloc(npt * sizeof(dlong)); - } - - o_code.copyFrom(data.code_base, npt * sizeof(dlong)); - o_el.copyFrom(data.el_base, npt * sizeof(dlong)); - o_r.copyFrom(data.r_base, npt * dim * sizeof(dfloat)); - o_proc.copyFrom(data.proc_base, npt * sizeof(dlong)); -} - } // namespace findpts diff --git a/src/pointInterpolation/findpts/findpts.hpp b/src/pointInterpolation/findpts/findpts.hpp index 643610319..7e26402e3 100644 --- a/src/pointInterpolation/findpts/findpts.hpp +++ b/src/pointInterpolation/findpts/findpts.hpp @@ -21,7 +21,18 @@ static constexpr int CODE_BORDER = 1; static constexpr int CODE_NOT_FOUND = 2; static constexpr int dim = 3; +// src cache on target +struct cache_t { + occa::memory o_el; + occa::memory o_r; + std::vector proc; + std::vector index; +}; + struct data_t { + bool updateCache = true; + cache_t cache; + std::vector code; std::vector proc; std::vector el; @@ -128,6 +139,7 @@ class findpts_t void eval(const dlong npt, const occa::memory &o_in, data_t *findPtsData, occa::memory &o_out); void eval(const dlong npt, + const dlong offset, const dlong nFields, const dlong inputOffset, const dlong outputOffset, @@ -135,17 +147,6 @@ class findpts_t data_t *findPtsData, occa::memory &o_out); - // Host versions (copies to device when needed) - void eval(const dlong npt, dfloat *in, data_t *findPtsData, dfloat *out); - - void eval(const dlong npt, - const dlong nFields, - const dlong inputOffset, - const dlong outputOffset, - dfloat *in, - data_t *findPtsData, - dfloat *out); - // set timer level void setTimerLevel(TimerLevel level) { @@ -166,11 +167,6 @@ class findpts_t crystal *crystalRouter(); - // For use in, e.g., nek-nek - // If altering code, proc, el, r, or dist2 after a find call, - // update device arrays with this function - void o_update(data_t &data); - private: static constexpr int maxFields = 30; @@ -188,6 +184,9 @@ class findpts_t occa::kernel localEvalMaskKernel; occa::kernel localKernel; + occa::stream defaultStream; + occa::stream localEvalStream; + // data for elx occa::memory o_x; occa::memory o_y; @@ -260,42 +259,18 @@ class findpts_t const int sessionIdMatch, const int pn); - template - void findptsEvalImpl(dfloat *out, - const int *const code_base, - const int *const proc_base, - const int *const el_base, - const dfloat *const r_base, - const int npt, - const int nFields, - const int inputOffset, - const int outputOffset, - const dfloat *const in, - hashData_t &hash, - crystal &cr); - template void findptsEvalImpl(occa::memory &o_out, - const int *const code_base, - const int *const proc_base, - const int *const el_base, - const dfloat *const r_base, - const int npt, + dlong offset, + data_t *findPtsData, + const dlong npt, const int nFields, - const int inputOffset, - const int outputOffset, + const dlong inputOffset, + const dlong outputOffset, const occa::memory &o_in, hashData_t &hash, crystal &cr); - template - void findptsLocalEvalInternal(OutputType *opt, - const evalSrcPt_t *spt, - 
const int pn, - const int nFields, - const int inputOffset, - const int outputOffset, - const occa::memory &o_in); }; } // namespace findpts diff --git a/src/pointInterpolation/findpts/kernels/findptsLocal.okl b/src/pointInterpolation/findpts/kernels/findptsLocal.okl index 6d1995ced..ee6591705 100644 --- a/src/pointInterpolation/findpts/kernels/findptsLocal.okl +++ b/src/pointInterpolation/findpts/kernels/findptsLocal.okl @@ -1414,6 +1414,7 @@ dfloat tensor_ig3_j(@ restrict dfloat *g_partials, bool converged_internal = (fpt->flags & FLAG_MASK) == CONVERGED_FLAG; if (*code_i == CODE_NOT_FOUND || converged_internal || fpt->dist2 < *dist2_i) { + @barrier(); // ensure if-condition is evaluated first before changing code_i and dist2_i for (dlong j = 0; j < p_innerSize; ++j; @inner) { if (j == 0) { *el_i = el; @@ -1431,6 +1432,7 @@ dfloat tensor_ig3_j(@ restrict dfloat *g_partials, } } @barrier(); + if (converged_internal) { break; } diff --git a/src/pointInterpolation/findpts/kernels/findptsLocalEval.okl b/src/pointInterpolation/findpts/kernels/findptsLocalEval.okl index 8f8377916..5c44a697c 100644 --- a/src/pointInterpolation/findpts/kernels/findptsLocalEval.okl +++ b/src/pointInterpolation/findpts/kernels/findptsLocalEval.okl @@ -1,6 +1,6 @@ #if 1 @kernel void findptsLocalEval(const dlong pn, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dlong outputOffset, @ restrict const dlong *const el, diff --git a/src/pointInterpolation/findpts/kernels/findptsLocalEvalMask.okl b/src/pointInterpolation/findpts/kernels/findptsLocalEvalMask.okl index fe5d123ad..bcd452c95 100644 --- a/src/pointInterpolation/findpts/kernels/findptsLocalEvalMask.okl +++ b/src/pointInterpolation/findpts/kernels/findptsLocalEvalMask.okl @@ -1,7 +1,7 @@ #if 1 #define CODE_NOT_FOUND (2) @kernel void findptsLocalEvalMask(const dlong pn, - const dlong Nfields, + const int Nfields, const dlong fieldOffset, const dlong outputOffset, const dlong myRank, diff --git a/src/pointInterpolation/pointInterpolation.cpp b/src/pointInterpolation/pointInterpolation.cpp index 3c190f95c..868ac74ec 100644 --- a/src/pointInterpolation/pointInterpolation.cpp +++ b/src/pointInterpolation/pointInterpolation.cpp @@ -3,31 +3,19 @@ #include "findpts.hpp" #include "pointInterpolation.hpp" -pointInterpolation_t::pointInterpolation_t(mesh_t *mesh, - MPI_Comm comm, - bool mySession_, - std::vector bID) - : pointInterpolation_t(mesh, - comm, - mesh->Nlocal, - mesh->Nlocal, - 0.01, - 0, - true, - bID) -{ -} - pointInterpolation_t::pointInterpolation_t(mesh_t *mesh_, MPI_Comm comm, - dlong localHashSize, - dlong globalHashSize, + bool mySession_, + std::vector bIntID, double bb_tol, double newton_tol_, - bool mySession_, - std::vector bIntID) - : mesh(mesh_), newton_tol(newton_tol_), mySession(mySession_), nPoints(0) + dlong localHashSize, + dlong globalHashSize) + + : mesh(mesh_), mySession(mySession_), nPoints(0) { + if (localHashSize == 0) localHashSize = mesh->Nlocal; + if (globalHashSize == 0) globalHashSize = mesh->Nlocal; // communicator is implicitly required to be either platform->comm.mpiComm or platform->comm.mpiCommParent // due to other communicator synchronous calls, such as platform->timer.tic @@ -44,13 +32,11 @@ pointInterpolation_t::pointInterpolation_t(mesh_t *mesh_, "Communicator must be either platform->comm.mpiComm or platform->comm.mpiCommParent"); newton_tol = - (sizeof(dfloat) == sizeof(double)) - ? std::max(5e-13, newton_tol_) - : std::max(1e-6, newton_tol_); + (sizeof(dfloat) == sizeof(double)) ? 
std::max(5e-13, newton_tol_) : std::max(1e-6, newton_tol_); - auto x = platform->memPool.reserve(mesh->Nlocal); - auto y = platform->memPool.reserve(mesh->Nlocal); - auto z = platform->memPool.reserve(mesh->Nlocal); + auto x = platform->memoryPool.reserve(mesh->Nlocal); + auto y = platform->memoryPool.reserve(mesh->Nlocal); + auto z = platform->memoryPool.reserve(mesh->Nlocal); if (mySession) { mesh->o_x.copyTo(x, mesh->Nlocal); @@ -60,7 +46,7 @@ pointInterpolation_t::pointInterpolation_t(mesh_t *mesh_, std::vector distanceINT; if (bIntID.size()) { - auto o_bIntID = platform->o_memPool.reserve(bIntID.size()); + auto o_bIntID = platform->deviceMemoryPool.reserve(bIntID.size()); o_bIntID.copyFrom(bIntID.data()); _o_distanceINT = mesh->minDistance(bIntID.size(), o_bIntID, "cheap_dist"); distanceINT.resize(mesh->Nlocal); @@ -137,30 +123,41 @@ void pointInterpolation_t::find(pointInterpolation_t::VerbosityLevel verbosity, _o_z.copyTo(h_z, n); } + const auto maxVerbosePoints = 5; + dlong nOutside = 0; dlong nBoundary = 0; + dfloat maxDistNorm = 0; for (int in = 0; in < n; ++in) { if (data_.code_base[in] == findpts::CODE_BORDER) { if (data_.dist2_base[in] > 10 * newton_tol) { - nBoundary += 1; - if (nBoundary < 5 && verbosity == VerbosityLevel::Detailed) { - std::cout << " WARNING: point on boundary or outside the mesh distNorm2: " << h_x[in] << "," - << h_y[in] << ", " << h_z[in] << ", " << data_.dist2_base[in] << std::endl; + const auto distNorm = data_.dist2_base[in]; + maxDistNorm = std::max(maxDistNorm, distNorm); + nBoundary++; + if (nBoundary < maxVerbosePoints && verbosity == VerbosityLevel::Detailed) { + std::cout << "pointInterpolation_t::find: WARNING point on boundary or outside the mesh" + << " xyz= " << h_x[in] << " " << h_y[in] << " " << h_z[in] + << " distNorm= " << std::scientific << std::setprecision(3) << distNorm << std::endl; } } } else if (data_.code_base[in] == findpts::CODE_NOT_FOUND) { - nOutside += 1; - if (nOutside < 5 && verbosity == VerbosityLevel::Detailed) { - std::cout << " WARNING: point not within mesh xy[z]: " << h_x[in] << "," << h_y[in] << ", " - << h_z[in] << std::endl; + nOutside++; + if (nOutside < maxVerbosePoints && verbosity == VerbosityLevel::Detailed) { + std::cout << "pointInterpolation_t::find: WARNING point outside the mesh" + << " xyz= " << h_x[in] << " " << h_y[in] << " " << h_z[in] << std::endl; } } } + std::array counts = {n, nBoundary, nOutside}; MPI_Allreduce(MPI_IN_PLACE, counts.data(), counts.size(), MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - if (platform->comm.mpiRank == 0 && counts[2] > 0) { - std::cout << "WARNING interp::find - total = " << counts[0] << ", boundary = " << counts[1] - << ", outside = " << counts[2] << "\n"; + MPI_Allreduce(MPI_IN_PLACE, &maxDistNorm, 1, MPI_DFLOAT, MPI_MAX, platform->comm.mpiComm); + + if (platform->comm.mpiRank == 0 && verbosity == VerbosityLevel::Detailed) { + std::cout << "pointInterpolation_t::find:" + << " total= " << counts[0] << " boundary= " << counts[1] << " (max distNorm=" << maxDistNorm + << ")" + << " outside= " << counts[2] << std::endl; } } @@ -169,89 +166,82 @@ void pointInterpolation_t::find(pointInterpolation_t::VerbosityLevel verbosity, } findCalled = true; + data_.updateCache = true; } void pointInterpolation_t::eval(dlong nFields, dlong inputFieldOffset, - const occa::memory& o_in, + const occa::memory &o_in, dlong outputFieldOffset, - occa::memory &o_out) + occa::memory &o_out, + dlong nPointsIn, + dlong offset) { - nekrsCheck(!findCalled, - platform->comm.mpiComm, - EXIT_FAILURE, 
- "%s\n", - "find has not been called prior to eval!"); + if (inputFieldOffset == 0) { + inputFieldOffset = o_in.size(); + } + if (outputFieldOffset == 0) { + outputFieldOffset = o_out.size(); + } - if (timerLevel != TimerLevel::None) { - platform->timer.tic("pointInterpolation_t::eval"); + auto nPoints_ = (nPointsIn > -1) ? nPointsIn : nPoints; + if (nPointsIn >= 0) { + data_.updateCache = true; // enforce update as cache cannot be used } - nekrsCheck(mesh->Nlocal > inputFieldOffset, - platform->comm.mpiComm, + nekrsCheck(!findCalled, MPI_COMM_SELF, EXIT_FAILURE, "%s\n", "find has not been called prior to eval!"); + + nekrsCheck(nFields > 1 && mesh->Nlocal > inputFieldOffset, + MPI_COMM_SELF, EXIT_FAILURE, "pointInterpolation_t::eval inputFieldOffset (%d) is less than mesh->Nlocal (%d)\n", inputFieldOffset, mesh->Nlocal); + nekrsCheck(nFields > 1 && nPoints_ > outputFieldOffset, + MPI_COMM_SELF, + EXIT_FAILURE, + "pointInterpolation_t::eval outputFieldOffset (%d) is less than nPoints (%d)\n", + inputFieldOffset, + nPoints_); + nekrsCheck(o_in.byte_size() < nFields * inputFieldOffset * sizeof(dfloat), - platform->comm.mpiComm, + MPI_COMM_SELF, EXIT_FAILURE, "pointInterpolation_t::eval input size (%" PRId64 ") is smaller than expected (%ld)\n", o_in.byte_size(), nFields * inputFieldOffset * sizeof(dfloat)); nekrsCheck(o_out.byte_size() < nFields * outputFieldOffset * sizeof(dfloat), - platform->comm.mpiComm, + MPI_COMM_SELF, EXIT_FAILURE, "pointInterpolation_t::eval output size (%" PRId64 ") is smaller than expected (%ld)\n", o_out.byte_size(), nFields * outputFieldOffset * sizeof(dfloat)); - findpts_->eval(nPoints, nFields, inputFieldOffset, outputFieldOffset, o_in, &data_, o_out); - - if (timerLevel != TimerLevel::None) { - platform->timer.toc("pointInterpolation_t::eval"); - } -} - -void pointInterpolation_t::eval(dlong nFields, - dlong inputFieldOffset, - const std::vector& in, - dlong outputFieldOffset, - std::vector& out) -{ - nekrsCheck(!findCalled, - platform->comm.mpiComm, - EXIT_FAILURE, - "%s\n", - "find has not been called prior to eval!"); - if (timerLevel != TimerLevel::None) { platform->timer.tic("pointInterpolation_t::eval"); } - nekrsCheck(mesh->Nlocal > inputFieldOffset, - platform->comm.mpiComm, - EXIT_FAILURE, - "pointInterpolation_t::eval inputFieldOffset (%d) is less than mesh->Nlocal (%d)\n", - inputFieldOffset, - mesh->Nlocal); - - findpts_->eval(nPoints, nFields, inputFieldOffset, outputFieldOffset, const_cast(in.data()), &data_, out.data()); + findpts_->eval(nPoints_, offset, nFields, inputFieldOffset, outputFieldOffset, o_in, &data_, o_out); if (timerLevel != TimerLevel::None) { platform->timer.toc("pointInterpolation_t::eval"); } } -void pointInterpolation_t::setPoints(const std::vector& x, const std::vector& y, const std::vector& z) +void pointInterpolation_t::setPoints(const std::vector &x, + const std::vector &y, + const std::vector &z) { std::vector session; this->setPoints(x, y, z, session); } -void pointInterpolation_t::setPoints(const std::vector& x, const std::vector& y, const std::vector& z, const std::vector& session) +void pointInterpolation_t::setPoints(const std::vector &x, + const std::vector &y, + const std::vector &z, + const std::vector &session) { auto o_x = platform->device.malloc(x.size()); o_x.copyFrom(x.data()); @@ -264,7 +254,7 @@ void pointInterpolation_t::setPoints(const std::vector& x, const std::ve if (session.size()) { o_session = platform->device.malloc(session.size()); o_session.copyFrom(session.data()); - } + } this->setPoints(o_x, 
o_y, o_z, o_session); } @@ -320,9 +310,3 @@ void pointInterpolation_t::setTimerName(std::string name) timerName = name; findpts_->setTimerName(name); } - -void pointInterpolation_t::o_update() -{ - findCalled = true; - findpts_->o_update(data_); -} diff --git a/src/pointInterpolation/pointInterpolation.hpp b/src/pointInterpolation/pointInterpolation.hpp index 4b8423edd..88856b962 100644 --- a/src/pointInterpolation/pointInterpolation.hpp +++ b/src/pointInterpolation/pointInterpolation.hpp @@ -11,19 +11,20 @@ using findpts::TimerLevel; class pointInterpolation_t { public: + static constexpr int CODE_INTERNAL = 0; + static constexpr int CODE_BORDER = 1; + static constexpr int CODE_NOT_FOUND = 2; + enum class VerbosityLevel { None, Basic, Detailed }; - pointInterpolation_t(mesh_t *mesh, - MPI_Comm comm, - bool mySession_ = true, - std::vector bIntID = {}); pointInterpolation_t(mesh_t *mesh_, MPI_Comm comm, - dlong localHashSize, - dlong globalHashSize, - double bb_tol = 0.01, - double newton_tol = 0, bool mySession_ = true, - std::vector bIntID = {}); + std::vector bIntID = {}, + double bb_tol = 0.05, + double newton_tol = 0, + dlong localHashSize = 0, + dlong globalHashSize = 0); + ~pointInterpolation_t() = default; // Finds the process, element, and reference coordinates of the given points @@ -33,9 +34,9 @@ class pointInterpolation_t dlong inputFieldOffset, const occa::memory& o_in, dlong outputFieldOffset, - occa::memory& o_out); - - void eval(dlong nFields, dlong inputFieldOffset, const std::vector& in, dlong outputFieldOffset, std::vector &out); + occa::memory& o_out, + dlong nPoints = -1, + dlong findPtsOffset = 0); auto *ptr() { @@ -73,16 +74,16 @@ class pointInterpolation_t occa::memory distanceINT(); private: - mesh_t *mesh; - double newton_tol; - std::string timerName = ""; + mesh_t *mesh = nullptr; + double newton_tol = 0; + std::string timerName; TimerLevel timerLevel = TimerLevel::None; + std::unique_ptr findpts_; findpts::data_t data_; - bool mySession; + bool mySession = true; bool findCalled = false; - bool pointsAdded = false; // correponds to which setPoints overload is called @@ -91,10 +92,10 @@ class pointInterpolation_t int nPoints; - dfloat *_x; - dfloat *_y; - dfloat *_z; - dlong *_session; + dfloat *_x = nullptr; + dfloat *_y = nullptr; + dfloat *_z = nullptr; + dlong *_session = nullptr; occa::memory _o_x; occa::memory _o_y; diff --git a/src/pointInterpolation/registerPointInterpolationKernels.cpp b/src/pointInterpolation/registerPointInterpolationKernels.cpp index 77627b6b7..6a906fdde 100644 --- a/src/pointInterpolation/registerPointInterpolationKernels.cpp +++ b/src/pointInterpolation/registerPointInterpolationKernels.cpp @@ -12,12 +12,9 @@ unsigned nearestPowerOfTwo(unsigned int v) answer *= 2; return answer; } -} // namespace -void registerPointInterpolationKernels() +void registerKernels(int N) { - dlong N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); const dlong Nq = N + 1; const std::string oklpath = getenv("NEKRS_KERNEL_DIR"); @@ -53,16 +50,24 @@ void registerPointInterpolationKernels() std::string kernelName; std::string fileName; + std::string orderSuffix = "_" + std::to_string(N); kernelName = "findptsLocal"; fileName = oklpath + "/pointInterpolation/findpts/" + kernelName + ".okl"; - platform->kernelRequests.add(kernelName, fileName, findptsKernelInfo); + platform->kernelRequests.add(kernelName + orderSuffix, fileName, findptsKernelInfo); kernelName = "findptsLocalEval"; fileName = oklpath + "/pointInterpolation/findpts/" + kernelName + ".okl"; - 
platform->kernelRequests.add(kernelName, fileName, findptsKernelInfo); + platform->kernelRequests.add(kernelName + orderSuffix, fileName, findptsKernelInfo); kernelName = "findptsLocalEvalMask"; fileName = oklpath + "/pointInterpolation/findpts/" + kernelName + ".okl"; - platform->kernelRequests.add(kernelName, fileName, findptsKernelInfo); + platform->kernelRequests.add(kernelName + orderSuffix, fileName, findptsKernelInfo); +} + +} // namespace + +void registerPointInterpolationKernels() +{ + for (int i = 1; i < mesh_t::maxNqIntp; i++) registerKernels(i); } diff --git a/src/udf/udf.cpp b/src/udf/udf.cpp index c0e7cb7dc..523519d14 100644 --- a/src/udf/udf.cpp +++ b/src/udf/udf.cpp @@ -330,7 +330,8 @@ void udfBuild(setupAide &options) if (buildRank == 0) { double tStart = MPI_Wtime(); - char cmd[4096]; + const int cmdSize = 4096; + char cmd[cmdSize]; mkdir(std::string(cache_dir + "/udf").c_str(), S_IRWXU); const std::string pipeToNull = @@ -433,7 +434,8 @@ void udfBuild(setupAide &options) const std::string useFloat = (sizeof(dfloat) == sizeof(float)) ? "ON" : "OFF"; const std::string cmakeVerbose = (verbose) ? "ON" : "OFF"; - sprintf(cmd, + snprintf(cmd, + cmdSize, "rm -f %s/*.so && cmake %s -S %s -B %s " "-DNEKRS_USE_DFLOAT_FLOAT=%s " "-DNEKRS_INSTALL_DIR=\"%s\" -DCASE_DIR=\"%s\" -DCMAKE_CXX_COMPILER=\"$NEKRS_CXX\" " @@ -458,7 +460,7 @@ void udfBuild(setupAide &options) auto stdoutFlag = (verbose) ? std::string("") : ">>cmake.log 2>&1"; { // generate pre-processed okl - sprintf(cmd, "cd %s && make -j1 okl.i %s", cmakeBuildDir.c_str(), stdoutFlag.c_str()); + snprintf(cmd, cmdSize, "cd %s && make -j1 okl.i %s", cmakeBuildDir.c_str(), stdoutFlag.c_str()); const int retVal = system(cmd); if (verbose && platform->comm.mpiRank == 0) { printf("%s (preprocessing retVal: %d)\n", cmd, retVal); @@ -473,7 +475,7 @@ void udfBuild(setupAide &options) } { // build - sprintf(cmd, "cd %s/udf && make -j1", cache_dir.c_str()); + snprintf(cmd, cmdSize, "cd %s/udf && make -j1", cache_dir.c_str()); const int retVal = system(cmd); if (verbose && platform->comm.mpiRank == 0) { printf("%s (make retVal: %d)\n", cmd, retVal); From d0f3cffce26768d5f76907116dd02c4cffb84e03 Mon Sep 17 00:00:00 2001 From: stgeke Date: Sat, 19 Oct 2024 17:35:54 +0200 Subject: [PATCH 2/2] Import next changes SHA[b6fc11eba2, b6fc11eba2] --- RELEASE.md | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 570fe397c..c6113e816 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -22,10 +22,12 @@ ## Good to know +* after fixing a bug in the linear solver residual norm, iteration counts have increased compared to previous versions * GPU aware MPI is disabled by default (`NEKRS_GPU_MPI=0`) -* HYPRE replaces AmgX * [reproducibility] variable time step controller restricts dt to 5 significant digits -* after fixing a bug in the linear solver residual norm, iteration counts have increased compared to previous versions +* nrsman can be used to display the par file or environment settings +* HYPRE replaces AmgX +* Field file extension starts with 0-index ## Breaking Changes @@ -33,20 +35,29 @@ This list provides an overview of the most significant changes in this release, * run `build.sh` instead of `nrsconfig` to build the code * change par section `SCALAR00` to `TEMPERATURE` in case it represent indeed a physical temperature +* `nek::userchk` is no longer called automatically during the setup phase +* host mirrored variables including `nrs->U, cds->S, mesh->x, 
nrs->usrwrk` have been removed
+* send signal (defined in env-var `NEKRS_SIGNUM_UPD`) to process trigger file `nekrs.upd`
+* use `auto foo = platform->deviceMemoryPool.reserve(nWords)` instead of pre-allocated dfloat slices like `platform->o_mempool.slice0`
+* change count argument of `occa::memory::slice, occa::memory::copyFrom, occa::memory::copyTo` to number of words instead of bytes
+* use `nekrs_registerPtr` instead of common blocks NRSSCPTR / SCNRS in usr file and access them using `nek::ptr` in udf (see examples)
+
+### Name Changes
 * `velocityDirichletConditions` -> `codedFixedValueVelocity` (same for scalars)
 * `velocityNeumannConditions` -> `codedFixedGradientVelocity` (same for scalars)
 * `nrs->_mesh` -> `cds->mesh[0]`
 * `nek::ocopyToNek` -> `nrs->copyToNek`
-* `nek::ocopyFromNek` -> `nek::copyFromNek`
+* `nek::ocopyFromNek` -> `nrs->copyFromNek`
+* `nrs->o_FU` -> `nrs->o_NLT`
+* `cds->o_FS` -> `cds->o_NLT`
+* `occaKernel` -> `deviceKernel`
+* `occaProperties` -> `deviceKernelProperties`
+* `occa::memory` -> `deviceMemory`
+* `nrs->isOutputStep` -> `nrs->checkpointStep`
+
+### Interface Changes
 * define `time` as double (instead of dfloat) in all UDF functions
-* remove `nrs_t` argument from UDF API functions (nrs object is now globally accessible within udf if the Navier Stokes solver is enabled)
+* remove `nrs_t` argument from all UDF functions (nrs object is now globally accessible within udf if the Navier Stokes solver is enabled)
 * `nrs_t::userProperties = std::function` -> `udf::properties = std::function`
 * `nrs_t::userVelocitySource = std::function` -> `udf::uEqnSource = std::function`
 * `nrs_t::userScalarSource = std::function` -> `udf::sEqnSource = std::function`
 * `nrs_t::userDivergence = std::function` -> `udf::udfdif = std::function`
 * `tavg::setup(dlong fieldOffset, const fields& fields)` -> `tavg::setup(nrs_t*)`
 * `planarAvg(mesh_t*, const std::string&, int, int, int, int, dlong, occa::memory o_avg)` -> `postProcessing::planarAvg(nrs_t*, const std::string&, int, int, int, int, occa::memory)`
-* `nrs->o_FU` -> `nrs->o_NLT`
-* `cds->o_FS` -> `cds->o_NLT`
 * `::postProcessing` functions are now members of `nrs_t` (except planarAvg)
-* use `nekrs_registerPtr` instead of common blocks NRSSCPTR / SCNRS in usr file and access them using `nek::ptr` in udf (see e.g. channel example)
-* `occaKernel` -> `deviceKernel`
-* `occaProperties` > `deviceKernelProperties`
-* `occa::memory` -> `deviceMemory`
 * remove `nrs_t` argument from `::setup`
-* `nrs->isOutputStep` -> `nrs->checkpointStep`
 * `pointInterpolation_t::setPoints(int, dfloat*, dfloat*, dfloat*)` -> `pointInterpolation_t::setPoints(const std::vector&, const std::vector&, const std::vector&)`
 * use `iofld` instead of `writeFld`
-* `nrs->usrwrk` was removed (it's a user variable not used anywhere in the code)
-* field file extension starts with 0-index
-
 ## Known Bugs / Restrictions
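
As a companion to the interface-change bullets above and to the reworked `pointInterpolation_t`/`findpts` code in this patch, the following is a minimal usage sketch, not taken from the patch itself. It assumes the template arguments elided in the header diff are `std::vector<dfloat>`/`std::vector<int>`, that `find` takes only the verbosity level, and that it runs in a context where `platform` and a `mesh_t` are available; `interpScalar` and its arguments are hypothetical names.

```cpp
#include "pointInterpolation.hpp"

// Hypothetical example: interpolate one device-resident scalar field at user points.
void interpScalar(mesh_t *mesh,
                  const std::vector<dfloat> &xp,
                  const std::vector<dfloat> &yp,
                  const std::vector<dfloat> &zp,
                  const occa::memory &o_field, // at least mesh->Nlocal words
                  occa::memory &o_result)      // at least xp.size() words
{
  const dlong npt = xp.size();

  // bb_tol, newton_tol and the hash sizes keep their defaults (0 selects mesh->Nlocal)
  pointInterpolation_t interp(mesh, platform->comm.mpiComm);

  interp.setPoints(xp, yp, zp); // host coordinates, copied to the device internally
  interp.find(pointInterpolation_t::VerbosityLevel::Detailed);

  // nFields = 1, input offset = mesh->Nlocal, output offset = npt;
  // the trailing nPoints / findPtsOffset arguments keep their defaults
  interp.eval(1, mesh->Nlocal, o_field, npt, o_result);
}
```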
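The two migration bullets above on `platform->deviceMemoryPool.reserve(nWords)` and on word-based copy counts are easiest to see side by side. This is a minimal sketch, not part of the patch; `scratchCopyExample`, `hostBuf`, `hostOut`, and `npt` are placeholder names, and any template argument on `reserve` (elided in the notes above) is assumed to default to `dfloat`.

```cpp
// Hypothetical helper illustrating the new scratch-memory and copy-count conventions.
void scratchCopyExample(const dfloat *hostBuf, dfloat *hostOut, dlong npt)
{
  // before: a fixed slice such as platform->o_mempool.slice0
  auto o_scratch = platform->deviceMemoryPool.reserve(npt); // npt words, returned to the pool on destruction

  // count arguments are now numbers of words, not bytes
  // before: o_scratch.copyFrom(hostBuf, npt * sizeof(dfloat));
  o_scratch.copyFrom(hostBuf, npt);
  o_scratch.copyTo(hostOut, npt);
}
```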