diff --git a/.travis.yml b/.travis.yml index d4853aa5..f4d5f200 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,19 +26,19 @@ install: # Back out of the directory to install the libraries - cd .. - # HYPRE (cached 2.10.0b build) - - if [ ! -e hypre-2.10.0b/src/hypre/lib/libHYPRE.a ]; then - wget https://computation.llnl.gov/project/linear_solvers/download/hypre-2.10.0b.tar.gz --no-check-certificate; - rm -rf hypre-2.10.0b; - tar xvzf hypre-2.10.0b.tar.gz; - cd hypre-2.10.0b/src; + # HYPRE (cached 2.11.2 build) + - if [ ! -e hypre-2.11.2/src/hypre/lib/libHYPRE.a ]; then + wget https://computation.llnl.gov/project/linear_solvers/download/hypre-2.11.2.tar.gz --no-check-certificate; + rm -rf hypre-2.11.2; + tar xvzf hypre-2.11.2.tar.gz; + cd hypre-2.11.2/src; ./configure --disable-fortran --without-fei CC=mpicc CXX=mpic++; make -j3; cd ../..; else - echo "Reusing cached hypre-2.10.0b/"; + echo "Reusing cached hypre-2.11.2/"; fi; - - ln -s hypre-2.10.0b hypre + - ln -s hypre-2.11.2 hypre # METIS (cached 4.0 build) - if [ ! -e metis-4.0/libmetis.a ]; then @@ -54,7 +54,7 @@ install: # MFEM (master branch) - if [ ! 
-e mfem/libmfem.a ]; then rm -rf mfem; - git clone --depth 1 https://github.com/mfem/mfem.git; + git clone --single-branch --branch master --depth 1 https://github.com/mfem/mfem.git; cd mfem; make -j3 parallel; make info; @@ -72,50 +72,13 @@ branches: script: - make -j - - touch RESULTS.dat - - - mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.75 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 21 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 0.75 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 21 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 18 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 18 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 2 -m data/segment01.mesh -rs 5 -tf 0.2 -fa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 18 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 18 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 18 | head -n 1 | awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - mpirun -np 8 laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa -vs 100 | tee RUN.dat - - cat RUN.dat | tail -n 21 | head -n 1 
| awk '{ printf("step = %04d, dt = %s |e| = %s\n", $2, $8, $11); }' >> RESULTS.dat - - - | - cat < BASELINE.dat - step = 0339, dt = 0.000702, |e| = 49.6955373491 - step = 1041, dt = 0.000121, |e| = 3390.9635545457 - step = 1154, dt = 0.001655, |e| = 46.3033960530 - step = 0560, dt = 0.002449, |e| = 134.0861672235 - step = 0413, dt = 0.000470, |e| = 32.0120774101 - step = 2872, dt = 0.000064, |e| = 56.5470233805 - step = 0528, dt = 0.000180, |e| = 56.5053488122 - step = 0776, dt = 0.000045, |e| = 409.8243172608 - EOF - - - diff --report-identical-files RESULTS.dat BASELINE.dat - + - make checks ranks=1 + - make checks ranks=2 + - make checks ranks=3 + - make tests cache: directories: - - $TRAVIS_BUILD_DIR/../hypre-2.10.0b/src/hypre/lib - - $TRAVIS_BUILD_DIR/../hypre-2.10.0b/src/hypre/include + - $TRAVIS_BUILD_DIR/../hypre-2.11.2/src/hypre/lib + - $TRAVIS_BUILD_DIR/../hypre-2.11.2/src/hypre/include - $TRAVIS_BUILD_DIR/../metis-4.0 - diff --git a/CHANGELOG b/CHANGELOG index 408c82f2..e386b7e8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,30 @@ High-order Lagrangian Hydrodynamics Miniapp +Version 3.0, released on Mar 27, 2020 +===================================== + +- Replaced the Laghos-2.0 custom implementations in the cuda/, raja/, occa/ and + hip/ directories with direct general device support in the main Laghos sources + based on MFEM-4.1 + +- With the above change different device backends can be selected at runtime, + including cuda, raja, occa, hip, omp and more. See the -d command-line option. + +- Added 'setup' makefile target to download and build the Laghos dependencies: + HYPRE (2.11.2), METIS (4.0.3) and MFEM (master branch). + +- Added 'tests' and 'checks' makefile targets to launch non-regression tests. + +- Added default dimension options that generate internally the mesh: + * 1D (-dim 1): data/segment01.mesh + * 2D (-dim 2): data/square01_quad.mesh + * 3D (-dim 3): data/cube01_hex.mesh + +- The timing/ directory was deprecated. 
Use the scripts in the CEED benchmarks + instead, https://github.com/CEED/benchmarks. + + Version 2.0, released on Nov 19, 2018 ===================================== diff --git a/README.md b/README.md index e4cefdb6..756891fa 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,12 @@ necessary operations. As the local action is defined by utilizing the tensor structure of the finite element spaces, the amount of data storage, memory transfers, and FLOPs are lower (especially for higher orders). +The Laghos implementation includes support for hardware devices, such +as GPUs, and programming models, such as CUDA, OCCA, RAJA and OpenMP, +based on [MFEM](http://mfem.org), version 4.1 or later. These device +backends are selectable at runtime, see the `-d/--device` command-line +option. + Other computational motives in Laghos include the following: - Support for unstructured meshes, in 2D and 3D, with quadrilateral and @@ -93,9 +99,9 @@ Other computational motives in Laghos include the following: ## Code Structure - The file `laghos.cpp` contains the main driver with the time integration loop - starting around line 488. + starting around line 609. - In each time step, the ODE system of interest is constructed and solved by - the class `LagrangianHydroOperator`, defined around line 424 of `laghos.cpp` + the class `LagrangianHydroOperator`, defined around line 544 of `laghos.cpp` and implemented in files `laghos_solver.hpp` and `laghos_solver.cpp`. - All quadrature-based computations are performed in the function `LagrangianHydroOperator::UpdateQuadratureData` in `laghos_solver.cpp`. @@ -120,7 +126,7 @@ Other computational motives in Laghos include the following: Laghos has the following external dependencies: -- *hypre*, used for parallel linear algebra, we recommend version 2.10.0b
+- *hypre*, used for parallel linear algebra, we recommend version 2.11.2
https://computation.llnl.gov/casc/hypre/software.html - METIS, used for parallel domain decomposition (optional), we recommend [version 4.0.3](http://glaros.dtc.umn.edu/gkhome/fetch/sw/metis/OLD/metis-4.0.3.tar.gz)
@@ -133,13 +139,13 @@ To build the miniapp, first download *hypre* and METIS from the links above and put everything on the same level as the `Laghos` directory: ```sh ~> ls -Laghos/ hypre-2.10.0b.tar.gz metis-4.0.tar.gz +Laghos/ hypre-2.11.2.tar.gz metis-4.0.3.tar.gz ``` Build *hypre*: ```sh -~> tar -zxvf hypre-2.10.0b.tar.gz -~> cd hypre-2.10.0b/src/ +~> tar -zxvf hypre-2.11.2.tar.gz +~> cd hypre-2.11.2/src/ ~/hypre-2.10.0b/src> ./configure --disable-fortran ~/hypre-2.10.0b/src> make -j ~/hypre-2.10.0b/src> cd ../.. @@ -162,14 +168,12 @@ Clone and build the parallel version of MFEM: ```sh ~> git clone https://github.com/mfem/mfem.git ./mfem ~> cd mfem/ -~/mfem> git checkout laghos-v2.0 +~/mfem> git checkout master ~/mfem> make parallel -j ~/mfem> cd .. ``` -The above uses the `laghos-v2.0` tag of MFEM, which is guaranteed to work with -Laghos v2.0. Alternatively, one can use the latest versions of the MFEM and -Laghos `master` branches (provided there are no conflicts). See the [MFEM -building page](http://mfem.org/building/) for additional details. +The above uses the `master` branch of MFEM. +See the [MFEM building page](http://mfem.org/building/) for additional details. (Optional) Clone and build GLVis: ```sh @@ -185,11 +189,14 @@ to the GLVis socket. Build Laghos ```sh ~> cd Laghos/ -~/Laghos> make +~/Laghos> make -j ``` This can be followed by `make test` and `make install` to check and install the build respectively. See `make help` for additional options. +See also the `make setup` target that can be used to automate the +download and building of hypre, METIS and MFEM. + ## Running #### Sedov blast @@ -199,8 +206,8 @@ partial assembly option (`-pa`). 
Some sample runs in 2D and 3D respectively are: ```sh -mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -pa -mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -vis -pa +mpirun -np 8 ./laghos -p 1 -dim 2 -rs 3 -tf 0.8 -pa +mpirun -np 8 ./laghos -p 1 -dim 3 -rs 2 -tf 0.6 -pa -vis ``` The latter produces the following density plot (notice the `-vis` option) @@ -216,9 +223,9 @@ evaluation. (Viscosity can still be activated for these problems with the Some sample runs in 2D and 3D respectively are: ```sh -mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.5 -pa -mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 0.25 -pa -mpirun -np 8 laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62 -s 7 -vis -pa +mpirun -np 8 ./laghos -p 0 -dim 2 -rs 3 -tf 0.5 -pa +mpirun -np 8 ./laghos -p 0 -dim 3 -rs 1 -cfl 0.1 -tf 0.25 -pa +mpirun -np 8 ./laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62 -s 7 -vis -pa ``` The latter produce the following velocity magnitude plots (notice the `-vis` option) @@ -235,8 +242,8 @@ vorticity, thus examining the complex computational abilities of Laghos. Some sample runs in 2D and 3D respectively are: ```sh -mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -vis -pa +mpirun -np 8 ./laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 2.5 -cfl 0.025 -pa +mpirun -np 8 ./laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 2.5 -cfl 0.05 -vis -pa ``` The latter produces the following specific internal energy plot (notice the `-vis` option) @@ -248,25 +255,36 @@ The latter produces the following specific internal energy plot (notice the `-vi To make sure the results are correct, we tabulate reference final iterations (`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: -1. `mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.75 -pa` -2. 
`mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 0.75 -pa` -3. `mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -pa` -4. `mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -pa` -5. `mpirun -np 8 laghos -p 2 -m data/segment01.mesh -rs 5 -tf 0.2 -fa` -6. `mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa` -7. `mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -pa` -8. `mpirun -np 8 laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa` +1. `mpirun -np 8 ./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa` +2. `mpirun -np 8 ./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa` +3. `mpirun -np 8 ./laghos -p 1 -dim 2 -rs 3 -tf 0.8 -pa` +4. `mpirun -np 8 ./laghos -p 1 -dim 3 -rs 2 -tf 0.6 -pa` +5. `mpirun -np 8 ./laghos -p 2 -dim 1 -rs 5 -tf 0.2 -fa` +6. `mpirun -np 8 ./laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa` +7. `mpirun -np 8 ./laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -pa` +8. `mpirun -np 8 ./laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa` | `run` | `step` | `dt` | `e` | | ----- | ------ | ---- | --- | -| 1. | 339 | 0.000702 | 49.6955373491 | -| 2. | 1041 | 0.000121 | 3390.9635545458 | -| 3. | 1154 | 0.001655 | 46.3033960530 | -| 4. | 560 | 0.002449 | 134.0861672235 | -| 5. | 413 | 0.000470 | 32.0120774101 | -| 6. | 2872 | 0.000064 | 56.5470233805 | -| 7. | 528 | 0.000180 | 56.5053488122 | -| 8. | 776 | 0.000045 | 409.8243172608 | +| 1. | 339 | 0.000702 | 4.9695537349e+01 | +| 2. | 1041 | 0.000121 | 3.3909635545e+03 | +| 3. | 1154 | 0.001655 | 4.6303396053e+01 | +| 4. | 560 | 0.002449 | 1.3408616722e+02 | +| 5. | 413 | 0.000470 | 3.2012077410e+01 | +| 6. | 2872 | 0.000064 | 5.6547039096e+01 | +| 7. | 528 | 0.000180 | 5.6505348812e+01 | +| 8. | 776 | 0.000045 | 4.0982431726e+02 | + +Similar GPU runs using the MFEM CUDA *device* can be run as follows: + +1. `./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa -d cuda` +2. 
`./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa -d cuda` +3. `./laghos -p 1 -dim 2 -rs 3 -tf 0.80 -pa -d cuda` +4. `./laghos -p 1 -dim 3 -rs 2 -tf 0.60 -pa -d cuda` +5. `./laghos -p 2 -dim 1 -rs 5 -tf 0.20 -fa` +6. `./laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -d cuda` +7. `./laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -pa -cgt 1e-12 -d cuda` +8. `./laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa -d cuda` An implementation is considered valid if the final energy values are all within round-off distance from the above reference values. @@ -315,12 +333,8 @@ In addition to the main MPI-based CPU implementation in https://github.com/CEED/ the following versions of Laghos have been developed - **SERIAL** version in the [serial/](./serial/README.md) directory. -- **CUDA** version in the [cuda/](./cuda/README.md) directory. This version supports GPU acceleration. -- **RAJA** version in the [raja/](./raja/README.md) directory. This version supports GPU acceleration. See [GitHub](https://software.llnl.gov/RAJA/) for more information about RAJA. -- **OCCA** version in the [occa/](./occa/README.md) directory. This version supports GPU and OpenMP acceleration. See the OCCA [website](http://libocca.org/) for more information. -- **AMR** version in the [amr/](./amr/README.md) directory. This version supports dynamic adaptive mesh refinement. -- **MFEM/engines**-based version in the - [engines-kernels](https://github.com/CEED/Laghos/tree/engines-kernels) branch. +- **AMR** version in the [amr/](./amr/README.md) directory. + This version supports dynamic adaptive mesh refinement. 
## Contact diff --git a/cuda/README.md b/cuda/README.md deleted file mode 100644 index 883add25..00000000 --- a/cuda/README.md +++ /dev/null @@ -1,131 +0,0 @@ - __ __ - / / ____ ____ / /_ ____ _____ - / / / __ `/ __ `/ __ \/ __ \/ ___/ - / /___/ /_/ / /_/ / / / / /_/ (__ ) - /_____/\__,_/\__, /_/ /_/\____/____/ - /____/ - - High-order Lagrangian Hydrodynamics Miniapp - - CUDA version - -## Overview - -This directory contains the CUDA version of the **Laghos** (LAGrangian -High-Order Solver), which is provided as a reference implementation and is NOT -the official benchmark version of the miniapp. - -For more details about Laghos see the [README file](../README.md) in the -top-level directory. - -The Laghos miniapp is part of the [CEED software suite](http://ceed.exascaleproject.org/software), -a collection of software benchmarks, miniapps, libraries and APIs for -efficient exascale discretizations based on high-order finite element -and spectral element methods. See http://github.com/ceed for more -information and source code availability. - -The CEED research is supported by the [Exascale Computing Project](https://exascaleproject.org/exascale-computing-project) -(17-SC-20-SC), a collaborative effort of two U.S. Department of Energy -organizations (Office of Science and the National Nuclear Security -Administration) responsible for the planning and preparation of a -[capable exascale ecosystem](https://exascaleproject.org/what-is-exascale), -including software, applications, hardware, advanced system engineering and early -testbed platforms, in support of the nation’s exascale computing imperative. - -## Differences with the official benchmark version - -The CUDA version differs from the official benchmark version of Laghos (in the -top-level directory) in the following ways: - -1. Only problems 0 and 1 are defined -2. 
Final iterations (`step`), time steps (`dt`) and energies (`|e|`) differ from the original version - -## Building - -Follow the steps below to build the CUDA version with GPU acceleration. - -### Environment setup -```sh -export MPI_HOME=~/usr/local/openmpi/3.0.0 -``` - -### Hypre -- -- `tar xzvf hypre-2.11.2.tar.gz` -- ` cd hypre-2.11.2/src` -- `./configure --disable-fortran --with-MPI --with-MPI-include=$MPI_HOME/include --with-MPI-lib-dirs=$MPI_HOME/lib` -- `make -j` -- `cd ../..` - -### Metis -- -- `tar xzvf metis-5.1.0.tar.gz` -- `cd metis-5.1.0` -- ``make config prefix=`pwd` `` -- `make && make install` -- `cd ..` - -### MFEM -- `git clone git@github.com:mfem/mfem.git` -- `cd mfem` -- `git checkout laghos-v2.0` -- ``make config MFEM_USE_MPI=YES HYPRE_DIR=`pwd`/../hypre-2.11.2/src/hypre MFEM_USE_METIS_5=YES METIS_DIR=`pwd`/../metis-5.1.0`` -- `make status` to verify that all the include paths are correct -- `make -j` -- `cd ..` - -### CUDA Laghos -- `git clone git@github.com:CEED/Laghos.git` -- `cd Laghos/cuda` -- edit the `makefile`, set NV\_ARCH to the desired architecture and the absolute paths to CUDA\_DIR, MFEM\_DIR, MPI\_HOME -- `make` to build the CUDA version - -## Running - -The CUDA version can run the same sample test runs as the official benchmark -version of Laghos. - -### Options -- -m : Mesh file to use -- -ok : Order (degree) of the kinematic finite element space -- -rs : Number of times to refine the mesh uniformly in serial -- -p : Problem setup to use, Sedov problem is '1' -- -cfl : CFL-condition number -- -ms : Maximum number of steps (negative means no restriction) -- -uvm: Enable or disable Unified Memory -- -aware: Enable or disable MPI CUDA Aware - -## Verification of Results - -To make sure the results are correct, we tabulate reference final iterations -(`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: - -1. `mpirun -np 4 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -pa` -2. 
`mpirun -np 4 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75 -pa` -3. `mpirun -np 4 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -pa -cfl 0.05` -4. `mpirun -np 4 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -pa -cfl 0.08` - -| `run` | `step` | `dt` | `e` | -| ----- | ------ | ---- | --- | -| 1. | 333 | 0.000008 | 49.6955373330 | -| 2. | 1036 | 0.000093 | 3390.9635544029 | -| 3. | 1570 | 0.000768 | 46.2901037375 | -| 4. | 486 | 0.000864 | 135.1267396160 | - -An implementation is considered valid if the final energy values are all within -round-off distance from the above reference values. - -## Contact - -You can reach the Laghos team by emailing laghos@llnl.gov or by leaving a -comment in the [issue tracker](https://github.com/CEED/Laghos/issues). - -## Copyright - -The following copyright applies to each file in the CEED software suite, -unless otherwise stated in the file: - -> Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the -> Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights reserved. - -See files LICENSE and NOTICE in the top-level directory for details. diff --git a/cuda/cuda/config/config.cpp b/cuda/cuda/config/config.cpp deleted file mode 100644 index fc05c722..00000000 --- a/cuda/cuda/config/config.cpp +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" -#if defined(OPEN_MPI) && OPEN_MPI -#include -#endif -#include - -namespace mfem -{ - - -// *************************************************************************** -bool isNvidiaCudaMpsDaemonRunning(void) -{ - const char *command="pidof -s nvidia-cuda-mps-control>/dev/null"; - return system(command)==0; -} - -// *************************************************************************** -void computeCapabilityOfTheDevice(const int mpi_rank, - const CUdevice cuDevice, - const int device) -{ - char name[128]; - int major, minor; - cuDeviceGetName(name, 128, cuDevice); - cuDeviceComputeCapability(&major, &minor, device); - printf("\033[32m[laghos] Rank_%d => Device_%d (%s:sm_%d.%d)\033[m\n", - mpi_rank, device, name, major, minor); -} - -// *************************************************************************** -static bool isTux(void) -{ - char hostname[1024]; - hostname[1023] = '\0'; - gethostname(hostname, 1023); - if (strncmp("tux", hostname, 3)==0) { return true; } - return false; -} - -// *************************************************************************** -__attribute__((unused)) -static void printDevProp(cudaDeviceProp devProp) -{ - printf("Major revision number: %d\n", devProp.major); - printf("Minor revision number: %d\n", devProp.minor); - printf("Name: %s\n", devProp.name); - printf("Total global memory: %lu\n", devProp.totalGlobalMem); - printf("Total shared memory per block: %lu\n", devProp.sharedMemPerBlock); - printf("Total registers per block: %d\n", devProp.regsPerBlock); - printf("Warp size: %d\n", devProp.warpSize); - printf("Maximum memory pitch: %lu\n", 
devProp.memPitch); - printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock); - for (int i = 0; i < 3; ++i) - { - printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]); - } - for (int i = 0; i < 3; ++i) - { - printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]); - } - printf("Clock rate: %d\n", devProp.clockRate); - printf("Total constant memory: %lu\n", devProp.totalConstMem); - printf("Texture alignment: %lu\n", devProp.textureAlignment); - printf("Concurrent copy and execution: %s\n", - (devProp.deviceOverlap ? "Yes" : "No")); - printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount); - printf("Kernel execution timeout: %s\n", - (devProp.kernelExecTimeoutEnabled ? "Yes" : "No")); -} - -// *************************************************************************** -// * Setup -// *************************************************************************** -void rconfig::Setup(const int _mpi_rank, - const int _mpi_size, - const bool _cuda, - const bool _uvm, - const bool _aware, - const bool _share, - const bool _hcpo, - const bool _sync, - const int rs_levels) -{ - mpi_rank=_mpi_rank; - mpi_size=_mpi_size; - - // Look if we are on a Tux machine - const bool tux = isTux(); - if (tux && Root()) - { - printf("\033[32m[laghos] \033[1mTux\033[m\n"); - } - - // On Tux machines, look for MPS - mps = tux?isNvidiaCudaMpsDaemonRunning():false; - if (tux && Mps() && Root()) - { - printf("\033[32m[laghos] \033[32;1mMPS daemon\033[m\033[m\n"); - } - if (tux && !Mps() && Root()) - { - printf("\033[32m[laghos] \033[31;1mNo MPS daemon\033[m\n"); - } - - // Get the number of devices with compute capability greater or equal to 2.0 - // Can be changed wuth CUDA_VISIBLE_DEVICES - cudaGetDeviceCount(&gpu_count); - cuda=_cuda; - uvm=_uvm; - aware=_aware; - share=_share; - hcpo=_hcpo; - sync=_sync; - - // __NVVP__ warning output -#if defined(__NVVP__) - if (Root()) - { - printf("\033[32m[laghos] 
\033[31;1m__NVVP__\033[m\n"); - } -#endif // __NVVP__ - - // LAGHOS_DEBUG warning output -#if defined(LAGHOS_DEBUG) - if (Root()) - { - printf("\033[32m[laghos] \033[31;1mLAGHOS_DEBUG\033[m\n"); - } -#endif - - // Check for Enforced Kernel Synchronization - if (Sync() && Root()) - { - printf("\033[32m[laghos] \033[31;1mEnforced Kernel Synchronization!\033[m\n"); - } - - // Check if MPI is CUDA aware - if (Root()) - printf("\033[32m[laghos] MPI %s CUDA aware\033[m\n", - aware?"\033[1mIS":"is \033[31;1mNOT\033[32m"); - - if (Root()) - { - printf("\033[32m[laghos] CUDA device count: %i\033[m\n", gpu_count); - } - - // Initializes the driver API - // Must be called before any other function from the driver API - // Currently, the Flags parameter must be 0. - const unsigned int Flags = 0; // parameter must be 0 - cuInit(Flags); - - // Returns properties for the selected device - const int device = Mps()?0:(mpi_rank%gpu_count); - // Check if we have enough devices for all ranks - assert(device -#include -#include - -// CUDA ************************************************************************ -#include - -// MFEM/fem ******************************************************************* -#include "fem/gridfunc.hpp" -#include "general/communication.hpp" -#include "fem/pfespace.hpp" - -// LAGHOS/cuda/config ********************************************************** -#include "./config/config.hpp" - -// LAGHOS/cuda/general ********************************************************* -#include "./general/memcpy.hpp" -#include "./general/malloc.hpp" -#include "./general/array.hpp" -#include "./general/table.hpp" -#include "./general/commd.hpp" - -// LAGHOS/cuda/linalg ********************************************************** -#include "./linalg/vector.hpp" -#include "./linalg/operator.hpp" -#include "./linalg/ode.hpp" -#include "./linalg/solvers.hpp" - -// LAGHOS/cuda/kernels ********************************************************* -#include "./kernels/include/kernels.hpp" - 
-// LAGHOS/cuda/fem ************************************************************* -#include "./fem/conform.hpp" -#include "./fem/prolong.hpp" -#include "./fem/restrict.hpp" -#include "./fem/fespace.hpp" -#include "./fem/bilinearform.hpp" -#include "./fem/cuGridfunc.hpp" -#include "./fem/bilininteg.hpp" - -#endif // LAGHOS_CUDA - diff --git a/cuda/cuda/fem/bilinearform.cpp b/cuda/cuda/fem/bilinearform.cpp deleted file mode 100644 index fcca741b..00000000 --- a/cuda/cuda/fem/bilinearform.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * CudaBilinearForm -// *************************************************************************** -CudaBilinearForm::CudaBilinearForm(CudaFiniteElementSpace* fes) : - CudaOperator(fes->GetVSize(),fes->GetVSize()), - mesh(fes->GetMesh()), - trialFes(fes), - testFes(fes), - localX(mesh->GetNE() * trialFes->GetLocalDofs() * trialFes->GetVDim()), - localY(mesh->GetNE() * testFes->GetLocalDofs() * testFes->GetVDim()) {} - -// *************************************************************************** -CudaBilinearForm::~CudaBilinearForm() { } - -// *************************************************************************** -// Adds new Domain Integrator. 
-void CudaBilinearForm::AddDomainIntegrator(CudaIntegrator* i) -{ - AddIntegrator(i, DomainIntegrator); -} - -// Adds new Boundary Integrator. -void CudaBilinearForm::AddBoundaryIntegrator(CudaIntegrator* i) -{ - AddIntegrator(i, BoundaryIntegrator); -} - -// Adds new interior Face Integrator. -void CudaBilinearForm::AddInteriorFaceIntegrator(CudaIntegrator* i) -{ - AddIntegrator(i, InteriorFaceIntegrator); -} - -// Adds new boundary Face Integrator. -void CudaBilinearForm::AddBoundaryFaceIntegrator(CudaIntegrator* i) -{ - AddIntegrator(i, BoundaryFaceIntegrator); -} - -// Adds Integrator based on CudaIntegratorType -void CudaBilinearForm::AddIntegrator(CudaIntegrator* i, - const CudaIntegratorType itype) -{ - assert(i); - i->SetupIntegrator(*this, itype); - integrators.push_back(i); -} - -// *************************************************************************** -void CudaBilinearForm::Assemble() -{ - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->Assemble(); - } -} - -// *************************************************************************** -void CudaBilinearForm::FormLinearSystem(const Array& constraintList, - CudaVector& x, CudaVector& b, - CudaOperator*& Aout, - CudaVector& X, CudaVector& B, - int copy_interior) -{ - FormOperator(constraintList, Aout); - InitRHS(constraintList, x, b, Aout, X, B, copy_interior); -} - -// *************************************************************************** -void CudaBilinearForm::FormOperator(const Array& constraintList, - CudaOperator*& Aout) -{ - const CudaOperator* trialP = trialFes->GetProlongationOperator(); - const CudaOperator* testP = testFes->GetProlongationOperator(); - CudaOperator *rap = this; - if (trialP) { rap = new CudaRAPOperator(*testP, *this, *trialP); } - Aout = new CudaConstrainedOperator(rap, constraintList, rap!=this); -} - -// *************************************************************************** -void 
CudaBilinearForm::InitRHS(const Array& constraintList, - const CudaVector& x, const CudaVector& b, - CudaOperator* A, - CudaVector& X, CudaVector& B, - int copy_interior) -{ - const CudaOperator* P = trialFes->GetProlongationOperator(); - const CudaOperator* R = trialFes->GetRestrictionOperator(); - if (P) - { - // Variational restriction with P - B.SetSize(P->Width()); - P->MultTranspose(b, B); - X.SetSize(R->Height()); - R->Mult(x, X); - } - else - { - // rap, X and B point to the same data as this, x and b - X.SetSize(x.Size(),x); - B.SetSize(b.Size(),b); - } - CudaConstrainedOperator* cA = static_cast(A); - if (cA) - { - cA->EliminateRHS(X, B); - } - else - { - mfem_error("CudaBilinearForm::InitRHS expects an CudaConstrainedOperator"); - } -} - -// *************************************************************************** -void CudaBilinearForm::Mult(const CudaVector& x, CudaVector& y) const -{ - trialFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultAdd(localX, localY); - } - testFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void CudaBilinearForm::MultTranspose(const CudaVector& x, CudaVector& y) const -{ - testFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultTransposeAdd(localX, localY); - } - trialFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void CudaBilinearForm::RecoverFEMSolution(const CudaVector& X, - const CudaVector& b, - CudaVector& x) -{ - const CudaOperator *P = this->GetProlongation(); - if (P) - { - // Apply conforming prolongation - x.SetSize(P->Height()); - P->Mult(X, x); - } - // Otherwise X and x point to the same data -} - - -// 
*************************************************************************** -// * CudaConstrainedOperator -// *************************************************************************** -CudaConstrainedOperator::CudaConstrainedOperator(CudaOperator* A_, - const Array& constraintList_, - bool own_A_) : - CudaOperator(A_->Height(), A_->Width()) -{ - Setup(A_, constraintList_, own_A_); -} - -void CudaConstrainedOperator::Setup(CudaOperator* A_, - const Array& constraintList_, - bool own_A_) -{ - A = A_; - own_A = own_A_; - constraintIndices = constraintList_.Size(); - if (constraintIndices) - { - constraintList.allocate(constraintIndices); - } - z.SetSize(height); - w.SetSize(height); -} - -void CudaConstrainedOperator::EliminateRHS(const CudaVector& x, - CudaVector& b) const -{ - w = 0.0; - A->Mult(w, z); - b -= z; -} - -void CudaConstrainedOperator::Mult(const CudaVector& x, CudaVector& y) const -{ - if (constraintIndices == 0) - { - A->Mult(x, y); - return; - } - z = x; - A->Mult(z, y); -} - -} // mfem diff --git a/cuda/cuda/fem/bilinearform.hpp b/cuda/cuda/fem/bilinearform.hpp deleted file mode 100644 index dbe38323..00000000 --- a/cuda/cuda/fem/bilinearform.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_BILINEARFORM -#define LAGHOS_CUDA_BILINEARFORM - -namespace mfem -{ - -// *************************************************************************** -// * CudaIntegratorType -// *************************************************************************** -enum CudaIntegratorType -{ - DomainIntegrator = 0, - BoundaryIntegrator = 1, - InteriorFaceIntegrator = 2, - BoundaryFaceIntegrator = 3, -}; - -class CudaIntegrator; - -// *************************************************************************** -// * CudaBilinearForm -// *************************************************************************** -class CudaBilinearForm : public CudaOperator -{ - friend class CudaIntegrator; -protected: - typedef std::vector IntegratorVector; - mutable Mesh* mesh; - mutable CudaFiniteElementSpace* trialFes; - mutable CudaFiniteElementSpace* testFes; - IntegratorVector integrators; - mutable CudaVector localX, localY; -public: - CudaBilinearForm(CudaFiniteElementSpace*); - ~CudaBilinearForm(); - Mesh& GetMesh() const { return *mesh; } - CudaFiniteElementSpace& GetTrialFESpace() const { return *trialFes;} - CudaFiniteElementSpace& GetTestFESpace() const { return *testFes;} - // ************************************************************************* - void AddDomainIntegrator(CudaIntegrator*); - void AddBoundaryIntegrator(CudaIntegrator*); - void AddInteriorFaceIntegrator(CudaIntegrator*); - void AddBoundaryFaceIntegrator(CudaIntegrator*); - void AddIntegrator(CudaIntegrator*, const CudaIntegratorType); - // ************************************************************************* - virtual void Assemble(); - 
void FormLinearSystem(const Array& constraintList, - CudaVector& x, CudaVector& b, - CudaOperator*& Aout, - CudaVector& X, CudaVector& B, - int copy_interior = 0); - void FormOperator(const Array& constraintList, CudaOperator*& Aout); - void InitRHS(const Array& constraintList, - const CudaVector& x, const CudaVector& b, - CudaOperator* Aout, - CudaVector& X, CudaVector& B, - int copy_interior = 0); - virtual void Mult(const CudaVector& x, CudaVector& y) const; - virtual void MultTranspose(const CudaVector& x, CudaVector& y) const; - void RecoverFEMSolution(const CudaVector&, const CudaVector&, CudaVector&); -}; - - -// *************************************************************************** -// * Constrained Operator -// *************************************************************************** -class CudaConstrainedOperator : public CudaOperator -{ -protected: - CudaOperator *A; - bool own_A; - CudaArray constraintList; - int constraintIndices; - mutable CudaVector z, w; -public: - CudaConstrainedOperator(CudaOperator*, const Array&, bool = false); - void Setup(CudaOperator*, const Array&, bool = false); - void EliminateRHS(const CudaVector&, CudaVector&) const; - virtual void Mult(const CudaVector&, CudaVector&) const; - virtual ~CudaConstrainedOperator() {} -}; - -} // mfem - -#endif // LAGHOS_CUDA_BILINEARFORM diff --git a/cuda/cuda/fem/bilininteg.cpp b/cuda/cuda/fem/bilininteg.cpp deleted file mode 100644 index 6c4b5062..00000000 --- a/cuda/cuda/fem/bilininteg.cpp +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// ***************************************************************************** -static CudaGeometry *geom=NULL; - -// *************************************************************************** -// * ~ CudaGeometry -// *************************************************************************** -CudaGeometry::~CudaGeometry() -{ - free(geom->meshNodes); - free(geom->J); - free(geom->invJ); - free(geom->detJ); - delete[] geom; -} - -// ***************************************************************************** -// * CudaGeometry Get: use this one to fetch nodes from vector Sx -// ***************************************************************************** -CudaGeometry* CudaGeometry::Get(CudaFiniteElementSpace& fes, - const IntegrationRule& ir, - const CudaVector& Sx) -{ - const Mesh *mesh = fes.GetMesh(); - const mfem::GridFunction *nodes = mesh->GetNodes(); - const FiniteElementSpace *fespace = nodes->FESpace(); - const FiniteElement *fe = fespace->GetFE(0); - const int dims = fe->GetDim(); - const int numDofs = fe->GetDof(); - const int numQuad = ir.GetNPoints(); - const int elements = fespace->GetNE(); - const int ndofs = fespace->GetNDofs(); - const CudaDofQuadMaps* maps = CudaDofQuadMaps::GetSimplexMaps(*fe, ir); - rNodeCopyByVDim(elements,numDofs,ndofs,dims,geom->eMap,Sx,geom->meshNodes); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - - -// ***************************************************************************** -CudaGeometry* CudaGeometry::Get(CudaFiniteElementSpace& fes, - const IntegrationRule& ir) -{ - Mesh& mesh = *(fes.GetMesh()); - const bool geom_to_allocate = - (!geom) || 
rconfig::Get().GeomNeedsUpdate(mesh.GetSequence()); - if (geom_to_allocate) { geom=new CudaGeometry(); } - if (!mesh.GetNodes()) { mesh.SetCurvature(1, false, -1, Ordering::byVDIM); } - GridFunction& nodes = *(mesh.GetNodes()); - const FiniteElementSpace& fespace = *(nodes.FESpace()); - const FiniteElement& fe = *(fespace.GetFE(0)); - const int dims = fe.GetDim(); - const int elements = fespace.GetNE(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - const bool orderedByNODES = (fespace.GetOrdering() == Ordering::byNODES); - - if (orderedByNODES) { ReorderByVDim(nodes); } - const int asize = dims*numDofs*elements; - Array meshNodes(asize); - const Table& e2dTable = fespace.GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - Array eMap(numDofs*elements); - { - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < numDofs; ++d) - { - const int lid = d+numDofs*e; - const int gid = elementMap[lid]; - eMap[lid]=gid; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int xoffset = v+dims*gid; - meshNodes[moffset] = nodes[xoffset]; - } - } - } - } - if (geom_to_allocate) - { - geom->meshNodes.allocate(dims, numDofs, elements); - geom->eMap.allocate(numDofs, elements); - } - { - geom->meshNodes = meshNodes; - geom->eMap = eMap; - } - if (orderedByNODES) { ReorderByNodes(nodes); } - if (geom_to_allocate) - { - geom->J.allocate(dims, dims, numQuad, elements); - geom->invJ.allocate(dims, dims, numQuad, elements); - geom->detJ.allocate(numQuad, elements); - } - - const CudaDofQuadMaps* maps = CudaDofQuadMaps::GetSimplexMaps(fe, ir); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - -// *************************************************************************** -void CudaGeometry::ReorderByVDim(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int 
vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k=0; - for (int d = 0; d < ndofs; d++) - for (int v = 0; v < vdim; v++) - { - temp[k++] = data[d+v*ndofs]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// *************************************************************************** -void CudaGeometry::ReorderByNodes(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k = 0; - for (int j = 0; j < ndofs; j++) - for (int i = 0; i < vdim; i++) - { - temp[j+i*ndofs] = data[k++]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// ***************************************************************************** -// * CudaDofQuadMaps -// ***************************************************************************** -static std::map AllDofQuadMaps; - -// *************************************************************************** -CudaDofQuadMaps::~CudaDofQuadMaps() {} - -// ***************************************************************************** -void CudaDofQuadMaps::delCudaDofQuadMaps() -{ - for (std::map::iterator itr = AllDofQuadMaps.begin(); - itr != AllDofQuadMaps.end(); - itr++) - { - delete itr->second; - } -} - -// ***************************************************************************** -CudaDofQuadMaps* CudaDofQuadMaps::Get(const CudaFiniteElementSpace& fespace, - const IntegrationRule& ir, - const bool transpose) -{ - return Get(*fespace.GetFE(0),*fespace.GetFE(0),ir,transpose); -} - -CudaDofQuadMaps* CudaDofQuadMaps::Get(const CudaFiniteElementSpace& - trialFESpace, - const CudaFiniteElementSpace& testFESpace, - const IntegrationRule& ir, - const bool transpose) -{ - return 
Get(*trialFESpace.GetFE(0),*testFESpace.GetFE(0),ir,transpose); -} - -CudaDofQuadMaps* CudaDofQuadMaps::Get(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - return GetTensorMaps(trialFE, testFE, ir, transpose); -} - -// ***************************************************************************** -CudaDofQuadMaps* CudaDofQuadMaps::GetTensorMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& trialTFE = - dynamic_cast(trialFE); - const TensorBasisElement& testTFE = - dynamic_cast(testFE); - std::stringstream ss; - ss << "TensorMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " BT1:" << trialTFE.GetBasisType() - << " BT2:" << testTFE.GetBasisType() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - // Otherwise, build them - CudaDofQuadMaps *maps = new CudaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const CudaDofQuadMaps* trialMaps = GetD2QTensorMaps(trialFE, ir); - const CudaDofQuadMaps* testMaps = GetD2QTensorMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// ***************************************************************************** -CudaDofQuadMaps* CudaDofQuadMaps::GetD2QTensorMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& tfe = dynamic_cast(fe); - const Poly_1D::Basis& basis = tfe.GetBasis1D(); - const int order = fe.GetOrder(); - const int dofs = order + 1; - const int dims = fe.GetDim(); - const 
IntegrationRule& ir1D = IntRules.Get(Geometry::SEGMENT, ir.GetOrder()); - const int quadPoints = ir1D.GetNPoints(); - const int quadPoints2D = quadPoints*quadPoints; - const int quadPoints3D = quadPoints2D*quadPoints; - const int quadPointsND = ((dims == 1) ? quadPoints : - ((dims == 2) ? quadPoints2D : quadPoints3D)); - std::stringstream ss ; - ss << "D2QTensorMap:" - << " order:" << order - << " dofs:" << dofs - << " dims:" << dims - << " quadPoints:"<hash = hash; - - maps->dofToQuad.allocate(quadPoints, dofs,1,1,transpose); - maps->dofToQuadD.allocate(quadPoints, dofs,1,1,transpose); - double* quadWeights1DData = NULL; - if (transpose) - { - // Initialize quad weights only for transpose - maps->quadWeights.allocate(quadPointsND); - quadWeights1DData = ::new double[quadPoints]; - } - mfem::Vector d2q(dofs); - mfem::Vector d2qD(dofs); - Array dofToQuad(quadPoints*dofs); - Array dofToQuadD(quadPoints*dofs); - for (int q = 0; q < quadPoints; ++q) - { - const IntegrationPoint& ip = ir1D.IntPoint(q); - basis.Eval(ip.x, d2q, d2qD); - if (transpose) - { - quadWeights1DData[q] = ip.weight; - } - for (int d = 0; d < dofs; ++d) - { - dofToQuad[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2q[d]; - dofToQuadD[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2qD[d]; - } - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - if (transpose) - { - Array quadWeights(quadPointsND); - for (int q = 0; q < quadPointsND; ++q) - { - const int qx = q % quadPoints; - const int qz = q / quadPoints2D; - const int qy = (q - qz*quadPoints2D) / quadPoints; - double w = quadWeights1DData[qx]; - if (dims > 1) - { - w *= quadWeights1DData[qy]; - } - if (dims > 2) - { - w *= quadWeights1DData[qz]; - } - quadWeights[q] = w; - } - maps->quadWeights = quadWeights; - ::delete [] quadWeights1DData; - } - assert(maps); - return maps; -} - -// ***************************************************************************** -CudaDofQuadMaps* 
CudaDofQuadMaps::GetSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - return GetSimplexMaps(fe, fe, ir, transpose); -} - -// ***************************************************************************** -CudaDofQuadMaps* CudaDofQuadMaps::GetSimplexMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - std::stringstream ss; - ss << "SimplexMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - CudaDofQuadMaps *maps = new CudaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const CudaDofQuadMaps* trialMaps = GetD2QSimplexMaps(trialFE, ir); - const CudaDofQuadMaps* testMaps = GetD2QSimplexMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// ***************************************************************************** -CudaDofQuadMaps* CudaDofQuadMaps::GetD2QSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const int dims = fe.GetDim(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - std::stringstream ss ; - ss << "D2QSimplexMap:" - << " Dim:" << dims - << " numDofs:" << numDofs - << " numQuad:" << numQuad - << " transpose:" << (transpose?"T":"F"); - std::string hash = ss.str(); - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - CudaDofQuadMaps* maps = new CudaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - // Initialize the dof -> quad mapping - 
maps->dofToQuad.allocate(numQuad, numDofs,1,1,transpose); - maps->dofToQuadD.allocate(dims, numQuad, numDofs,1,transpose); - if (transpose) // Initialize quad weights only for transpose - { - maps->quadWeights.allocate(numQuad); - } - Vector d2q(numDofs); - DenseMatrix d2qD(numDofs, dims); - Array quadWeights(numQuad); - Array dofToQuad(numQuad*numDofs); - Array dofToQuadD(dims*numQuad*numDofs); - for (int q = 0; q < numQuad; ++q) - { - const IntegrationPoint& ip = ir.IntPoint(q); - if (transpose) - { - quadWeights[q] = ip.weight; - } - fe.CalcShape(ip, d2q); - fe.CalcDShape(ip, d2qD); - for (int d = 0; d < numDofs; ++d) - { - const double w = d2q[d]; - dofToQuad[maps->dofToQuad.dim()[0]*q + - maps->dofToQuad.dim()[1]*d] = w; - for (int dim = 0; dim < dims; ++dim) - { - const double wD = d2qD(d, dim); - dofToQuadD[maps->dofToQuadD.dim()[0]*dim + - maps->dofToQuadD.dim()[1]*q + - maps->dofToQuadD.dim()[2]*d] = wD; - } - } - } - if (transpose) - { - maps->quadWeights = quadWeights; - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - return maps; -} - - -// ***************************************************************************** -// * Base Integrator -// ***************************************************************************** -void CudaIntegrator::SetIntegrationRule(const IntegrationRule& ir_) -{ - ir = &ir_; -} - -const IntegrationRule& CudaIntegrator::GetIntegrationRule() const -{ - assert(ir); - return *ir; -} - -void CudaIntegrator::SetupIntegrator(CudaBilinearForm& bform_, - const CudaIntegratorType itype_) -{ - mesh = &(bform_.GetMesh()); - trialFESpace = &(bform_.GetTrialFESpace()); - testFESpace = &(bform_.GetTestFESpace()); - itype = itype_; - if (ir == NULL) { assert(false); } - maps = CudaDofQuadMaps::Get(*trialFESpace,*testFESpace,*ir); - mapsTranspose = CudaDofQuadMaps::Get(*testFESpace,*trialFESpace,*ir); - Setup(); -} - -CudaGeometry* CudaIntegrator::GetGeometry() -{ - return CudaGeometry::Get(*trialFESpace, *ir); -} - - -// 
***************************************************************************** -// * Mass Integrator -// ***************************************************************************** -void CudaMassIntegrator::SetupIntegrationRule() -{ - assert(false); -} - -// ***************************************************************************** -void CudaMassIntegrator::Assemble() -{ - if (op.Size()) { return; } - assert(false); -} - -// ***************************************************************************** -void CudaMassIntegrator::SetOperator(CudaVector& v) { op = v; } - -// *************************************************************************** -void CudaMassIntegrator::MultAdd(CudaVector& x, CudaVector& y) -{ - const int dim = mesh->Dimension(); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir->GetOrder()).GetNPoints(); - const int dofs1D = trialFESpace->GetFE(0)->GetOrder() + 1; - if (rconfig::Get().Share()) - rMassMultAddS(dim, - dofs1D, - quad1D, - mesh->GetNE(), - maps->dofToQuad, - maps->dofToQuadD, - maps->quadToDof, - maps->quadToDofD, - op,x,y); - else - rMassMultAdd(dim, - dofs1D, - quad1D, - mesh->GetNE(), - maps->dofToQuad, - maps->dofToQuadD, - maps->quadToDof, - maps->quadToDofD, - op,x,y); -} - -} // namespace mfem - diff --git a/cuda/cuda/fem/bilininteg.hpp b/cuda/cuda/fem/bilininteg.hpp deleted file mode 100644 index 97e36f7b..00000000 --- a/cuda/cuda/fem/bilininteg.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_BILININTEG -#define LAGHOS_CUDA_BILININTEG - -namespace mfem -{ - -// *************************************************************************** -// * CudaGeometry -// *************************************************************************** -class CudaGeometry -{ -public: - ~CudaGeometry(); - CudaArray eMap; - CudaArray meshNodes; - CudaArray J, invJ, detJ; - static CudaGeometry* Get(CudaFiniteElementSpace&, - const IntegrationRule&); - static CudaGeometry* Get(CudaFiniteElementSpace&, - const IntegrationRule&, - const CudaVector&); - static void ReorderByVDim(GridFunction& nodes); - static void ReorderByNodes(GridFunction& nodes); -}; - -// *************************************************************************** -// * CudaDofQuadMaps -// *************************************************************************** -class CudaDofQuadMaps -{ -private: - std::string hash; -public: - CudaArray dofToQuad, dofToQuadD; // B - CudaArray quadToDof, quadToDofD; // B^T - CudaArray quadWeights; -public: - ~CudaDofQuadMaps(); - static void delCudaDofQuadMaps(); - static CudaDofQuadMaps* Get(const CudaFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* Get(const CudaFiniteElementSpace&, - const CudaFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* Get(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* GetTensorMaps(const 
FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* GetD2QTensorMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static CudaDofQuadMaps* GetD2QSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); -}; - -// *************************************************************************** -// * Base Integrator -// *************************************************************************** -class CudaIntegrator -{ -protected: - Mesh* mesh = NULL; - CudaFiniteElementSpace* trialFESpace = NULL; - CudaFiniteElementSpace* testFESpace = NULL; - CudaIntegratorType itype; - const IntegrationRule* ir = NULL; - CudaDofQuadMaps* maps; - CudaDofQuadMaps* mapsTranspose; -private: -public: - virtual std::string GetName() = 0; - void SetIntegrationRule(const IntegrationRule& ir_); - const IntegrationRule& GetIntegrationRule() const; - virtual void SetupIntegrationRule() = 0; - virtual void SetupIntegrator(CudaBilinearForm& bform_, - const CudaIntegratorType itype_); - virtual void Setup() = 0; - virtual void Assemble() = 0; - virtual void MultAdd(CudaVector& x, CudaVector& y) = 0; - virtual void MultTransposeAdd(CudaVector&, CudaVector&) {assert(false);} - CudaGeometry* GetGeometry(); -}; - -// *************************************************************************** -// * Mass Integrator -// *************************************************************************** -class CudaMassIntegrator : public CudaIntegrator -{ -private: - CudaVector op; -public: - CudaMassIntegrator() {} - virtual ~CudaMassIntegrator() {} - virtual std::string GetName() {return "MassIntegrator";} - virtual void SetupIntegrationRule(); - virtual 
void Setup() {} - virtual void Assemble(); - void SetOperator(CudaVector& v); - virtual void MultAdd(CudaVector& x, CudaVector& y); -}; - -} // mfem - -#endif // LAGHOS_CUDA_BILININTEG diff --git a/cuda/cuda/fem/conform.cpp b/cuda/cuda/fem/conform.cpp deleted file mode 100644 index 88257f2e..00000000 --- a/cuda/cuda/fem/conform.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * CudaConformingProlongationOperator -// *************************************************************************** -CudaConformingProlongationOperator::CudaConformingProlongationOperator -(ParFiniteElementSpace &pfes): CudaOperator(pfes.GetVSize(), - pfes.GetTrueVSize()), - external_ldofs(), - d_external_ldofs(Height()-Width()), // size can be 0 here - gc(new CudaCommD(pfes)), - kMaxTh(0) -{ - Array ldofs; - Table &group_ldof = gc->GroupLDofTable(); - external_ldofs.Reserve(Height()-Width()); - for (int gr = 1; gr < group_ldof.Size(); gr++) - { - if (!gc->GetGroupTopology().IAmMaster(gr)) - { - ldofs.MakeRef(group_ldof.GetRow(gr), group_ldof.RowSize(gr)); - external_ldofs.Append(ldofs); - } - } - external_ldofs.Sort(); - const int HmW=Height()-Width(); - if (HmW>0) - { - d_external_ldofs=external_ldofs; - } - assert(external_ldofs.Size() == Height()-Width()); - const int m = external_ldofs.Size(); - int j = 0; - for (int i = 0; i < m; i++) - { - 
const int end = external_ldofs[i]; - const int size = end-j; - if (size>kMaxTh) { kMaxTh=size; } - j = end+1; - } -} - -// *************************************************************************** -// * ~CudaConformingProlongationOperator -// *************************************************************************** -CudaConformingProlongationOperator::~CudaConformingProlongationOperator() -{ - delete gc; -} - -// *************************************************************************** -// * CUDA Error Status Check -// *************************************************************************** -void cuLastCheck() -{ - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) - exit(fprintf(stderr, "\n\t\033[31;1m[cuLastCheck] failed: %s\033[m\n", - cudaGetErrorString(cudaStatus))); -} - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_Mult(double *y, const double *x, - const int *external_ldofs, const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j+k]=x[j-i+k]; - } -} -static __global__ -void k_Mult2(double *y, const double *x, const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j+k]=x[j-i+k]; -} - -// *************************************************************************** -// * Device Mult -// *************************************************************************** -void CudaConformingProlongationOperator::d_Mult(const CudaVector &x, - CudaVector &y) const -{ - const double *d_xdata = x.GetData(); - const int in_layout = 2; // 2 - 
input is ltdofs array - gc->d_BcastBegin(const_cast(d_xdata), in_layout); - double *d_ydata = y.GetData(); - int j = 0; - const int m = external_ldofs.Size(); - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; - k_Mult<<<(m+kTpB-1)/kTpB,kTpB>>>(d_ydata,d_xdata,d_external_ldofs,m); - cuLastCheck(); - } - else - { - assert((m/maxXThDim)==0); - assert(kMaxTh>>(d_ydata,d_xdata,d_external_ldofs,m,base); - cuLastCheck(); - } - k_Mult2<<>>(d_ydata,d_xdata,d_external_ldofs,m,0); - cuLastCheck(); - } - j = external_ldofs[m-1]+1; - } - rmemcpy::rDtoD(d_ydata+j,d_xdata+j-m,(Width()+m-j)*sizeof(double)); - const int out_layout = 0; // 0 - output is ldofs array - gc->d_BcastEnd(d_ydata, out_layout); -} - - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_MultTranspose(double *y, const double *x, - const int *external_ldofs, const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j-i+k]=x[j+k]; - } -} - -static __global__ -void k_MultTranspose2(double *y, const double *x, - const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j-i+k]=x[j+k]; -} - -// *************************************************************************** -// * Device MultTranspose -// *************************************************************************** -void CudaConformingProlongationOperator::d_MultTranspose(const CudaVector &x, - CudaVector &y) const -{ - const double *d_xdata = x.GetData(); - gc->d_ReduceBegin(d_xdata); - double *d_ydata = 
y.GetData(); - int j = 0; - const int m = external_ldofs.Size(); - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; - k_MultTranspose<<<(m+kTpB-1)/kTpB,kTpB>>>(d_ydata,d_xdata,d_external_ldofs,m); - cuLastCheck(); - } - else - { - const int TpB = rconfig::Get().MaxXThreadsDim(); - assert(kMaxTh>>(d_ydata,d_xdata,d_external_ldofs,m,base); - cuLastCheck(); - } - k_MultTranspose2<<>>(d_ydata,d_xdata,d_external_ldofs,m,0); - cuLastCheck(); - } - j = external_ldofs[m-1]+1; - } - rmemcpy::rDtoD(d_ydata+j-m,d_xdata+j,(Height()-j)*sizeof(double)); - const int out_layout = 2; // 2 - output is an array on all ltdofs - gc->d_ReduceEnd(d_ydata, out_layout, GroupCommunicator::Sum); -} - -// *************************************************************************** -// * Host Mult -// *************************************************************************** -void CudaConformingProlongationOperator::h_Mult(const Vector &x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - const int in_layout = 2; // 2 - input is ltdofs array - gc->BcastBegin(const_cast(xdata), in_layout); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - std::copy(xdata+j-i, xdata+end-i, ydata+j); - j = end+1; - } - std::copy(xdata+j-m, xdata+Width(), ydata+j); - const int out_layout = 0; // 0 - output is ldofs array - gc->BcastEnd(ydata, out_layout); -} - -// *************************************************************************** -// * Host MultTranspose -// *************************************************************************** -void CudaConformingProlongationOperator::h_MultTranspose(const Vector &x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - gc->ReduceBegin(xdata); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = 
external_ldofs[i]; - std::copy(xdata+j, xdata+end, ydata+j-i); - j = end+1; - } - std::copy(xdata+j, xdata+Height(), ydata+j-m); - const int out_layout = 2; // 2 - output is an array on all ltdofs - gc->ReduceEnd(ydata, out_layout, GroupCommunicator::Sum); -} - -} // namespace mfem diff --git a/cuda/cuda/fem/conform.hpp b/cuda/cuda/fem/conform.hpp deleted file mode 100644 index 9c62e5f7..00000000 --- a/cuda/cuda/fem/conform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_CONFORM_PROLONGATION_OP -#define LAGHOS_CUDA_CONFORM_PROLONGATION_OP - -namespace mfem -{ - -// *************************************************************************** -// * CudaConformingProlongationOperator -// ************************************************************************** -class CudaConformingProlongationOperator : public CudaOperator -{ -protected: - Array external_ldofs; - CudaArray d_external_ldofs; - CudaCommD *gc; - int kMaxTh; -public: - CudaConformingProlongationOperator(ParFiniteElementSpace &); - ~CudaConformingProlongationOperator(); - void d_Mult(const CudaVector &x, CudaVector &y) const; - void d_MultTranspose(const CudaVector &x, CudaVector &y) const; - void h_Mult(const Vector &x, Vector &y) const; - void h_MultTranspose(const Vector &x, Vector &y) const; -}; - -} // mfem - -#endif // LAGHOS_CUDA_CONFORM_PROLONGATION_OP diff --git a/cuda/cuda/fem/cuGridfunc.cpp b/cuda/cuda/fem/cuGridfunc.cpp deleted file mode 100644 index a89d1d6e..00000000 --- a/cuda/cuda/fem/cuGridfunc.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -void CudaGridFunction::ToQuad(const IntegrationRule& ir, - CudaVector& quadValues) -{ - const FiniteElement& fe = *(fes.GetFE(0)); - const int dim = fe.GetDim(); - const int vdim = fes.GetVDim(); - const int elements = fes.GetNE(); - const int numQuad = ir.GetNPoints(); - const CudaDofQuadMaps* maps = CudaDofQuadMaps::Get(fes, ir); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir.GetOrder()).GetNPoints(); - const int dofs1D =fes.GetFE(0)->GetOrder() + 1; - quadValues.SetSize(numQuad * elements); - if (rconfig::Get().Share()) - { - rGridFuncToQuadS(dim,vdim,dofs1D,quad1D,elements, - maps->dofToQuad, - fes.GetLocalToGlobalMap(), - ptr(), - quadValues); - } - else - rGridFuncToQuad(dim,vdim,dofs1D,quad1D,elements, - maps->dofToQuad, - fes.GetLocalToGlobalMap(), - ptr(), - quadValues); -} - -} // mfem diff --git a/cuda/cuda/fem/cuGridfunc.hpp b/cuda/cuda/fem/cuGridfunc.hpp deleted file mode 100644 index 9f2bce92..00000000 --- a/cuda/cuda/fem/cuGridfunc.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_GRIDFUNC -#define LAGHOS_CUDA_GRIDFUNC - -namespace mfem -{ - -class CudaGridFunction : public CudaVector -{ -public: - const CudaFiniteElementSpace& fes; -public: - - CudaGridFunction(const CudaFiniteElementSpace& f): - CudaVector(f.GetVSize()),fes(f) {} - - CudaGridFunction(const CudaFiniteElementSpace& f,const CudaVector* v): - CudaVector(v), fes(f) {} - - void ToQuad(const IntegrationRule&,CudaVector&); - - CudaGridFunction& operator=(const CudaVector& v) - { - CudaVector::operator=(v); - return *this; - } - CudaGridFunction& operator=(const Vector& v) - { - CudaVector::operator=(v); - return *this; - } -}; - -} // mfem - -#endif // LAGHOS_CUDA_GRIDFUNC diff --git a/cuda/cuda/fem/fespace.cpp b/cuda/cuda/fem/fespace.cpp deleted file mode 100644 index 78877f26..00000000 --- a/cuda/cuda/fem/fespace.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * CudaFiniteElementSpace -// *************************************************************************** -CudaFiniteElementSpace::CudaFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_, - Ordering::Type ordering_) - :ParFiniteElementSpace(static_cast(mesh),fec,vdim_,ordering_), - globalDofs(GetNDofs()), - localDofs(GetFE(0)->GetDof()), - offsets(globalDofs+1), - indices(localDofs, GetNE()), - map(localDofs, GetNE()) -{ - const FiniteElement *fe = GetFE(0); - const TensorBasisElement* el = dynamic_cast(fe); - const Array &dof_map = el->GetDofMap(); - const bool dof_map_is_identity = (dof_map.Size()==0); - - const Table& e2dTable = GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - const int elements = GetNE(); - Array h_offsets(globalDofs+1); - // We'll be keeping a count of how many local nodes point to its global dof - for (int i = 0; i <= globalDofs; ++i) - { - h_offsets[i] = 0; - } - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int gid = elementMap[localDofs*e + d]; - ++h_offsets[gid + 1]; - } - } - // Aggregate to find offsets for each global dof - for (int i = 1; i <= globalDofs; ++i) - { - h_offsets[i] += h_offsets[i - 1]; - } - - Array h_indices(localDofs*elements); - Array h_map(localDofs*elements); - // For each global dof, fill in all local nodes that point to it - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int did = dof_map_is_identity?d:dof_map[d]; - const int gid = elementMap[localDofs*e + did]; - const int lid = localDofs*e + d; - h_indices[h_offsets[gid]++] = lid; - h_map[lid] = gid; - } - } - - // We shifted the offsets vector by 1 by using it as a counter - // Now we shift it back. 
- for (int i = globalDofs; i > 0; --i) - { - h_offsets[i] = h_offsets[i - 1]; - } - h_offsets[0] = 0; - - offsets = h_offsets; - indices = h_indices; - map = h_map; - - const SparseMatrix* R = GetRestrictionMatrix(); assert(R); - const CudaConformingProlongationOperator *P = - new CudaConformingProlongationOperator(*this); - - const int mHeight = R->Height(); - const int* I = R->GetI(); - const int* J = R->GetJ(); - int trueCount = 0; - for (int i = 0; i < mHeight; ++i) - { - trueCount += ((I[i + 1] - I[i]) == 1); - } - - Array h_reorderIndices(2*trueCount); - for (int i = 0, trueIdx=0; i < mHeight; ++i) - { - if ((I[i + 1] - I[i]) == 1) - { - h_reorderIndices[trueIdx++] = J[I[i]]; - h_reorderIndices[trueIdx++] = i; - } - } - - reorderIndices = ::new CudaArray(2*trueCount); - *reorderIndices = h_reorderIndices; - - restrictionOp = new CudaRestrictionOperator(R->Height(), - R->Width(), - reorderIndices); - prolongationOp = new CudaProlongationOperator(P); -} - -// *************************************************************************** -CudaFiniteElementSpace::~CudaFiniteElementSpace() -{ - ::delete reorderIndices; -} - -// *************************************************************************** -bool CudaFiniteElementSpace::hasTensorBasis() const -{ - assert(dynamic_cast(GetFE(0))); - return true; -} - -// *************************************************************************** -void CudaFiniteElementSpace::GlobalToLocal(const CudaVector& globalVec, - CudaVector& localVec) const -{ - const int vdim = GetVDim(); - const int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rGlobalToLocal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - globalVec, - localVec); -} - -// *************************************************************************** -void CudaFiniteElementSpace::LocalToGlobal(const CudaVector& localVec, - CudaVector& globalVec) const -{ - const int vdim = GetVDim(); - const 
int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rLocalToGlobal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - localVec, - globalVec); -} - -} // namespace mfem diff --git a/cuda/cuda/fem/fespace.hpp b/cuda/cuda/fem/fespace.hpp deleted file mode 100644 index 7766ebfd..00000000 --- a/cuda/cuda/fem/fespace.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_FESPACE -#define LAGHOS_CUDA_FESPACE - -namespace mfem -{ - -// *************************************************************************** -// * CudaFiniteElementSpace -// ************************************************************************** -class CudaFiniteElementSpace : public ParFiniteElementSpace -{ -private: - int globalDofs, localDofs; - CudaArray offsets; - CudaArray indices, *reorderIndices; - CudaArray map; - CudaOperator *restrictionOp, *prolongationOp; -public: - CudaFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_ = 1, - Ordering::Type ordering_ = Ordering::byNODES); - ~CudaFiniteElementSpace(); - // ************************************************************************* - bool hasTensorBasis() const; - int GetLocalDofs() const { return localDofs; } - const CudaOperator* GetRestrictionOperator() { return restrictionOp; } - const CudaOperator* GetProlongationOperator() { return prolongationOp; } - const CudaArray& GetLocalToGlobalMap() const { return map; } - // ************************************************************************* - void GlobalToLocal(const CudaVector&, CudaVector&) const; - void LocalToGlobal(const CudaVector&, CudaVector&) const; -}; - -} // mfem - -#endif // LAGHOS_CUDA_FESPACE diff --git a/cuda/cuda/fem/prolong.cpp b/cuda/cuda/fem/prolong.cpp deleted file mode 100644 index c70b3d23..00000000 --- a/cuda/cuda/fem/prolong.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * CudaProlongationOperator -// *************************************************************************** -CudaProlongationOperator::CudaProlongationOperator -(const CudaConformingProlongationOperator* Op): - CudaOperator(Op->Height(), Op->Width()),pmat(Op) {} - -// *************************************************************************** -void CudaProlongationOperator::Mult(const CudaVector& x, - CudaVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_Mult(x, y); - return; - } - const Vector hostX=x;//D2H - Vector hostY(y.Size()); - pmat->h_Mult(hostX, hostY); - y=hostY;//H2D -} - -// *************************************************************************** -void CudaProlongationOperator::MultTranspose(const CudaVector& x, - CudaVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_MultTranspose(x, y); - return; - } - const Vector hostX=x; - Vector hostY(y.Size()); - pmat->h_MultTranspose(hostX, hostY); - y=hostY;//H2D -} - -} // namespace mfem diff --git a/cuda/cuda/fem/prolong.hpp b/cuda/cuda/fem/prolong.hpp deleted file mode 100644 index 6dc13cdf..00000000 --- a/cuda/cuda/fem/prolong.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_PROLONG_OP -#define LAGHOS_CUDA_PROLONG_OP - -namespace mfem -{ - -// *************************************************************************** -// * CudaProlongationOperator -// *************************************************************************** -class CudaProlongationOperator : public CudaOperator -{ -protected: - const CudaConformingProlongationOperator* pmat = NULL; -public: - CudaProlongationOperator(const CudaConformingProlongationOperator*); - void Mult(const CudaVector& x, CudaVector& y) const; - void MultTranspose(const CudaVector& x, CudaVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_CUDA_PROLONG_OP diff --git a/cuda/cuda/fem/restrict.cpp b/cuda/cuda/fem/restrict.cpp deleted file mode 100644 index 39a274fc..00000000 --- a/cuda/cuda/fem/restrict.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * CudaRestrictionOperator -// *************************************************************************** -void CudaRestrictionOperator::Mult(const CudaVector& x, - CudaVector& y) const -{ - rExtractSubVector(entries, indices->ptr(), x, y); -} - -} // namespace mfem diff --git a/cuda/cuda/fem/restrict.hpp b/cuda/cuda/fem/restrict.hpp deleted file mode 100644 index ad125efb..00000000 --- a/cuda/cuda/fem/restrict.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_RESTRICT_OP -#define LAGHOS_CUDA_RESTRICT_OP - -namespace mfem -{ - -// *************************************************************************** -// * CudaRestrictionOperator -// *************************************************************************** -class CudaRestrictionOperator : public CudaOperator -{ -protected: - int entries; - const CudaArray *indices; -public: - CudaRestrictionOperator(const int h, const int w, - const CudaArray *idx): - CudaOperator(h,w), - entries(idx->size()>>1), - indices(idx) {} - void Mult(const CudaVector& x, CudaVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_CUDA_RESTRICT_OP diff --git a/cuda/cuda/general/array.hpp b/cuda/cuda/general/array.hpp deleted file mode 100644 index b92b8415..00000000 --- a/cuda/cuda/general/array.hpp +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_ARRAY -#define LAGHOS_CUDA_ARRAY - -namespace mfem -{ - -template class CudaArray; - -// Partial Specializations for xyz==TRUE ************************************* -template class CudaArray : public rmalloc -{ -private: - T* data = NULL; - size_t sz,d[4]; -public: - CudaArray():data(NULL),sz(0),d{0,0,0,0} {} - CudaArray(const size_t x) {allocate(x);} - CudaArray(const size_t x,const size_t y) {allocate(x,y);} - CudaArray(const CudaArray &r) {assert(false);} - CudaArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - ~CudaArray() {rmalloc::operator delete (data);} - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline const T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const CudaArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - data=(T*) rmalloc::operator new (sz); - } - inline T& operator[](const size_t x) { return data[x]; } - inline T& operator()(const size_t x, const size_t y) - { - return data[x + d[0]*y]; - } - inline T& operator()(const size_t x, const size_t y, const size_t z) - { - return data[x + d[0]*(y + d[1]*z)]; - } - void Print(std::ostream& out= std::cout, int width = 8) const - { - T *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i class CudaArray : public rmalloc -{ -private: - static const int DIM = 4; - T* data = NULL; - size_t sz,d[DIM]; -public: - CudaArray():data(NULL),sz(0),d{0,0,0,0} {} - 
CudaArray(const size_t d0) {allocate(d0);} - CudaArray(const CudaArray &r) {assert(false);} - ~CudaArray() {rmalloc::operator delete (data);} - CudaArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const CudaArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - assert(sz>0); - data=(T*) rmalloc::operator new (sz); -#define xsw(a,b) a^=b^=a^=b - if (transposed) { xsw(d[0],d[1]); } - for (size_t i=1,b=d[0]; i static __global__ -void k_CopyGroupToBuffer(T *buf,const T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - buf[j]=data[idx]; -} - -// *************************************************************************** -// *************************************************************************** -template static -T *d_CopyGroupToBuffer_k(const T *d_ldata,T *d_buf, - const CudaTable &d_dofs, - const int group) -{ - const int ndofs = d_dofs.RowSize(group); - const int *dofs = d_dofs.GetRow(group); - k_CopyGroupToBuffer<<>>(d_buf,d_ldata,dofs); - return d_buf + ndofs; -} - -// *************************************************************************** -// * d_CopyGroupToBuffer -// *************************************************************************** -template -T *CudaCommD::d_CopyGroupToBuffer(const T *d_ldata, T *d_buf, - int group, int 
layout) const -{ - if (layout==2) // master - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ltdof,group); - } - if (layout==0) // slave - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ldof,group); - } - assert(false); - return 0; -} - -// *************************************************************************** -// * k_CopyGroupFromBuffer -// *************************************************************************** -template static __global__ -void k_CopyGroupFromBuffer(const T *buf,T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - data[idx]=buf[j]; -} - -// *************************************************************************** -// * d_CopyGroupFromBuffer -// *************************************************************************** -template -const T *CudaCommD::d_CopyGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout) const -{ - assert(layout==0); - const int ndofs = d_group_ldof.RowSize(group); - const int *dofs = d_group_ldof.GetRow(group); - k_CopyGroupFromBuffer<<>>(d_buf,d_ldata,dofs); - return d_buf + ndofs; -} - -// *************************************************************************** -// * kAtomicAdd -// *************************************************************************** -template -static __global__ void kAtomicAdd(T* adrs, const int* dofs,T *value) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[i]; - adrs[idx] += value[i]; -} -template __global__ void kAtomicAdd(int*, const int*, int*); -template __global__ void kAtomicAdd(double*, const int*, double*); - -// *************************************************************************** -// * ReduceGroupFromBuffer -// *************************************************************************** -template -const T *CudaCommD::d_ReduceGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout, - void (*Op)(OpData)) const -{ - OpData opd; - opd.ldata 
= d_ldata; - opd.nldofs = group_ldof.RowSize(group); - opd.nb = 1; - opd.buf = const_cast(d_buf); - opd.ldofs = const_cast(d_group_ltdof.GetRow(group)); - assert(opd.nb == 1); - kAtomicAdd<<>>(opd.ldata,opd.ldofs,opd.buf); - return d_buf + opd.nldofs; -} - - -// *************************************************************************** -// * d_BcastBegin -// *************************************************************************** -template -void CudaCommD::d_BcastBegin(T *d_ldata, int layout) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - - assert(layout==2); - const int rnk = rconfig::Get().Rank(); - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_send_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_send_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - assert(layout==2); - d_buf = d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 2); - buf += d_buf - d_buf_ini; - } - if (!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - - // make sure the device has finished - if (rconfig::Get().Aware()) - { - cudaStreamSynchronize(0);//*rconfig::Get().Stream()); - } - - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - 
request_counter++; - } - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 1; // 1 - locked for Bcast - num_requests = request_counter; -} - -// *************************************************************************** -// * d_BcastEnd -// *************************************************************************** -template -void CudaCommD::d_BcastEnd(T *d_ldata, int layout) -{ - if (comm_lock == 0) { return; } - const int rnk = rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). 
- assert(comm_lock == 1); - // copy the received data from the buffer to d_ldata, as it arrives - int idx; - while (MPI_Waitany(num_requests, requests, &idx, MPI_STATUS_IGNORE), - idx != MPI_UNDEFINED) - { - int nbr = request_marker[idx]; - if (nbr == -1) { continue; } // skip send requests - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_CopyGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * d_ReduceBegin -// *************************************************************************** -template -void CudaCommD::d_ReduceBegin(const T *d_ldata) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - const int rnk = rconfig::Get().Rank(); - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_recv_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_recv_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - d_buf = d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 0); - buf += d_buf - d_buf_ini; - } - if 
(!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - // make sure the device has finished - if (rconfig::Get().Aware()) - { - cudaStreamSynchronize(0);//*rconfig::Get().Stream()); - } - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - request_counter++; - } - - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 2; - num_requests = request_counter; -} - -// *************************************************************************** -// * d_ReduceEnd -// *************************************************************************** -template -void CudaCommD::d_ReduceEnd(T *d_ldata, int layout, - void (*Op)(OpData)) -{ - if (comm_lock == 0) { return; } - const int rnk = rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). 
- assert(comm_lock == 2); - MPI_Waitall(num_requests, requests, MPI_STATUSES_IGNORE); - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - assert(d_group_buf); - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_ReduceGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout, Op); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * instantiate CudaCommD::Bcast and Reduce for doubles -// *************************************************************************** -template void CudaCommD::d_BcastBegin(double*, int); -template void CudaCommD::d_BcastEnd(double*, int); -template void CudaCommD::d_ReduceBegin(const double *); -template void CudaCommD::d_ReduceEnd(double*,int, - void (*)(OpData)); - -} // namespace mfem diff --git a/cuda/cuda/general/commd.hpp b/cuda/cuda/general/commd.hpp deleted file mode 100644 index 4f2e0c1b..00000000 --- a/cuda/cuda/general/commd.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_COMM_D -#define LAGHOS_CUDA_COMM_D - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -// * First communicator, buf goes on the device -// *************************************************************************** -class CudaCommD : public GroupCommunicator, public rmemcpy -{ -private: - CudaTable d_group_ldof; - CudaTable d_group_ltdof; - void *d_group_buf; - int comm_lock; - int num_requests; -public: - CudaCommD(ParFiniteElementSpace&); - ~CudaCommD(); - - template T *d_CopyGroupToBuffer(const T*,T*,int,int) const; - template - const T *d_CopyGroupFromBuffer(const T*, T*,int, int) const; - template - const T *d_ReduceGroupFromBuffer(const T*,T*,int,int, - void (*)(OpData)) const; - - template void d_BcastBegin(T*,int); - template void d_BcastEnd(T*, int); - - template void d_ReduceBegin(const T*); - template void d_ReduceEnd(T*,int,void (*)(OpData)); -}; - - -} // mfem - -#endif // LAGHOS_CUDA_COMM_D diff --git a/cuda/cuda/general/malloc.hpp b/cuda/cuda/general/malloc.hpp deleted file mode 100644 index 628a1371..00000000 --- a/cuda/cuda/general/malloc.hpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_MALLOC -#define LAGHOS_CUDA_MALLOC - -namespace mfem -{ - -// *************************************************************************** -template struct rmalloc: public rmemcpy -{ - - // ************************************************************************* - inline void* operator new (size_t n, bool lock_page = false) - { - if (!rconfig::Get().Cuda()) { return ::new T[n]; } - void *ptr; - if (!rconfig::Get().Uvm()) - { - if (lock_page) { cuMemHostAlloc(&ptr, n*sizeof(T), CU_MEMHOSTALLOC_PORTABLE); } - else { cuMemAlloc((CUdeviceptr*)&ptr, n*sizeof(T)); } - } - else - { - cuMemAllocManaged((CUdeviceptr*)&ptr, n*sizeof(T),CU_MEM_ATTACH_GLOBAL); - } - return ptr; - } - - // *************************************************************************** - inline void operator delete (void *ptr) - { - if (!rconfig::Get().Cuda()) - { - if (ptr) - { - ::delete[] static_cast(ptr); - } - } - else - { - cuMemFree((CUdeviceptr)ptr); // or cuMemFreeHost if page_locked was used - } - ptr = nullptr; - } -}; - -} // mfem - -#endif // LAGHOS_CUDA_MALLOC diff --git a/cuda/cuda/general/memcpy.cpp b/cuda/cuda/general/memcpy.cpp deleted file mode 100644 index fa0d2229..00000000 --- 
a/cuda/cuda/general/memcpy.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -namespace mfem -{ - -// ************************************************************************* -void* rmemcpy::rHtoH(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - std::memcpy(dest,src,bytes); - return dest; -} - -// ************************************************************************* -void* rmemcpy::rHtoD(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - cuMemcpyHtoD((CUdeviceptr)dest,src,bytes); - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } - return dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoH(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - cuMemcpyDtoH(dest,(CUdeviceptr)src,bytes); - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } - return dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoD(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - if (!async) - { - cuMemcpyDtoD((CUdeviceptr)dest,(CUdeviceptr)src,bytes); - } - else - { - const CUstream s = *rconfig::Get().Stream(); - cuMemcpyDtoDAsync((CUdeviceptr)dest,(CUdeviceptr)src,bytes,s); - } - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } - return dest; -} - -} // mfem diff --git a/cuda/cuda/general/memcpy.hpp b/cuda/cuda/general/memcpy.hpp deleted file 
mode 100644 index f44e37e4..00000000 --- a/cuda/cuda/general/memcpy.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_MEMCPY -#define LAGHOS_CUDA_MEMCPY - -namespace mfem -{ - -// *************************************************************************** -struct rmemcpy -{ - static void* rHtoH(void*, const void*, std::size_t, const bool =false); - static void* rHtoD(void*, const void*, std::size_t, const bool =false); - static void* rDtoH(void*, const void*, std::size_t, const bool =false); - static void* rDtoD(void*, const void*, std::size_t, const bool =false); -}; - -} // mfem - -#endif // LAGHOS_CUDA_MEMCPY diff --git a/cuda/cuda/general/table.cpp b/cuda/cuda/general/table.cpp deleted file mode 100644 index 13d53307..00000000 --- a/cuda/cuda/general/table.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. 
See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -// *************************************************************************** -CudaTable::CudaTable(const Table &table) -{ - size = table.Size(); - assert(size > 0); - const int nnz = table.GetI()[size]; - I = new int[size+1]; - J = (int*) operator new (nnz); - rHtoH(I,table.GetI(),sizeof(int)*(size+1)); - if (nnz>0) - { - assert(table.GetJ()); - rHtoD(J,table.GetJ(),sizeof(int)*nnz); - } -} - -} // mfem diff --git a/cuda/cuda/general/table.hpp b/cuda/cuda/general/table.hpp deleted file mode 100644 index 481b7e73..00000000 --- a/cuda/cuda/general/table.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_TABLE -#define LAGHOS_CUDA_TABLE - -namespace mfem -{ - -class CudaTable : public rmalloc -{ -private: - int size = 0; - int *I = NULL; - int *J = NULL; -public: - CudaTable(const Table&); - inline int Size() {return size;} - int RowSize(int i) const { return I[i+1]-I[i]; } - const int *GetRow(int i) const { return J+I[i]; } - int *GetRow(int i) { return J+I[i]; } -}; - -} // mfem - -#endif // LAGHOS_CUDA_TABLE diff --git a/cuda/cuda/kernels/blas/vector_axpy.cpp b/cuda/cuda/kernels/blas/vector_axpy.cpp deleted file mode 100644 index e5802d80..00000000 --- a/cuda/cuda/kernels/blas/vector_axpy.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_axpy0(const int N, - const double alpha, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] += alpha * v1[i]; } -} - - -// ***************************************************************************** -void vector_axpy(const int N, - const double alpha, - double* __restrict v0, - const double* __restrict v1) -{ - cuKer(vector_axpy,N,alpha,v0,v1); -} diff --git a/cuda/cuda/kernels/blas/vector_clear_dofs.cpp b/cuda/cuda/kernels/blas/vector_clear_dofs.cpp deleted file mode 100644 index e5fc382d..00000000 --- a/cuda/cuda/kernels/blas/vector_clear_dofs.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_clear_dofs0(const int N, - double* __restrict v0, - const int* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[v1[i]] = 0.0; } -} - -// ***************************************************************************** -void vector_clear_dofs(const int N, - double* __restrict v0, - const int* __restrict v1) -{ - cuKer(vector_clear_dofs,N,v0,v1); -} diff --git a/cuda/cuda/kernels/blas/vector_dot.cpp b/cuda/cuda/kernels/blas/vector_dot.cpp deleted file mode 100644 index 615e734c..00000000 --- a/cuda/cuda/kernels/blas/vector_dot.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -#define CUDA_BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void cuKernelDot(const size_t N, double *gdsr, - const double *x, const double *y) -{ - __shared__ double s_dot[CUDA_BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_dot[tid] = x[n] * y[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_dot[tid] += s_dot[dualTid]; - } - if (tid==0) { gdsr[bid] = s_dot[0]; } -} - -// ***************************************************************************** -double cuVectorDot(const size_t N, const double *x, const double *y) -{ - const size_t tpb = CUDA_BLOCKSIZE; - const size_t blockSize = CUDA_BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t dot_sz = (N%tpb)==0? 
(N/tpb) : (1+N/tpb); - const size_t bytes = dot_sz*sizeof(double); - static double *h_dot = NULL; - if (!h_dot) { h_dot = (double*)calloc(dot_sz,sizeof(double)); } - static CUdeviceptr gdsr = (CUdeviceptr) NULL; - if (!gdsr) { cuMemAlloc(&gdsr,bytes); } - cuKernelDot<<>>(N, (double*)gdsr, x, y); - cuMemcpy((CUdeviceptr)h_dot,(CUdeviceptr)gdsr,bytes); - double dot = 0.0; - for (size_t i=0; i= 0 ? v1[dof_i] : -v1[-dof_i-1]; - } -} - -// ***************************************************************************** -void vector_get_subvector(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - cuKer(vector_get_subvector,N,v0,v1,v2); -} - diff --git a/cuda/cuda/kernels/blas/vector_map_dofs.cpp b/cuda/cuda/kernels/blas/vector_map_dofs.cpp deleted file mode 100644 index 5f0e291f..00000000 --- a/cuda/cuda/kernels/blas/vector_map_dofs.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_map_dofs0(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) - { - const int idx = v2[i]; - v0[idx] = v1[idx]; - } -} - -// ***************************************************************************** -void vector_map_dofs(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* v2) -{ - cuKer(vector_map_dofs,N,v0,v1,v2); -} diff --git a/cuda/cuda/kernels/blas/vector_min.cpp b/cuda/cuda/kernels/blas/vector_min.cpp deleted file mode 100644 index 9d9d3f1d..00000000 --- a/cuda/cuda/kernels/blas/vector_min.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -#define CUDA_BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void cuKernelMin(const size_t N, double *gdsr, const double *x) -{ - __shared__ double s_min[CUDA_BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_min[tid] = x[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_min[tid] = fmin(s_min[tid],s_min[dualTid]); - } - if (tid==0) { gdsr[bid] = s_min[0]; } -} - -// ***************************************************************************** -double cuVectorMin(const size_t N, const double *x) -{ - const size_t tpb = CUDA_BLOCKSIZE; - const size_t blockSize = CUDA_BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t min_sz = (N%tpb)==0? 
(N/tpb) : (1+N/tpb); - const size_t bytes = min_sz*sizeof(double); - static double *h_min = NULL; - if (!h_min) { h_min = (double*)calloc(min_sz,sizeof(double)); } - static CUdeviceptr gdsr = (CUdeviceptr) NULL; - if (!gdsr) { cuMemAlloc(&gdsr,bytes); } - cuKernelMin<<>>(N, (double*)gdsr, x); - cuMemcpy((CUdeviceptr)h_min,(CUdeviceptr)gdsr,bytes); - double min = HUGE_VAL; - for (size_t i=0; i>>(N,c0,v0); -} diff --git a/cuda/cuda/kernels/blas/vector_set_subvector.cpp b/cuda/cuda/kernels/blas/vector_set_subvector.cpp deleted file mode 100644 index 8aca6153..00000000 --- a/cuda/cuda/kernels/blas/vector_set_subvector.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_set_subvector0(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) - { - const int dof_i = v2[i]; - const bool tst = dof_i >= 0; - const int idx = tst?dof_i:-dof_i-1; - const double value = tst?v1[i]:-v1[i]; - v0[idx]=value; - } -} - -// ***************************************************************************** -void vector_set_subvector(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - cuKer(vector_set_subvector,N,v0,v1,v2); -} diff --git a/cuda/cuda/kernels/blas/vector_set_subvector_const.cpp b/cuda/cuda/kernels/blas/vector_set_subvector_const.cpp deleted file mode 100644 index 5b105708..00000000 --- a/cuda/cuda/kernels/blas/vector_set_subvector_const.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_set_subvector_const0(const int N, - const double value, - double* __restrict data, - const int* __restrict tdofs) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= N) { return; } - const int dof_i = tdofs[i]; - data[dof_i] = value; - if (dof_i >= 0) - { - data[dof_i] = value; - } - else - { - data[-dof_i-1] = -value; - } -} - -// ***************************************************************************** -void vector_set_subvector_const(const int N, - const double value, - double* __restrict data, - const int* __restrict tdofs) -{ - cuKer(vector_set_subvector_const,N,value,data,tdofs); -} diff --git a/cuda/cuda/kernels/blas/vector_vec_add.cpp b/cuda/cuda/kernels/blas/vector_vec_add.cpp deleted file mode 100644 index bf50e4b9..00000000 --- a/cuda/cuda/kernels/blas/vector_vec_add.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_add0(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] += v1[i]; } -} - -// ***************************************************************************** -void vector_vec_add(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - cuKer(vector_vec_add,N,v0,v1); -} diff --git a/cuda/cuda/kernels/blas/vector_vec_mul.cpp b/cuda/cuda/kernels/blas/vector_vec_mul.cpp deleted file mode 100644 index 1abc6390..00000000 --- a/cuda/cuda/kernels/blas/vector_vec_mul.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_mul0(const int N, - double* __restrict v0, - const double d) -{ - const int i = blockDim.x*blockIdx.x+threadIdx.x; - if (i < N) { v0[i]*=d; } -} - -// ***************************************************************************** -void vector_vec_mul(const int N, - double* __restrict v0, - const double d) -{ - cuKer(vector_vec_mul,N,v0,d); -} diff --git a/cuda/cuda/kernels/blas/vector_vec_sub.cpp b/cuda/cuda/kernels/blas/vector_vec_sub.cpp deleted file mode 100644 index 7f4b19da..00000000 --- a/cuda/cuda/kernels/blas/vector_vec_sub.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_sub0(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] -= v1[i]; } -} - -// ***************************************************************************** -void vector_vec_sub(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - cuKer(vector_vec_sub,N,v0,v1); -} diff --git a/cuda/cuda/kernels/blas/vector_xpay.cpp b/cuda/cuda/kernels/blas/vector_xpay.cpp deleted file mode 100644 index e28ff5ce..00000000 --- a/cuda/cuda/kernels/blas/vector_xpay.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_xpay0(const int N, - const double c0, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] = v1[i] + (c0 * v2[i]); } -} - -// ***************************************************************************** -void vector_xpay(const int N, - const double c0, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - cuKer(vector_xpay,N,c0,v0,v1,v2); -} diff --git a/cuda/cuda/kernels/blas/vector_xsy.cpp b/cuda/cuda/kernels/blas/vector_xsy.cpp deleted file mode 100644 index 34516f30..00000000 --- a/cuda/cuda/kernels/blas/vector_xsy.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_xsy0(const int N, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] = v1[i]-v2[i]; } -} - -// ***************************************************************************** -void vector_xsy(const int N, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - cuKer(vector_xsy,N,v0,v1,v2); -} diff --git a/cuda/cuda/kernels/cuda.hpp b/cuda/cuda/kernels/cuda.hpp deleted file mode 100644 index 504c9c95..00000000 --- a/cuda/cuda/kernels/cuda.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_KERNELS_CUDA -#define LAGHOS_CUDA_KERNELS_CUDA - -// ***************************************************************************** -#include -#include -#include -#include -#include -#include -#include - -// ***************************************************************************** -#define LOG2(X) ((unsigned) (8*sizeof(unsigned long long)-__builtin_clzll((X)))) -#define ISQRT(N) sqrt(static_cast(N)) -#define ICBRT(N) cbrt(static_cast(N)) -#define IROOT(D,N) ((D==1)?N:(D==2)?ISQRT(N):(D==3)?ICBRT(N):0) - -// ***************************************************************************** -#include - -// ***************************************************************************** -#include "../config/config.hpp" -#include "../general/memcpy.hpp" -#include "../general/malloc.hpp" - -// ***************************************************************************** -#include "include/forall.hpp" -#include "include/offsets.hpp" -#include "include/kernels.hpp" - -#endif // LAGHOS_CUDA_KERNELS_CUDA diff --git a/cuda/cuda/kernels/force/force.cpp b/cuda/cuda/kernels/force/force.cpp deleted file mode 100644 index bd3d1acb..00000000 --- a/cuda/cuda/kernels/force/force.cpp +++ /dev/null @@ -1,654 +0,0 @@ - -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -static void rForceMult2D(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijkN(dx,dy,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int c = 0; c < 2; ++c) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] = 0.0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dxy[H1_DOFS_1D]; - double xy[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] = 0.0; - xy[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double esx = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * 
stressJinvT[ijklmNM(0,c,qx,qy, - el,NUM_DIM,NUM_QUAD_1D)]; - const double esy = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(1,c,qx,qy, - el,NUM_DIM,NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - xy[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] += wy* Dxy[dx] + wDy*xy[dx]; - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -static void rForceMultTranspose2D(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double vStress[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - double v_Dxy[NUM_QUAD_2D]; - double v_xDy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - v_Dxy[i] = v_xDy[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double v_x[NUM_QUAD_1D]; - double v_Dx[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] = v_Dx[qx] = 0; - } - - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - v_Dx[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = 
H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] += v_Dx[qx] * wy; - v_xDy[ijN(qx,qy,NUM_QUAD_1D)] += v_x[qx] * wDy; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijN(qx,qy,NUM_QUAD_1D)] += - ((v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)]) + - (v_xDy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)])); - } - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijN(qx,qy,NUM_QUAD_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rForceMult3D(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double e_xyz[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - e_xyz[i] = 0; - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 
0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijklN(dx,dy,dz,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = L2DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)] += wz * e_xy[ijN(qx,qy,NUM_QUAD_1D)]; - } - } - } - } - for (int c = 0; c < 3; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double Dxy_x[H1_DOFS_1D * H1_DOFS_1D]; - double xDy_y[H1_DOFS_1D * H1_DOFS_1D]; - double xy_z[H1_DOFS_1D * H1_DOFS_1D] ; - for (int d = 0; d < (H1_DOFS_1D * H1_DOFS_1D); ++d) - { - Dxy_x[d] = xDy_y[d] = xy_z[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dx_x[H1_DOFS_1D]; - double x_y[H1_DOFS_1D]; - double x_z[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dx_x[dx] = x_y[dx] = x_z[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_e = e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - const double esx = r_e * stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - const double esy = r_e * stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - const double esz = r_e * stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { 
- Dx_x[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - x_y[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - x_z[dx] += esz * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy_x[ijN(dx,dy,H1_DOFS_1D)] += Dx_x[dx] * wy; - xDy_y[ijN(dx,dy,H1_DOFS_1D)] += x_y[dx] * wDy; - xy_z[ijN(dx,dy,H1_DOFS_1D)] += x_z[dx] * wy; - } - } - } - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - const double wz = H1QuadToDof[ijN(dz,qz,H1_DOFS_1D)]; - const double wDz = H1QuadToDofD[ijN(dz,qz,H1_DOFS_1D)]; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] += - ((Dxy_x[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xDy_y[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xy_z[ijN(dx,dy,H1_DOFS_1D)] * wDz)); - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -static void rForceMultTranspose3D(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double vStress[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - double Dxy_x[NUM_QUAD_2D]; - double xDy_y[NUM_QUAD_2D]; - double xy_z[NUM_QUAD_2D] ; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - Dxy_x[i] = xDy_y[i] = xy_z[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double Dx_x[NUM_QUAD_1D]; - double 
x_y[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] = x_y[qx] = 0; - } - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - x_y[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dxy_x[ijN(qx,qy,NUM_QUAD_1D)] += Dx_x[qx] * wy; - xDy_y[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wDy; - xy_z[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wy; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = H1DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = H1DofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)] += - ((Dxy_x[ijN(qx,qy,NUM_QUAD_1D)]*wz *stressJinvT[ijklmnNM(0,c,qx,qy,qz,el, - NUM_DIM,NUM_QUAD_1D)]) + - (xDy_y[ijN(qx,qy,NUM_QUAD_1D)]*wz *stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]) + - (xy_z[ijN(qx,qy,NUM_QUAD_1D)] *wDz*stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)])); - } - } - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double e_xy[L2_DOFS_1D * L2_DOFS_1D]; - for (int d = 0; d < (L2_DOFS_1D * L2_DOFS_1D); ++d) - { - e_xy[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - for (int dx = 0; dx < 
L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_xy[ijN(dx,dy,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - const double w = L2QuadToDof[ijN(dz,qz,L2_DOFS_1D)]; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] += w * e_xy[ijN(dx,dy,L2_DOFS_1D)]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fForceMult)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - -// ***************************************************************************** -void rForceMult(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = (nzones+blck-1)/blck; - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - static std::unordered_map call = - { - {0x20,&rForceMult2D<2,2,2,1,2>}, - {0x21,&rForceMult2D<2,3,4,2,3>}, - {0x22,&rForceMult2D<2,4,6,3,4>}, - {0x23,&rForceMult2D<2,5,8,4,5>}, - {0x24,&rForceMult2D<2,6,10,5,6>}, - {0x25,&rForceMult2D<2,7,12,6,7>}, - {0x26,&rForceMult2D<2,8,14,7,8>}, - {0x27,&rForceMult2D<2,9,16,8,9>}, - 
{0x28,&rForceMult2D<2,10,18,9,10>}, - {0x29,&rForceMult2D<2,11,20,10,11>}, - {0x2A,&rForceMult2D<2,12,22,11,12>}, - {0x2B,&rForceMult2D<2,13,24,12,13>}, - {0x2C,&rForceMult2D<2,14,26,13,14>}, - {0x2D,&rForceMult2D<2,15,28,14,15>}, - {0x2E,&rForceMult2D<2,16,30,15,16>}, - {0x2F,&rForceMult2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMult3D<3,2,2,1,2>}, - {0x31,&rForceMult3D<3,3,4,2,3>}, - {0x32,&rForceMult3D<3,4,6,3,4>}, - {0x33,&rForceMult3D<3,5,8,4,5>}, - {0x34,&rForceMult3D<3,6,10,5,6>}, - {0x35,&rForceMult3D<3,7,12,6,7>}, - {0x36,&rForceMult3D<3,8,14,7,8>}, - {0x37,&rForceMult3D<3,9,16,8,9>}, - {0x38,&rForceMult3D<3,10,18,9,10>}, - {0x39,&rForceMult3D<3,11,20,10,11>}, - {0x3A,&rForceMult3D<3,12,22,11,12>}, - {0x3B,&rForceMult3D<3,13,24,12,13>}, - {0x3C,&rForceMult3D<3,14,26,13,14>}, - {0x3D,&rForceMult3D<3,15,28,14,15>}, - {0x3E,&rForceMult3D<3,16,30,15,16>}, - {0x3F,&rForceMult3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMult] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,e,v); -} - -// ***************************************************************************** -typedef void (*fForceMultTranspose)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rForceMultTranspose(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = 
(nzones+blck-1)/blck; - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rForceMultTranspose2D<2,2,2,1,2>}, - {0x21,&rForceMultTranspose2D<2,3,4,2,3>}, - {0x22,&rForceMultTranspose2D<2,4,6,3,4>}, - {0x23,&rForceMultTranspose2D<2,5,8,4,5>}, - {0x24,&rForceMultTranspose2D<2,6,10,5,6>}, - {0x25,&rForceMultTranspose2D<2,7,12,6,7>}, - {0x26,&rForceMultTranspose2D<2,8,14,7,8>}, - {0x27,&rForceMultTranspose2D<2,9,16,8,9>}, - {0x28,&rForceMultTranspose2D<2,10,18,9,10>}, - {0x29,&rForceMultTranspose2D<2,11,20,10,11>}, - {0x2A,&rForceMultTranspose2D<2,12,22,11,12>}, - {0x2B,&rForceMultTranspose2D<2,13,24,12,13>}, - {0x2C,&rForceMultTranspose2D<2,14,26,13,14>}, - {0x2D,&rForceMultTranspose2D<2,15,28,14,15>}, - {0x2E,&rForceMultTranspose2D<2,16,30,15,16>}, - {0x2F,&rForceMultTranspose2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMultTranspose3D<3,2,2,1,2>}, - {0x31,&rForceMultTranspose3D<3,3,4,2,3>}, - {0x32,&rForceMultTranspose3D<3,4,6,3,4>}, - {0x33,&rForceMultTranspose3D<3,5,8,4,5>}, - {0x34,&rForceMultTranspose3D<3,6,10,5,6>}, - {0x35,&rForceMultTranspose3D<3,7,12,6,7>}, - {0x36,&rForceMultTranspose3D<3,8,14,7,8>}, - {0x37,&rForceMultTranspose3D<3,9,16,8,9>}, - {0x38,&rForceMultTranspose3D<3,10,18,9,10>}, - {0x39,&rForceMultTranspose3D<3,11,20,10,11>}, - {0x3A,&rForceMultTranspose3D<3,12,22,11,12>}, - {0x3B,&rForceMultTranspose3D<3,13,24,12,13>}, - {0x3C,&rForceMultTranspose3D<3,14,26,13,14>}, - {0x3D,&rForceMultTranspose3D<3,15,28,14,15>}, - {0x3E,&rForceMultTranspose3D<3,16,30,15,16>}, - {0x3F,&rForceMultTranspose3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMultTranspose] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,v,e); -} - diff 
--git a/cuda/cuda/kernels/geom/initGeom.cpp b/cuda/cuda/kernels/geom/initGeom.cpp deleted file mode 100644 index 0d4c529c..00000000 --- a/cuda/cuda/kernels/geom/initGeom.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -kernel -void rNodeCopyByVDim0(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < elements) - { - for (int dof = 0; dof < numDofs; ++dof) - { - const int lid = dof+numDofs*e; - const int gid = eMap[lid]; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int voffset = gid+v*ndofs; - nodes[moffset] = Sx[voffset]; - } - } - } -} - -// ***************************************************************************** -void rNodeCopyByVDim(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes) -{ - cuKer(rNodeCopyByVDim,elements,numDofs,ndofs,dims,eMap,Sx,nodes); -} - - -// ***************************************************************************** -template kernel -void rIniGeom1D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double s_nodes[NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[d] = nodes[ijkN(0,d,e,NUM_QUAD)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijN(q,d,NUM_DOFS)]; - J11 += wx * s_nodes[d]; - } - J[ijN(q,e,NUM_QUAD)] = J11; - invJ[ijN(q, e,NUM_QUAD)] = 1.0 / J11; - detJ[ijN(q, e,NUM_QUAD)] = J11; - } - } -} - -// ***************************************************************************** -template kernel -void rIniGeom2D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, 
- double* restrict invJ, - double* restrict detJ) -{ - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double s_nodes[2 * NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d +=NUM_QUAD) - { - s_nodes[ijN(0,d,2)] = nodes[ijkNM(0,d,el,2,NUM_DOFS)]; - s_nodes[ijN(1,d,2)] = nodes[ijkNM(1,d,el,2,NUM_DOFS)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; - double J21 = 0; double J22 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0,q,d,2,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1,q,d,2,NUM_QUAD)]; - const double x = s_nodes[ijN(0,d,2)]; - const double y = s_nodes[ijN(1,d,2)]; - J11 += (wx * x); J12 += (wx * y); - J21 += (wy * x); J22 += (wy * y); - } - const double r_detJ = (J11 * J22)-(J12 * J21); - J[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, el,2,NUM_QUAD)] = J12; - J[ijklNM(0, 1, q, el,2,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J22; - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J22 * r_idetJ; - invJ[ijklNM(1, 0, q, el,2,NUM_QUAD)] = -J12 * r_idetJ; - invJ[ijklNM(0, 1, q, el,2,NUM_QUAD)] = -J21 * r_idetJ; - invJ[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J11 * r_idetJ; - detJ[ijN(q, el,NUM_QUAD)] = r_detJ; - } - } -} - -// ***************************************************************************** -template kernel -void rIniGeom3D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double s_nodes[3*NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[ijN(0,d,3)] = nodes[ijkNM(0, d, e,3,NUM_DOFS)]; - s_nodes[ijN(1,d,3)] = nodes[ijkNM(1, d, e,3,NUM_DOFS)]; - s_nodes[ijN(2,d,3)] = nodes[ijkNM(2, d, e,3,NUM_DOFS)]; 
- } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; double J13 = 0; - double J21 = 0; double J22 = 0; double J23 = 0; - double J31 = 0; double J32 = 0; double J33 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0, q, d,3,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1, q, d,3,NUM_QUAD)]; - const double wz = dofToQuadD[ijkNM(2, q, d,3,NUM_QUAD)]; - const double x = s_nodes[ijN(0, d,3)]; - const double y = s_nodes[ijN(1, d,3)]; - const double z = s_nodes[ijN(2, d,3)]; - J11 += (wx * x); J12 += (wx * y); J13 += (wx * z); - J21 += (wy * x); J22 += (wy * y); J23 += (wy * z); - J31 += (wz * x); J32 += (wz * y); J33 += (wz * z); - } - const double r_detJ = ((J11 * J22 * J33) + (J12 * J23 * J31) + - (J13 * J21 * J32) - - (J13 * J22 * J31)-(J12 * J21 * J33)-(J11 * J23 * J32)); - J[ijklNM(0, 0, q, e,3,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, e,3,NUM_QUAD)] = J12; - J[ijklNM(2, 0, q, e,3,NUM_QUAD)] = J13; - J[ijklNM(0, 1, q, e,3,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, e,3,NUM_QUAD)] = J22; - J[ijklNM(2, 1, q, e,3,NUM_QUAD)] = J23; - J[ijklNM(0, 2, q, e,3,NUM_QUAD)] = J31; - J[ijklNM(1, 2, q, e,3,NUM_QUAD)] = J32; - J[ijklNM(2, 2, q, e,3,NUM_QUAD)] = J33; - - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J22 * J33)-(J23 * J32)); - invJ[ijklNM(1, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J32 * J13)-(J33 * J12)); - invJ[ijklNM(2, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J12 * J23)-(J13 * J22)); - - invJ[ijklNM(0, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J23 * J31)-(J21 * J33)); - invJ[ijklNM(1, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J33 * J11)-(J31 * J13)); - invJ[ijklNM(2, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J13 * J21)-(J11 * J23)); - - invJ[ijklNM(0, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J21 * J32)-(J22 * J31)); - invJ[ijklNM(1, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J31 * J12)-(J32 * J11)); - invJ[ijklNM(2, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J11 * J22)-(J12 * J21)); - detJ[ijN(q, e,NUM_QUAD)] = 
r_detJ; - } - } -} - -// ***************************************************************************** -typedef void (*fIniGeom)(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ); - - -// ***************************************************************************** -void rIniGeom(const int DIM, - const int NUM_DOFS, - const int NUM_QUAD, - const int numElements, - const double* dofToQuadD, - const double* nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int dofs1D = IROOT(DIM,NUM_DOFS); - const unsigned int quad1D = IROOT(DIM,NUM_QUAD); - const unsigned int id = (DIM<<4)|(dofs1D-2); - assert(LOG2(DIM)<=4); - assert(LOG2(dofs1D-2)<=4); - if (quad1D!=2*(dofs1D-1)) - { - return exit( - printf("\033[31;1m[rIniGeom] order ERROR: -ok=p -ot=p-1, p in [1,16] (%d,%d)\033[m\n", - quad1D,dofs1D)); - } - assert(quad1D==2*(dofs1D-1)); - static std::unordered_map call = - { - // 2D - {0x20,&rIniGeom2D<2*2,(2*2-2)*(2*2-2)>}, - {0x21,&rIniGeom2D<3*3,(3*2-2)*(3*2-2)>}, - {0x22,&rIniGeom2D<4*4,(4*2-2)*(4*2-2)>}, - {0x23,&rIniGeom2D<5*5,(5*2-2)*(5*2-2)>}, - {0x24,&rIniGeom2D<6*6,(6*2-2)*(6*2-2)>}, - {0x25,&rIniGeom2D<7*7,(7*2-2)*(7*2-2)>}, - {0x26,&rIniGeom2D<8*8,(8*2-2)*(8*2-2)>}, - {0x27,&rIniGeom2D<9*9,(9*2-2)*(9*2-2)>}, - {0x28,&rIniGeom2D<10*10,(10*2-2)*(10*2-2)>}, - {0x29,&rIniGeom2D<11*11,(11*2-2)*(11*2-2)>}, - {0x2A,&rIniGeom2D<12*12,(12*2-2)*(12*2-2)>}, - {0x2B,&rIniGeom2D<13*13,(13*2-2)*(13*2-2)>}, - {0x2C,&rIniGeom2D<14*14,(14*2-2)*(14*2-2)>}, - {0x2D,&rIniGeom2D<15*15,(15*2-2)*(15*2-2)>}, - {0x2E,&rIniGeom2D<16*16,(16*2-2)*(16*2-2)>}, - {0x2F,&rIniGeom2D<17*17,(17*2-2)*(17*2-2)>}, - // 3D - {0x30,&rIniGeom3D<2*2*2,2*2*2>}, - {0x31,&rIniGeom3D<3*3*3,4*4*4>}, - {0x32,&rIniGeom3D<4*4*4,6*6*6>}, - {0x33,&rIniGeom3D<5*5*5,8*8*8>}, - 
{0x34,&rIniGeom3D<6*6*6,10*10*10>}, - {0x35,&rIniGeom3D<7*7*7,12*12*12>}, - {0x36,&rIniGeom3D<8*8*8,14*14*14>}, - {0x37,&rIniGeom3D<9*9*9,16*16*16>}, - {0x38,&rIniGeom3D<10*10*10,18*18*18>}, - {0x39,&rIniGeom3D<11*11*11,20*20*20>}, - {0x3A,&rIniGeom3D<12*12*12,22*22*22>}, - {0x3B,&rIniGeom3D<13*13*13,24*24*24>}, - {0x3C,&rIniGeom3D<14*14*14,26*26*26>}, - {0x3D,&rIniGeom3D<15*15*15,28*28*28>}, - {0x3E,&rIniGeom3D<16*16*16,30*30*30>}, - {0x3F,&rIniGeom3D<17*17*17,32*32*32>}, - }; - if (!call[id]) - { - printf("\n[rIniGeom] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuadD,nodes,J,invJ,detJ); -} diff --git a/cuda/cuda/kernels/include/forall.hpp b/cuda/cuda/kernels/include/forall.hpp deleted file mode 100644 index 1d65ca0e..00000000 --- a/cuda/cuda/kernels/include/forall.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_KERNELS_FORALL -#define LAGHOS_CUDA_KERNELS_FORALL - -// ***************************************************************************** -#define CUDA_BLOCK_SIZE 256 - -#define ELEMENT_BATCH 10 -#define M2_ELEMENT_BATCH 32 - -// ***************************************************************************** -#define kernel __global__ -#define share __shared__ -#define sync __syncthreads(); -// ***************************************************************************** -#define cuKer(name,end,...) \ - name ## 0<<<((end+CUDA_BLOCK_SIZE-1)/CUDA_BLOCK_SIZE), \ - CUDA_BLOCK_SIZE>>>(end,__VA_ARGS__) -#define cuKerGBS(name,grid,block,end,...) name ## 0<<>>(end,__VA_ARGS__) -#define call0(id,grid,blck,...) call[id]<<>>(__VA_ARGS__) - -#endif // LAGHOS_CUDA_KERNELS_FORALL diff --git a/cuda/cuda/kernels/include/kernels.hpp b/cuda/cuda/kernels/include/kernels.hpp deleted file mode 100644 index e4ef2731..00000000 --- a/cuda/cuda/kernels/include/kernels.hpp +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_KERNELS -#define LAGHOS_CUDA_KERNELS - -// ***************************************************************************** -#define restrict __restrict__ - -// **** BLAS1 ****************************************************************** -void vector_neg(const int, double* restrict); -void vector_op_eq(const int, const double, double* restrict); -void vector_xpay(const int, const double, - double* restrict, const double* restrict, - const double* restrict); -void vector_xsy(const int, double* restrict, - const double* restrict, const double* restrict); -void vector_axpy(const int, const double, double* restrict, - const double* restrict); -void vector_map_dofs(const int, double* restrict, - const double* restrict, const int* restrict); -void vector_clear_dofs(const int, double* restrict, const int* restrict); -void vector_vec_sub(const int, double* restrict, const double* restrict); -void vector_vec_add(const int, double* restrict, const double* restrict); -void vector_vec_mul(const int, double* restrict, const double); -void vector_set_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_get_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_set_subvector_const(const int, const double, double* restrict, - const int* restrict); -double vector_dot(const int, const double* restrict, const double* restrict); -double vector_min(const int, const double* restrict); - -// ***************************************************************************** -void reduceMin(int, const double*, double*); -void reduceSum(int, const double*, const double*, double*); - -// ***************************************************************************** -void rGridFuncToQuad(const int, const int, const int, - const int, const int, - const double* restrict, const int* restrict, - const double* restrict, double* restrict); - -void rGridFuncToQuadS(const int, const int, const 
int, - const int, const int, - const double* restrict, const int* restrict, - const double* restrict, double* restrict); - -// mapping ********************************************************************* -void rSetSubVector(const int, const int* restrict, - const double* restrict, double* restrict); - -void rMapSubVector(const int, const int* restrict, - const double* restrict, double* restrict); - -void rExtractSubVector(const int ries, const int* restrict, - const double* restrict, double* restrict); - -// kQuadratureData ************************************************************* -void rInitQuadratureData(const int, const int, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -void rUpdateQuadratureData(const double, const double, const double, - const bool, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict, double* restrict); -void rUpdateQuadratureDataS(const double, const double, const double, - const bool, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict, double* restrict); - -// kForce ********************************************************************** -void rForceMult(const int, const int, const int, const int, const int, - const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); -void rForceMultS(const int, const int, const int, const int, const int, - const int, 
const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -void rForceMultTranspose(const int, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); -void rForceMultTransposeS(const int, const int, const int, - const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -// ***************************************************************************** -void rNodeCopyByVDim(const int, const int, const int, const int, - const int* restrict, const double* restrict, - double* restrict); - -// ***************************************************************************** -void rIniGeom(const int, const int, const int, const int, - const double* restrict, const double* restrict, - double* restrict, double* restrict, double* restrict); - -// ***************************************************************************** -void rGlobalToLocal(const int, const bool, const int, const int, - const int* restrict, const int* restrict, - const double* restrict, double* restrict); - -void rLocalToGlobal(const int, const bool, const int, - const int, const int* restrict, const int* restrict, - const double* restrict, double* restrict); - -// ***************************************************************************** -void rMassMultAdd(const int, const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict); - -void rMassMultAddS(const int, const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, 
const double* restrict, - double* restrict); - -#endif // LAGHOS_CUDA_KERNELS diff --git a/cuda/cuda/kernels/include/offsets.hpp b/cuda/cuda/kernels/include/offsets.hpp deleted file mode 100644 index ba03579f..00000000 --- a/cuda/cuda/kernels/include/offsets.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_KERNEL_OFFSETS -#define LAGHOS_CUDA_KERNEL_OFFSETS - -// N-Offsets ******************************************************************* -#define ijN(i,j,N) (i)+(N)*(j) -#define ijkN(i,j,k,N) (i)+(N)*((j)+(N)*(k)) -#define ijklN(i,j,k,l,N) (i)+(N)*((j)+(N)*((k)+(N)*(l))) - -// N,M-Offsets ***************************************************************** -#define ijNMt(i,j,N,M,t) (t)?((i)+(N)*(j)):((j)+(M)*(i)) -#define ijkNM(i,j,k,N,M) (i)+(N)*((j)+(M)*(k)) -#define _ijkNM(i,j,k,N,M) (j)+(N)*((k)+(M)*(i)) -#define ijklNM(i,j,k,l,N,M) (i)+(N)*((j)+(N)*((k)+(M)*(l))) -#define _ijklNM(i,j,k,l,N,M) (j)+(N)*((k)+(N)*((l)+(M)*(i))) -#define ijklmNM(i,j,k,l,m,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*(m)))) -#define _ijklmNM(i,j,k,l,m,N,M) (j)+(N)*((k)+(N)*((l)+(N)*((m)+(M)*(i)))) -#define ijklmnNM(i,j,k,l,m,n,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*((m)+(M)*(n))))) - -#endif // LAGHOS_CUDA_KERNEL_OFFSETS diff --git a/cuda/cuda/kernels/maps/globalToLocal.cpp b/cuda/cuda/kernels/maps/globalToLocal.cpp deleted file mode 100644 index 05708805..00000000 --- a/cuda/cuda/kernels/maps/globalToLocal.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void rGlobalToLocal0(const int globalEntries, - const int NUM_VDIM, - const bool VDIM_ORDERING, - const int localEntries, - const int* __restrict offsets, - const int* __restrict indices, - const double* __restrict globalX, - double* __restrict localX) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < globalEntries) - { - const int offset = offsets[i]; - const int nextOffset = offsets[i+1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - const double dofValue = globalX[g_offset]; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - localX[l_offset] = dofValue; - } - } - } -} - -// ***************************************************************************** -void rGlobalToLocal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* __restrict offsets, - const int* __restrict indices, - const double* __restrict globalX, - double* __restrict localX) -{ - cuKer(rGlobalToLocal,globalEntries,NUM_VDIM,VDIM_ORDERING, - localEntries,offsets,indices,globalX,localX); -} diff --git a/cuda/cuda/kernels/maps/localToGlobal.cpp b/cuda/cuda/kernels/maps/localToGlobal.cpp deleted file mode 100644 index 8433b872..00000000 --- a/cuda/cuda/kernels/maps/localToGlobal.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void rLocalToGlobal0(const int globalEntries, - const int NUM_VDIM, - const bool VDIM_ORDERING, - const int localEntries, - const int* offsets, - const int* indices, - const double* localX, - double* __restrict globalX) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < globalEntries) - { - const int offset = offsets[i]; - const int nextOffset = offsets[i + 1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - double dofValue = 0; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - dofValue += localX[l_offset]; - } - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - globalX[g_offset] = dofValue; - } - } -} - -// ***************************************************************************** -void rLocalToGlobal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* 
offsets, - const int* indices, - const double* localX, - double* __restrict globalX) -{ - cuKer(rLocalToGlobal,globalEntries,NUM_VDIM,VDIM_ORDERING, - localEntries,offsets,indices,localX,globalX); -} diff --git a/cuda/cuda/kernels/maps/mapping.cpp b/cuda/cuda/kernels/maps/mapping.cpp deleted file mode 100644 index 3bf100fd..00000000 --- a/cuda/cuda/kernels/maps/mapping.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void rSetSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { out[indices[i]] = in[i]; } -} - -// ***************************************************************************** -void rSetSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - cuKer(rSetSubVector,N,indices,in,out); -} - -// ***************************************************************************** -extern "C" kernel -void rMapSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) - { - const int fromIdx = indices[2*i + 0]; - const int toIdx = indices[2*i + 1]; - out[toIdx] = in[fromIdx]; - } -} - -// ***************************************************************************** -void rMapSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - cuKer(rMapSubVector,N,indices,in,out); -} - -// ***************************************************************************** -extern "C" kernel -void rExtractSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { out[i] = in[indices[i]]; } -} - -// ***************************************************************************** -void rExtractSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - cuKer(rExtractSubVector,N,indices,in,out); -} diff --git a/cuda/cuda/kernels/mass/assemble.cpp b/cuda/cuda/kernels/mass/assemble.cpp deleted file mode 100644 index b9e4d908..00000000 --- a/cuda/cuda/kernels/mass/assemble.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore 
National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble2D0(const int numElements, - const int NUM_QUAD_2D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - for (int q = 0; q < NUM_QUAD_2D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,2,NUM_QUAD_2D)]; - const double J12 = J[ijklNM(1,0,q,e,2,NUM_QUAD_2D)]; - const double J21 = J[ijklNM(0,1,q,e,2,NUM_QUAD_2D)]; - const double J22 = J[ijklNM(1,1,q,e,2,NUM_QUAD_2D)]; - const double detJ = ((J11 * J22)-(J21 * J12)); - oper[ijN(q,e,NUM_QUAD_2D)] = quadWeights[q] * COEFF * detJ; - } - } -} - -// ***************************************************************************** -static void rMassAssemble2D(const int numElements, - const int NUM_QUAD_2D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - 
cuKer(rMassAssemble2D,numElements,NUM_QUAD_2D,COEFF,quadWeights,J,oper); -} - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble3D0(const int numElements, - const int NUM_QUAD_3D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - for (int q = 0; q < NUM_QUAD_3D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,3,NUM_QUAD_3D)]; - const double J12 = J[ijklNM(1,0,q,e,3,NUM_QUAD_3D)]; - const double J13 = J[ijklNM(2,0,q,e,3,NUM_QUAD_3D)]; - const double J21 = J[ijklNM(0,1,q,e,3,NUM_QUAD_3D)]; - const double J22 = J[ijklNM(1,1,q,e,3,NUM_QUAD_3D)]; - const double J23 = J[ijklNM(2,1,q,e,3,NUM_QUAD_3D)]; - const double J31 = J[ijklNM(0,2,q,e,3,NUM_QUAD_3D)]; - const double J32 = J[ijklNM(1,2,q,e,3,NUM_QUAD_3D)]; - const double J33 = J[ijklNM(2,2,q,e,3,NUM_QUAD_3D)]; - const double detJ = ((J11*J22*J33)+(J12*J23*J31)+ - (J13*J21*J32)-(J13*J22*J31)- - (J12*J21*J33)-(J11*J23*J32)); - oper[ijN(q,e,NUM_QUAD_3D)] = quadWeights[q]*COEFF*detJ; - } - } -} -static void rMassAssemble3D(const int NUM_QUAD_3D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - cuKer(rMassAssemble3D,numElements,NUM_QUAD_3D,COEFF,quadWeights,J,oper); -} - -// ***************************************************************************** -void rMassAssemble(const int dim, - const int NUM_QUAD, - const int numElements, - const double* quadWeights, - const double* J, - const double COEFF, - double* __restrict oper) -{ - assert(false); - if (dim==1) { assert(false); } - if (dim==2) { rMassAssemble2D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } - if (dim==3) { rMassAssemble3D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } -} diff --git a/cuda/cuda/kernels/mass/multAdd.cpp b/cuda/cuda/kernels/mass/multAdd.cpp deleted file mode 
100644 index 07c53d53..00000000 --- a/cuda/cuda/kernels/mass/multAdd.cpp +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rMassMultAdd2D(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0.0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - sol_x[qy] = 0.0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijkN(dx,dy,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]* s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += d2q * sol_x[qx]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] *= oper[ijkN(qx,qy,e,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xy[qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double q2d = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijkN(dx,dy,e,NUM_DOFS_1D)] += q2d * sol_x[dx]; - } - } - } - } -} - -// 
***************************************************************************** -template kernel -void rMassMultAdd3D(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double sol_xyz[NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] = 0; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] = 0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += wy * sol_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] *= oper[ijklN(qx,qy,qz,e,NUM_QUAD_1D)]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double sol_xy[NUM_DOFS_1D][NUM_DOFS_1D]; - for (int dy = 0; dy < 
NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xyz[qz][qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] += wy * sol_x[dx]; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double wz = quadToDof[ijN(dz,qz,NUM_DOFS_1D)]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijklN(dx,dy,dz,e,NUM_DOFS_1D)] += wz * sol_xy[dy][dx]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fMassMultAdd)(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut); - -// ***************************************************************************** -void rMassMultAdd(const int DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* op, - const double* x, - double* __restrict y) -{ - const int blck = 256; - const int grid = (numElements+blck-1)/blck; - assert(LOG2(DIM)<=4); - assert((NUM_QUAD_1D&1)==0); - assert(LOG2(NUM_DOFS_1D-1)<=8); - assert(LOG2(NUM_QUAD_1D>>1)<=8); - const unsigned int id = (DIM<<16)|((NUM_DOFS_1D-1)<<8)|(NUM_QUAD_1D>>1); - static std::unordered_map call = - { - // 2D - {0x20001,&rMassMultAdd2D<1,2>}, {0x20101,&rMassMultAdd2D<2,2>}, - 
{0x20102,&rMassMultAdd2D<2,4>}, {0x20202,&rMassMultAdd2D<3,4>}, - {0x20203,&rMassMultAdd2D<3,6>}, {0x20303,&rMassMultAdd2D<4,6>}, - {0x20304,&rMassMultAdd2D<4,8>}, {0x20404,&rMassMultAdd2D<5,8>}, - {0x20405,&rMassMultAdd2D<5,10>}, {0x20505,&rMassMultAdd2D<6,10>}, - {0x20506,&rMassMultAdd2D<6,12>}, {0x20606,&rMassMultAdd2D<7,12>}, - {0x20607,&rMassMultAdd2D<7,14>}, {0x20707,&rMassMultAdd2D<8,14>}, - {0x20708,&rMassMultAdd2D<8,16>}, {0x20808,&rMassMultAdd2D<9,16>}, - {0x20809,&rMassMultAdd2D<9,18>}, {0x20909,&rMassMultAdd2D<10,18>}, - {0x2090A,&rMassMultAdd2D<10,20>}, {0x20A0A,&rMassMultAdd2D<11,20>}, - {0x20A0B,&rMassMultAdd2D<11,22>}, {0x20B0B,&rMassMultAdd2D<12,22>}, - {0x20B0C,&rMassMultAdd2D<12,24>}, {0x20C0C,&rMassMultAdd2D<13,24>}, - {0x20C0D,&rMassMultAdd2D<13,26>}, {0x20D0D,&rMassMultAdd2D<14,26>}, - {0x20D0E,&rMassMultAdd2D<14,28>}, {0x20E0E,&rMassMultAdd2D<15,28>}, - {0x20E0F,&rMassMultAdd2D<15,30>}, {0x20F0F,&rMassMultAdd2D<16,30>}, - {0x20F10,&rMassMultAdd2D<16,32>}, {0x21010,&rMassMultAdd2D<17,32>}, - // 3D - {0x30001,&rMassMultAdd3D<1,2>}, {0x30101,&rMassMultAdd3D<2,2>}, - {0x30102,&rMassMultAdd3D<2,4>}, {0x30202,&rMassMultAdd3D<3,4>}, - {0x30203,&rMassMultAdd3D<3,6>}, {0x30303,&rMassMultAdd3D<4,6>}, - {0x30304,&rMassMultAdd3D<4,8>}, {0x30404,&rMassMultAdd3D<5,8>}, - {0x30405,&rMassMultAdd3D<5,10>}, {0x30505,&rMassMultAdd3D<6,10>}, - {0x30506,&rMassMultAdd3D<6,12>}, {0x30606,&rMassMultAdd3D<7,12>}, - {0x30607,&rMassMultAdd3D<7,14>}, {0x30707,&rMassMultAdd3D<8,14>}, - {0x30708,&rMassMultAdd3D<8,16>}, {0x30808,&rMassMultAdd3D<9,16>}, - {0x30809,&rMassMultAdd3D<9,18>}, {0x30909,&rMassMultAdd3D<10,18>}, - {0x3090A,&rMassMultAdd3D<10,20>}, {0x30A0A,&rMassMultAdd3D<11,20>}, - {0x30A0B,&rMassMultAdd3D<11,22>}, {0x30B0B,&rMassMultAdd3D<12,22>}, - {0x30B0C,&rMassMultAdd3D<12,24>}, {0x30C0C,&rMassMultAdd3D<13,24>}, - {0x30C0D,&rMassMultAdd3D<13,26>}, {0x30D0D,&rMassMultAdd3D<14,26>}, - {0x30D0E,&rMassMultAdd3D<14,28>}, {0x30E0E,&rMassMultAdd3D<15,28>}, - 
{0x30E0F,&rMassMultAdd3D<15,30>}, {0x30F0F,&rMassMultAdd3D<16,30>}, - {0x30F10,&rMassMultAdd3D<16,32>}, {0x31010,&rMassMultAdd3D<17,32>}, - }; - if (!call[id]) - { - printf("\n[rMassMultAdd] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,dofToQuadD,quadToDof,quadToDofD,op,x,y); -} diff --git a/cuda/cuda/kernels/quad/gridFuncToQuad.cpp b/cuda/cuda/kernels/quad/gridFuncToQuad.cpp deleted file mode 100644 index baf3f34e..00000000 --- a/cuda/cuda/kernels/quad/gridFuncToQuad.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rGridFuncToQuad1D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double r_out[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[(dx) + (NUM_DOFS_1D) * (e)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid * NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] += r_gf * dofToQuad[(qx) + (NUM_QUAD_1D) * (dx)]; - } - } - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[(qx) + (NUM_QUAD_1D) * ((e) + (numElements) * (v))] = r_out[v][qx]; - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad2D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijkN(dx, dy, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qy 
= 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] += r_gf * dofToQuad[ijN(qy, dx,NUM_QUAD_1D)]; - } - } - } - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += d2q * out_x[v][qx]; - } - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklNM(v, qx, qy, e,NUM_QUAD_1D,numElements)] = out_xy[v][qy][qx]; - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad3D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double out_xyz[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] = 0; - } - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijklN(dx, dy, dz, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] += r_gf * dofToQuad[ijN(qx, dx, 
NUM_QUAD_1D)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy, dy, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += wy * out_x[v][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz, dz, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] += wz * out_xy[v][qy][qx]; - } - } - } - } - } - - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklmNM(v, qx, qy, qz, e,NUM_QUAD_1D, - numElements)] = out_xyz[v][qz][qy][qx]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fGridFuncToQuad)(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* gf, - double* restrict out); - -// ***************************************************************************** -void rGridFuncToQuad(const int DIM, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const int* l2gMap, - const double* gf, - double* __restrict out) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int id = (DIM<<8)|(NUM_VDIM<<4)|(NUM_DOFS_1D-1); - assert(LOG2(DIM)<=4); - assert(LOG2(NUM_VDIM)<=4); - assert(LOG2(NUM_DOFS_1D-1)<=4); - assert(NUM_QUAD_1D==2*NUM_DOFS_1D); - if (NUM_QUAD_1D!=2*NUM_DOFS_1D) - { - return exit( - printf("\033[31;1m[rGridFuncToQuad] order ERROR: -ok=p -ot=p-1, p in [1,16]\033[m\n")); - } - static std::unordered_map call = - { - // 2D - {0x210,&rGridFuncToQuad2D<1,1,2>}, - {0x211,&rGridFuncToQuad2D<1,2,4>}, - 
{0x212,&rGridFuncToQuad2D<1,3,6>}, - {0x213,&rGridFuncToQuad2D<1,4,8>}, - {0x214,&rGridFuncToQuad2D<1,5,10>}, - {0x215,&rGridFuncToQuad2D<1,6,12>}, - {0x216,&rGridFuncToQuad2D<1,7,14>}, - {0x217,&rGridFuncToQuad2D<1,8,16>}, - {0x218,&rGridFuncToQuad2D<1,9,18>}, - {0x219,&rGridFuncToQuad2D<1,10,20>}, - {0x21A,&rGridFuncToQuad2D<1,11,22>}, - {0x21B,&rGridFuncToQuad2D<1,12,24>}, - {0x21C,&rGridFuncToQuad2D<1,13,26>}, - {0x21D,&rGridFuncToQuad2D<1,14,28>}, - {0x21E,&rGridFuncToQuad2D<1,15,30>}, - {0x21F,&rGridFuncToQuad2D<1,16,32>}, - - // 3D - {0x310,&rGridFuncToQuad3D<1,1,2>}, - {0x311,&rGridFuncToQuad3D<1,2,4>}, - {0x312,&rGridFuncToQuad3D<1,3,6>}, - {0x313,&rGridFuncToQuad3D<1,4,8>}, - {0x314,&rGridFuncToQuad3D<1,5,10>}, - {0x315,&rGridFuncToQuad3D<1,6,12>}, - {0x316,&rGridFuncToQuad3D<1,7,14>}, - {0x317,&rGridFuncToQuad3D<1,8,16>}, - {0x318,&rGridFuncToQuad3D<1,9,18>}, - {0x319,&rGridFuncToQuad3D<1,10,20>}, - {0x31A,&rGridFuncToQuad3D<1,11,22>}, - {0x31B,&rGridFuncToQuad3D<1,12,24>}, - {0x31C,&rGridFuncToQuad3D<1,13,26>}, - {0x31D,&rGridFuncToQuad3D<1,14,28>}, - {0x31E,&rGridFuncToQuad3D<1,15,30>}, - {0x31F,&rGridFuncToQuad3D<1,16,32>}, - }; - if (!call[id]) - { - printf("\n[rGridFuncToQuad] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,l2gMap,gf,out); -} diff --git a/cuda/cuda/kernels/quad/qDataInit.cpp b/cuda/cuda/kernels/quad/qDataInit.cpp deleted file mode 100644 index c2e5345f..00000000 --- a/cuda/cuda/kernels/quad/qDataInit.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - - -// ***************************************************************************** -template kernel -void rInitQuadData(const int nzones, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < nzones) - { - for (int q = 0; q < NUM_QUAD; ++q) - { - rho0DetJ0w[ijN(q,el,NUM_QUAD)] = - rho0[ijN(q,el,NUM_QUAD)]*detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - } - } -} - -// ***************************************************************************** -typedef void (*fInitQuadratureData)(const int,const double*,const double*, - const double*,double*); -void rInitQuadratureData(const int NUM_QUAD, - const int numElements, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int id = NUM_QUAD; - static std::unordered_map call = - { - {2,&rInitQuadData<2>}, - {4,&rInitQuadData<4>}, - {8,&rInitQuadData<8>}, - {16,&rInitQuadData<16>}, - {25,&rInitQuadData<25>}, - {36,&rInitQuadData<36>}, - {49,&rInitQuadData<49>}, - {64,&rInitQuadData<64>}, - {81,&rInitQuadData<81>}, - {100,&rInitQuadData<100>}, - {121,&rInitQuadData<121>}, - {125,&rInitQuadData<125>}, - {144,&rInitQuadData<144>}, - 
{196,&rInitQuadData<196>}, - {216,&rInitQuadData<216>}, - {256,&rInitQuadData<256>}, - {324,&rInitQuadData<324>}, - {400,&rInitQuadData<400>}, - {484,&rInitQuadData<484>}, - {512,&rInitQuadData<512>}, - {576,&rInitQuadData<576>}, - {676,&rInitQuadData<676>}, - {900,&rInitQuadData<900>}, - {1000,&rInitQuadData<1000>}, - {1024,&rInitQuadData<1024>}, - {1728,&rInitQuadData<1728>}, - {2744,&rInitQuadData<2744>}, - {4096,&rInitQuadData<4096>}, - {5832,&rInitQuadData<5832>}, - {8000,&rInitQuadData<8000>}, - {10648,&rInitQuadData<10648>}, - {13824,&rInitQuadData<13824>}, - {17576,&rInitQuadData<17576>}, - {21952,&rInitQuadData<21952>}, - {27000,&rInitQuadData<27000>}, - {32768,&rInitQuadData<32768>}, - }; - if (!call[id]) - { - printf("\n[rInitQuadratureData] id \033[33m0x%X (%d)\033[m ",id,id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,rho0,detJ,quadWeights,rho0DetJ0w); -} diff --git a/cuda/cuda/kernels/quad/qDataUpdate.cpp b/cuda/cuda/kernels/quad/qDataUpdate.cpp deleted file mode 100644 index 7de26537..00000000 --- a/cuda/cuda/kernels/quad/qDataUpdate.cpp +++ /dev/null @@ -1,658 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData2D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - const int DIM = 2; - const int VDIMQ = DIM*DIM * NUM_QUAD_2D; - double s_gradv[VDIMQ]; - - for (int i = 0; i < VDIMQ; ++i) { s_gradv[i] = 0.0; } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[DIM*NUM_QUAD_1D]; - double vx[DIM*NUM_QUAD_1D]; - - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < DIM; ++c) - { - vDx[ijN(c,qx,DIM)] = 0.0; - vx[ijN(c,qx,DIM)] = 0.0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - const double wDx = dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - for (int c = 0; c < DIM; ++c) - { - vDx[ijN(c,qx,DIM)] += wDx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - vx[ijN(c,qx,DIM)] += wx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; 
++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < DIM; ++c) - { - s_gradv[ijkN(c,0,qx+qy*NUM_QUAD_1D,DIM)] += wy *vDx[ijN(c,qx,DIM)]; - s_gradv[ijkN(c,1,qx+qy*NUM_QUAD_1D,DIM)] += wDy*vx[ijN(c,qx,DIM)]; - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) - { - double q_gradv[NUM_DIM*NUM_DIM]; - double q_stress[NUM_DIM*NUM_DIM]; - - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_00)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_01)); - q_gradv[ijN(1,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_10)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_11)); - q_gradv[ijN(0,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_00)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_01)); - q_gradv[ijN(1,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_10)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_11)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - - // TODO: Input OccaVector eos(q,e) -> (stress,soundSpeed) - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,2)] = s; q_stress[ijN(1,0,2)] = 0; - q_stress[ijN(0,1,2)] = 0; q_stress[ijN(1,1,2)] = s; - - const double gradv00 = q_gradv[ijN(0,0,2)]; - const double gradv11 = q_gradv[ijN(1,1,2)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,2)]+q_gradv[ijN(0,1,2)]); - q_gradv[ijN(1,0,2)] = gradv10; - q_gradv[ijN(0,1,2)] = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) - { - minEig = (gradv00 < gradv11) ? 
gradv00 : gradv11; - } - else - { - const double zeta = (gradv11-gradv00) / (2.0*gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta+sqrt(1.0+zeta*zeta)); - if ((t < 0) != (zeta < 0)) - { - t = -t; - } - const double c = sqrt(1.0 / (1.0+t*t)); - const double s = c*t; - t *= gradv10; - if ((gradv00-t) <= (gradv11+t)) - { - minEig = gradv00-t; - comprDirX = c; - comprDirY = -s; - } - else - { - minEig = gradv11+t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)); - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)); - const double physDirX = (Jpi_00*comprDirX)+(Jpi_10*comprDirY); - const double physDirY = (Jpi_01*comprDirX)+(Jpi_11*comprDirY); - const double q_h = H0*sqrt((physDirX*physDirX)+(physDirY*physDirY)); - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) - { - for (int x = 0; x < NUM_DIM; ++x) - { - q_stress[ijN(x,y,2)] += 
coeff*q_gradv[ijN(x,y,2)]; - } - } - } - const double S00 = q_stress[ijN(0,0,2)]; - const double S10 = q_stress[ijN(1,0,2)]; - const double S01 = q_stress[ijN(0,1,2)]; - const double S11 = q_stress[ijN(1,1,2)]; - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)); - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)); - } - } -} - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData3D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double s_gradv[9*NUM_QUAD_3D]; - - for (int i = 0; i < (9*NUM_QUAD_3D); ++i) - { - s_gradv[i] = 0; - } - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double vDxy[3*NUM_QUAD_2D] ; - double vxDy[3*NUM_QUAD_2D] ; - double vxy[3*NUM_QUAD_2D] ; - for (int i = 0; i < (3*NUM_QUAD_2D); ++i) - { - vDxy[i] = 0; - vxDy[i] = 0; - vxy[i] = 0; - } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[3*NUM_QUAD_1D] ; - double vx[3*NUM_QUAD_1D] ; - for (int i = 0; i < (3*NUM_QUAD_1D); ++i) - { - vDx[i] = 0; - vx[i] = 0; - } - - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - for (int 
qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - vx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vDx[ijN(vi,qx,3)]; - vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wDy*vx[ijN(vi,qx,3)]; - vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vx[ijN(vi,qx,3)]; - } - } - } - } - for (int qz = 0; qz < NUM_DOFS_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = dofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const int q = qx+qy*NUM_QUAD_1D+qz*NUM_QUAD_2D; - for (int vi = 0; vi < 3; ++vi) - { - s_gradv[ijkN(vi,0,q,3)] += wz *vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,1,q,3)] += wz *vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,2,q,3)] += wDz*vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - } - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) - { - double q_gradv[9] ; - double q_stress[9] ; - - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_20 = invJ[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_21 = invJ[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_02 = invJ[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_12 = invJ[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_22 = 
invJ[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_00) + - (s_gradv[ijkN(1,0,q,3)]*invJ_01) + - (s_gradv[ijkN(2,0,q,3)]*invJ_02)); - q_gradv[ijN(1,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_10) + - (s_gradv[ijkN(1,0,q,3)]*invJ_11) + - (s_gradv[ijkN(2,0,q,3)]*invJ_12)); - q_gradv[ijN(2,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_20) + - (s_gradv[ijkN(1,0,q,3)]*invJ_21) + - (s_gradv[ijkN(2,0,q,3)]*invJ_22)); - - q_gradv[ijN(0,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_00) + - (s_gradv[ijkN(1,1,q,3)]*invJ_01) + - (s_gradv[ijkN(2,1,q,3)]*invJ_02)); - q_gradv[ijN(1,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_10) + - (s_gradv[ijkN(1,1,q,3)]*invJ_11) + - (s_gradv[ijkN(2,1,q,3)]*invJ_12)); - q_gradv[ijN(2,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_20) + - (s_gradv[ijkN(1,1,q,3)]*invJ_21) + - (s_gradv[ijkN(2,1,q,3)]*invJ_22)); - - q_gradv[ijN(0,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_00) + - (s_gradv[ijkN(1,2,q,3)]*invJ_01) + - (s_gradv[ijkN(2,2,q,3)]*invJ_02)); - q_gradv[ijN(1,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_10) + - (s_gradv[ijkN(1,2,q,3)]*invJ_11) + - (s_gradv[ijkN(2,2,q,3)]*invJ_12)); - q_gradv[ijN(2,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_20) + - (s_gradv[ijkN(1,2,q,3)]*invJ_21) + - (s_gradv[ijkN(2,2,q,3)]*invJ_22)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,3)] = s; q_stress[ijN(1,0,3)] = 0; q_stress[ijN(2,0,3)] = 0; - q_stress[ijN(0,1,3)] = 0; q_stress[ijN(1,1,3)] = s; q_stress[ijN(2,1,3)] = 0; - q_stress[ijN(0,2,3)] = 0; q_stress[ijN(1,2,3)] = 0; q_stress[ijN(2,2,3)] = s; - - const double gradv00 = q_gradv[ijN(0,0,3)]; - const double gradv11 = q_gradv[ijN(1,1,3)]; - const double gradv22 = q_gradv[ijN(2,2,3)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,3)]+q_gradv[ijN(0,1,3)]); - const double gradv20 = 
0.5*(q_gradv[ijN(2,0,3)]+q_gradv[ijN(0,2,3)]); - const double gradv21 = 0.5*(q_gradv[ijN(2,1,3)]+q_gradv[ijN(1,2,3)]); - q_gradv[ijN(1,0,3)] = gradv10; q_gradv[ijN(2,0,3)] = gradv20; - q_gradv[ijN(0,1,3)] = gradv10; q_gradv[ijN(2,1,3)] = gradv21; - q_gradv[ijN(0,2,3)] = gradv20; q_gradv[ijN(1,2,3)] = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00+gradv11+gradv22) / 3.0; - const double gradv_q00 = (gradv00-q_); - const double gradv_q11 = (gradv11-q_); - const double gradv_q22 = (gradv22-q_); - - const double p1 = ((gradv10*gradv10) + - (gradv20*gradv20) + - (gradv21*gradv21)); - const double p2 = ((gradv_q00*gradv_q00) + - (gradv_q11*gradv_q11) + - (gradv_q22*gradv_q22) + - (2.0*p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv*(gradv-q*I)) - const double r = (0.5*pinv*pinv*pinv * - ((gradv_q00*gradv_q11*gradv_q22) + - (2.0*gradv10*gradv21*gradv20) - - (gradv_q11*gradv20*gradv20) - - (gradv_q22*gradv10*gradv10) - - (gradv_q00*gradv21*gradv21))); - - double phi = 0; - if (r <= -1.0) - { - phi = M_PI / 3.0; - } - else if (r < 1.0) - { - phi = acos(r) / 3.0; - } - - minEig = q_+(2.0*p*cos(phi+(2.0*M_PI / 3.0))); - const double eig3 = q_+(2.0*p*cos(phi)); - const double eig2 = 3.0*q_-minEig-eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) - { - const double x = q_gradv[i+3*0]-(i == 0)*eig3; - const double y = q_gradv[i+3*1]-(i == 1)*eig3; - const double z = q_gradv[i+3*2]-(i == 2)*eig3; - const double cx = ((x*(gradv00-eig2)) + - (y*gradv10) + - (z*gradv20)); - const double cy = ((x*gradv10) + - (y*(gradv11-eig2)) + - (z*gradv21)); - const double cz = ((x*gradv20) + - (y*gradv21) + - (z*(gradv22-eig2))); - const double cNorm = (cx*cx+cy*cy+cz*cz); - //#warning 1e-16 to 1 - if ((cNorm > 1.e-16) && (maxNorm < cNorm)) - { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = 
cNorm; - } - } - //#warning 1e-16 to 1 - if (maxNorm > 1.e-16) - { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_20 = J[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_21 = J[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_02 = J[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_12 = J[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_22 = J[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_20 = invJ0[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_21 = invJ0[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_02 = invJ0[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_12 = invJ0[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_22 = invJ0[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)+(J_20*invJ0_02)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)+(J_20*invJ0_12)); - const double Jpi_20 = ((J_00*invJ0_20)+(J_10*invJ0_21)+(J_20*invJ0_22)); - - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)+(J_21*invJ0_02)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)+(J_21*invJ0_12)); - const double Jpi_21 = ((J_01*invJ0_20)+(J_11*invJ0_21)+(J_21*invJ0_22)); - - const double Jpi_02 = ((J_02*invJ0_00)+(J_12*invJ0_01)+(J_22*invJ0_02)); - const double Jpi_12 = 
((J_02*invJ0_10)+(J_12*invJ0_11)+(J_22*invJ0_12)); - const double Jpi_22 = ((J_02*invJ0_20)+(J_12*invJ0_21)+(J_22*invJ0_22)); - - const double physDirX = ((Jpi_00*comprDirX)+(Jpi_10*comprDirY)+ - (Jpi_20*comprDirZ)); - const double physDirY = ((Jpi_01*comprDirX)+(Jpi_11*comprDirY)+ - (Jpi_21*comprDirZ)); - const double physDirZ = ((Jpi_02*comprDirX)+(Jpi_12*comprDirY)+ - (Jpi_22*comprDirZ)); - - const double q_h = H0*sqrt((physDirX*physDirX)+ - (physDirY*physDirY)+ - (physDirZ*physDirZ)); - - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - q_stress[ijN(x,y,3)] += coeff*q_gradv[ijN(x,y,3)]; - } - } - } - - const double S00 = q_stress[ijN(0,0,3)]; - const double S10 = q_stress[ijN(1,0,3)]; - const double S20 = q_stress[ijN(2,0,3)]; - const double S01 = q_stress[ijN(0,1,3)]; - const double S11 = q_stress[ijN(1,1,3)]; - const double S21 = q_stress[ijN(2,1,3)]; - const double S02 = q_stress[ijN(0,2,3)]; - const double S12 = q_stress[ijN(1,2,3)]; - const double S22 = q_stress[ijN(2,2,3)]; - - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)+(S20*invJ_02)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)+(S20*invJ_12)); - stressJinvT[ijklNM(2,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_20)+(S10*invJ_21)+(S20*invJ_22)); - - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)+(S21*invJ_02)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)+(S21*invJ_12)); - stressJinvT[ijklNM(2,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_20)+(S11*invJ_21)+(S21*invJ_22)); - - 
stressJinvT[ijklNM(0,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_00)+(S12*invJ_01)+(S22*invJ_02)); - stressJinvT[ijklNM(1,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_10)+(S12*invJ_11)+(S22*invJ_12)); - stressJinvT[ijklNM(2,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_20)+(S12*invJ_21)+(S22*invJ_22)); - } - } -} - -// ***************************************************************************** -typedef void (*fUpdateQuadratureData)(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// ***************************************************************************** -void rUpdateQuadratureData(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int nzones, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int blck = CUDA_BLOCK_SIZE; - const int grid = (nzones+blck-1)/blck; - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(IROOT(NUM_DIM,NUM_QUAD)==NUM_QUAD_1D); - const unsigned int id = (NUM_DIM<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - 
{0x20,&rUpdateQuadratureData2D<2,2*2,2,2>}, - {0x21,&rUpdateQuadratureData2D<2,4*4,4,3>}, - {0x22,&rUpdateQuadratureData2D<2,6*6,6,4>}, - {0x23,&rUpdateQuadratureData2D<2,8*8,8,5>}, - {0x24,&rUpdateQuadratureData2D<2,10*10,10,6>}, - {0x25,&rUpdateQuadratureData2D<2,12*12,12,7>}, - {0x26,&rUpdateQuadratureData2D<2,14*14,14,8>}, - {0x27,&rUpdateQuadratureData2D<2,16*16,16,9>}, - {0x28,&rUpdateQuadratureData2D<2,18*18,18,10>}, - {0x29,&rUpdateQuadratureData2D<2,20*20,20,11>}, - {0x2A,&rUpdateQuadratureData2D<2,22*22,22,12>}, - {0x2B,&rUpdateQuadratureData2D<2,24*24,24,13>}, - {0x2C,&rUpdateQuadratureData2D<2,26*26,26,14>}, - {0x2D,&rUpdateQuadratureData2D<2,28*28,28,15>}, - {0x2E,&rUpdateQuadratureData2D<2,30*30,30,16>}, - {0x2F,&rUpdateQuadratureData2D<2,32*32,32,17>}, - // 3D - {0x30,&rUpdateQuadratureData3D<3,2*2*2,2,2>}, - {0x31,&rUpdateQuadratureData3D<3,4*4*4,4,3>}, - {0x32,&rUpdateQuadratureData3D<3,6*6*6,6,4>}, - {0x33,&rUpdateQuadratureData3D<3,8*8*8,8,5>}, - {0x34,&rUpdateQuadratureData3D<3,10*10*10,10,6>}, - {0x35,&rUpdateQuadratureData3D<3,12*12*12,12,7>}, - {0x36,&rUpdateQuadratureData3D<3,14*14*14,14,8>}, - {0x37,&rUpdateQuadratureData3D<3,16*16*16,16,9>}, - {0x38,&rUpdateQuadratureData3D<3,18*18*18,18,10>}, - {0x39,&rUpdateQuadratureData3D<3,20*20*20,20,11>}, - {0x3A,&rUpdateQuadratureData3D<3,22*22*22,22,12>}, - {0x3B,&rUpdateQuadratureData3D<3,24*24*24,24,13>}, - {0x3C,&rUpdateQuadratureData3D<3,26*26*26,26,14>}, - {0x3D,&rUpdateQuadratureData3D<3,28*28*28,28,15>}, - {0x3E,&rUpdateQuadratureData3D<3,30*30*30,30,16>}, - {0x3F,&rUpdateQuadratureData3D<3,32*32*32,32,17>}, - }; - if (!call[id]) - { - printf("\n[rUpdateQuadratureData] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - GAMMA,H0,CFL,USE_VISCOSITY, - nzones,dofToQuad,dofToQuadD,quadWeights, - v,e,rho0DetJ0w,invJ0,J,invJ,detJ, - stressJinvT,dtEst); -} diff --git a/cuda/cuda/kernels/share/forceS.cpp b/cuda/cuda/kernels/share/forceS.cpp deleted file 
mode 100644 index a1c09b7f..00000000 --- a/cuda/cuda/kernels/share/forceS.cpp +++ /dev/null @@ -1,839 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rForceMult2S(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int MAX_DOFS_1D = (L2_DOFS_1D > H1_DOFS_1D)?L2_DOFS_1D:H1_DOFS_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xy[MAX_DOFS_1D * NUM_QUAD_1D]; - share double s_xDy[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_e[NUM_QUAD_2D]; - - const int idBlock = threadIdx.x; - { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_L2DofToQuad[id] = L2DofToQuad[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_H1QuadToDof[id] = H1QuadToDof[id]; - s_H1QuadToDofD[id] = H1QuadToDofD[id]; - } - } - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int dx = threadIdx.x; - { - if (dx < L2_DOFS_1D) - { - double r_x[L2_DOFS_1D]; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - r_x[dy] = e[ijkN(dx,dy,el,L2_DOFS_1D)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - xy += r_x[dy]*s_L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx,qy,MAX_DOFS_1D)] = xy; - } - } - } - sync; - const int qy = threadIdx.x; - 
{ - if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double r_e = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - r_e += s_xy[ijN(dx,qy,MAX_DOFS_1D)]*s_L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - s_e[ijN(qx,qy,NUM_QUAD_1D)] = r_e; - } - } - } - - for (int c = 0; c < NUM_DIM; ++c) - { - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - double r_x[NUM_QUAD_1D]; - double r_y[NUM_QUAD_1D]; - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double r_e = s_e[(qx) + (NUM_QUAD_1D) * (qy)]; - r_x[qy] = r_e * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - r_y[qy] = r_e * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double xy = 0; - double xDy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - xy += r_x[qy] * s_H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - xDy += r_y[qy] * s_H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - } - s_xy[ijN(dy,qx,MAX_DOFS_1D)] = xy; - s_xDy[ijN(dy,qx,H1_DOFS_1D)] = xDy; - } - } - } - sync; - const int dx = threadIdx.x; - { - if (dx < H1_DOFS_1D) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double r_v = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_v += ((s_xy[ijN(dy,qx,MAX_DOFS_1D)] * s_H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]) + - (s_xDy[ijN(dy,qx,H1_DOFS_1D)] * s_H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)])); - } - v[ijklNM(dx,dy,el,c,NUM_DOFS_1D,numElements)] = r_v; - } - } - } - } - } - } - } -} - - -// ***************************************************************************** -template kernel -void rForceMultTranspose2S(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD = NUM_QUAD_2D; - const int MAX_DOFS_1D = (L2_DOFS_1D > H1_DOFS_1D)?L2_DOFS_1D:H1_DOFS_1D; - const int H1_MAX_1D = 
(H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2QuadToDof[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xy[MAX_DOFS_1D * NUM_QUAD_1D]; - share double s_xDy[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_v[NUM_QUAD_1D * NUM_QUAD_1D]; - - const int idBlock = 0 + threadIdx.x; - { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_L2QuadToDof[id] = L2QuadToDof[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_H1DofToQuad[id] = H1DofToQuad[id]; - s_H1DofToQuadD[id] = H1DofToQuadD[id]; - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int qBlock = threadIdx.x; - { - for (int q = qBlock; q < NUM_QUAD; ++q) - { - s_v[q] = 0; - } - } - for (int c = 0; c < NUM_DIM; ++c) - { - sync; - const int dx = threadIdx.x; - { - if (dx < H1_DOFS_1D) - { - double r_v[H1_DOFS_1D]; - - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - r_v[dy] = v[ijklNM(dx,dy,el,c,H1_DOFS_1D,numElements)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - double xDy = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - xy += r_v[dy] * s_H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - xDy += r_v[dy] * s_H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - } - s_xy[ijN(qy,dx,NUM_QUAD_1D)] = xy; - s_xDy[ijN(qy,dx,NUM_QUAD_1D)] = xDy; - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dxy = 0; - double xDy = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy += (s_xy[ijN(qy,dx,NUM_QUAD_1D)] * 
s_H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]); - xDy += (s_xDy[ijN(qy,dx,NUM_QUAD_1D)] * s_H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]); - } - s_v[ijN(qx,qy,NUM_QUAD_1D)] += ((Dxy * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)]) + - (xDy * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)])); - } - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - double r_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - r_x[qy] = s_v[ijN(qx,qy,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double xy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - xy += r_x[qy] * s_L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - } - s_xy[ijN(qx,dy,NUM_QUAD_1D)] = xy; - } - } - } - sync; - const int dy = threadIdx.x; - { - if (dy < L2_DOFS_1D) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - double r_e = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_e += s_xy[ijN(qx,dy,NUM_QUAD_1D)] * s_L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - e[ijkN(dx,dy,el,L2_DOFS_1D)] = r_e; - } - } - } - } - } - } -} - - -// ***************************************************************************** -typedef void (*fForceMult2S)(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - -// ***************************************************************************** -template kernel -void rForceMult3S(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > 
L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int INNER_SIZE_2D = (INNER_SIZE * INNER_SIZE); - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_Dxyz[INNER_SIZE_2D]; - share double s_xDyz[NUM_QUAD_2D]; - share double s_xyDz[NUM_QUAD_2D]; - - double r_z[NUM_QUAD_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_L2DofToQuad[i] = L2DofToQuad[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_H1QuadToDof[i] = H1QuadToDof[i]; - s_H1QuadToDofD[i] = H1QuadToDofD[i]; - } - } - } - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - // Calculate D -> Q in the Z axis - const double r_e0 = e[ijklN(dx,dy,0,el,L2_DOFS_1D)]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] = r_e0 * s_L2DofToQuad[ijN(qz, 0,NUM_QUAD_1D)]; - } - - for (int dz = 1; dz < L2_DOFS_1D; ++dz) - { - const double r_e = e[ijklN(dx,dy,dz,el,L2_DOFS_1D)]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] += r_e * s_L2DofToQuad[ijN(qz, dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - sync; - // Fill xy plane at given z position - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - s_Dxyz[ijN(dx, dy,INNER_SIZE)] = r_z[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < 
NUM_QUAD_1D)) - { - double q_e = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double q_ex = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - q_ex += s_Dxyz[ijN(dx, dy,INNER_SIZE)] * s_L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - q_e += q_ex * s_L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - } - r_z[qz] = q_e; - } - } - } - } - for (int c = 0; c < NUM_DIM; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - // Fill xy plane at given z position - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double r_Dxyz = 0; - double r_xDyz = 0; - double r_xyDz = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double r_e = r_z[qz]; - const double wz = s_H1QuadToDof[ijN(dz, qz,H1_DOFS_1D)]; - const double wDz = s_H1QuadToDofD[ijN(dz, qz,H1_DOFS_1D)]; - r_Dxyz += r_e * wz * stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - r_xDyz += r_e * wz * stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - r_xyDz += r_e * wDz * stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - } - s_Dxyz[ijN(qx,qy,INNER_SIZE)] = r_Dxyz; - s_xDyz[ijN(qx,qy,NUM_QUAD_1D)] = r_xDyz; - s_xyDz[ijN(qx,qy,NUM_QUAD_1D)] = r_xyDz; - } - } - } - // Finalize solution in xy plane - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - double r_v = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = s_H1QuadToDof[ijN(dy, qy,H1_DOFS_1D)]; - const double wDy = s_H1QuadToDofD[ijN(dy, qy,H1_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = s_H1QuadToDof[ijN(dx, qx,H1_DOFS_1D)]; - const double wDx = s_H1QuadToDofD[ijN(dx, qx,H1_DOFS_1D)]; - r_v += ((wDx * wy * s_Dxyz[ijN(qx, qy,INNER_SIZE)]) + - (wx * wDy * s_xDyz[ijN(qx, qy,NUM_QUAD_1D)]) + - (wx * wy * s_xyDz[ijN(qx, qy,NUM_QUAD_1D)])); - } - } - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] = r_v; - } - } - } - } - } - } 
- } - } -} - -// ***************************************************************************** -template kernel -void rForceMultTranspose3S(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2QuadToDof[L2_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xyz[NUM_QUAD_2D * NUM_DIM]; - share double s_xyDz[NUM_QUAD_2D * NUM_DIM]; - share double s_v[NUM_QUAD_2D]; - - double r_xyz[NUM_QUAD_1D*NUM_DIM]; - double r_xyDz[NUM_QUAD_1D*NUM_DIM]; - const int y = threadIdx.y; - { - const int x = threadIdx.x; - { - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_L2QuadToDof[i] = L2QuadToDof[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_H1DofToQuad[i] = H1DofToQuad[i]; - s_H1DofToQuadD[i] = H1DofToQuadD[i]; - } - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - double r_v[NUM_DIM][H1_DOFS_1D]; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - for (int c = 0; c < NUM_DIM; ++c) - { - r_v[c][dz] = v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int 
c = 0; c < NUM_DIM; ++c) - { - double xyz = 0; - double xyDz = 0; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - xyz += r_v[c][dz] * s_H1DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - xyDz += r_v[c][dz] * s_H1DofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - } - r_xyz[ijN(c,qz,NUM_DIM)] = xyz; - r_xyDz[ijN(c,qz,NUM_DIM)] = xyDz; - } - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - sync; - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - for (int c = 0; c < NUM_DIM; ++c) - { - s_xyz[ijkNM(c,dx,dy,NUM_DIM,NUM_QUAD_1D)] = r_xyz[ijN(c,qz,NUM_DIM)]; - s_xyDz[ijkNM(c,dx,dy,NUM_DIM,NUM_QUAD_1D)] = r_xyDz[ijN(c,qz,NUM_DIM)]; - } - } - } - } - // Finalize solution in xy plane - sync; - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - { - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double r_qv = 0; - for (int c = 0; c < NUM_DIM; ++c) - { - double Dxyz = 0; - double xDyz = 0; - double xyDz = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = s_H1DofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - const double wDy = s_H1DofToQuadD[ijN(qy, dy,NUM_QUAD_1D)]; - double Dxz = 0; - double xz = 0; - double xDz = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double wx = s_H1DofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - const double wDx = s_H1DofToQuadD[ijN(qx, dx,NUM_QUAD_1D)]; - Dxz += wDx * s_xyz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - xz += wx * s_xyz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - xDz += wx * s_xyDz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - } - Dxyz += wy * Dxz; - xDyz += wDy * xz; - xyDz += wy * xDz; - } - r_qv += ((Dxyz * stressJinvT[ijklmnNM(0, c, qx, qy, qz, el,NUM_DIM, - NUM_QUAD_1D)]) + - (xDyz * stressJinvT[ijklmnNM(1, c, qx, qy, qz, el,NUM_DIM,NUM_QUAD_1D)]) + - (xyDz * stressJinvT[ijklmnNM(2, c, qx, qy, qz, el,NUM_DIM,NUM_QUAD_1D)])); - } - s_v[ijN(qx, qy,NUM_QUAD_1D)] = r_qv; - } - } - } - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if 
((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - double r_e = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double r_ex = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_ex += s_v[ijN(qx, qy,NUM_QUAD_1D)] * s_L2QuadToDof[ijN(dx, qx,L2_DOFS_1D)]; - } - r_e += r_ex * s_L2QuadToDof[ijN(dy, qy,L2_DOFS_1D)]; - } - r_xyz[qz] = r_e; - } - } - } - } - sync; - { - const int dy = threadIdx.y; - const int dx = threadIdx.x; - { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - double r_e = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_e += r_xyz[qz] * s_L2QuadToDof[ijN(dz,qz,L2_DOFS_1D)]; - } - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] = r_e; - } - } - } - } - } - } - } -} - - -// ***************************************************************************** -void rForceMultS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - if (NUM_DIM==1) { assert(false); } - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int grid = ((numElements+ELEMENT_BATCH-1)/ELEMENT_BATCH); - const dim3 blck(INNER_SIZE,INNER_SIZE,1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - const unsigned int id =(NUM_DIM<<4)|(NUM_DOFS_1D-2); - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - static std::unordered_map call = - { - {0x20,&rForceMult2S<2,2,2,1,2>}, - {0x21,&rForceMult2S<2,3,4,2,3>}, - {0x22,&rForceMult2S<2,4,6,3,4>}, - {0x23,&rForceMult2S<2,5,8,4,5>}, - {0x24,&rForceMult2S<2,6,10,5,6>}, - 
{0x25,&rForceMult2S<2,7,12,6,7>}, - {0x26,&rForceMult2S<2,8,14,7,8>}, - {0x27,&rForceMult2S<2,9,16,8,9>}, - {0x28,&rForceMult2S<2,10,18,9,10>}, - {0x29,&rForceMult2S<2,11,20,10,11>}, - {0x2A,&rForceMult2S<2,12,22,11,12>}, - {0x2B,&rForceMult2S<2,13,24,12,13>}, - {0x2C,&rForceMult2S<2,14,26,13,14>}, - {0x2D,&rForceMult2S<2,15,28,14,15>}, - {0x2E,&rForceMult2S<2,16,30,15,16>}, - {0x2F,&rForceMult2S<2,17,32,16,17>}, - // 3D - {0x30,&rForceMult3S<3,2,2,1,2>}, - {0x31,&rForceMult3S<3,3,4,2,3>}, - {0x32,&rForceMult3S<3,4,6,3,4>}, - {0x33,&rForceMult3S<3,5,8,4,5>}, - {0x34,&rForceMult3S<3,6,10,5,6>}, - {0x35,&rForceMult3S<3,7,12,6,7>}, - {0x36,&rForceMult3S<3,8,14,7,8>}, - {0x37,&rForceMult3S<3,9,16,8,9>}, - {0x38,&rForceMult3S<3,10,18,9,10>}, - {0x39,&rForceMult3S<3,11,20,10,11>}, - {0x3A,&rForceMult3S<3,12,22,11,12>}, - {0x3B,&rForceMult3S<3,13,24,12,13>}, - {0x3C,&rForceMult3S<3,14,26,13,14>}, - {0x3D,&rForceMult3S<3,15,28,14,15>}, - {0x3E,&rForceMult3S<3,16,30,15,16>}, - {0x3F,&rForceMult3S<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMult] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,e,v); -} - - -// ***************************************************************************** -typedef void (*fForceMultTransposeS)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rForceMultTransposeS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - 
const double* restrict v, - double* restrict e) -{ - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int grid = ((numElements+ELEMENT_BATCH-1)/ELEMENT_BATCH); - const dim3 blck(INNER_SIZE,INNER_SIZE,1); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rForceMultTranspose2S<2,2,2,1,2>}, - {0x21,&rForceMultTranspose2S<2,3,4,2,3>}, - {0x22,&rForceMultTranspose2S<2,4,6,3,4>}, - {0x23,&rForceMultTranspose2S<2,5,8,4,5>}, - {0x24,&rForceMultTranspose2S<2,6,10,5,6>}, - {0x25,&rForceMultTranspose2S<2,7,12,6,7>}, - {0x26,&rForceMultTranspose2S<2,8,14,7,8>}, - {0x27,&rForceMultTranspose2S<2,9,16,8,9>}, - {0x28,&rForceMultTranspose2S<2,10,18,9,10>}, - {0x29,&rForceMultTranspose2S<2,11,20,10,11>}, - {0x2A,&rForceMultTranspose2S<2,12,22,11,12>}, - {0x2B,&rForceMultTranspose2S<2,13,24,12,13>}, - {0x2C,&rForceMultTranspose2S<2,14,26,13,14>}, - {0x2D,&rForceMultTranspose2S<2,15,28,14,15>}, - {0x2E,&rForceMultTranspose2S<2,16,30,15,16>}, - {0x2F,&rForceMultTranspose2S<2,17,32,16,17>}, - // 3D - {0x30,&rForceMultTranspose3S<3,2,2,1,2>}, - {0x31,&rForceMultTranspose3S<3,3,4,2,3>}, - {0x32,&rForceMultTranspose3S<3,4,6,3,4>}, - {0x33,&rForceMultTranspose3S<3,5,8,4,5>}, - {0x34,&rForceMultTranspose3S<3,6,10,5,6>}, - {0x35,&rForceMultTranspose3S<3,7,12,6,7>}, - {0x36,&rForceMultTranspose3S<3,8,14,7,8>}, - {0x37,&rForceMultTranspose3S<3,9,16,8,9>}, - {0x38,&rForceMultTranspose3S<3,10,18,9,10>}, - {0x39,&rForceMultTranspose3S<3,11,20,10,11>}, - {0x3A,&rForceMultTranspose3S<3,12,22,11,12>}, - {0x3B,&rForceMultTranspose3S<3,13,24,12,13>}, - {0x3C,&rForceMultTranspose3S<3,14,26,13,14>}, - 
//{0x3D,&rForceMultTranspose3S<3,15,28,14,15>}, // uses too much shared data - //{0x3E,&rForceMultTranspose3S<3,16,30,15,16>}, - //{0x3F,&rForceMultTranspose3S<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMultTranspose] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,v,e); -} diff --git a/cuda/cuda/kernels/share/gridFuncToQuadS.cpp b/cuda/cuda/kernels/share/gridFuncToQuadS.cpp deleted file mode 100644 index d929a319..00000000 --- a/cuda/cuda/kernels/share/gridFuncToQuadS.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rGridFuncToQuad2S(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double * restrict gf, - double* restrict out) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double s_dofToQuad[NUM_QUAD_DOFS_1D];//@dim(NUM_QUAD_1D, NUM_DOFS_1D); - - // Store xy planes in shared memory - share double s_xy[NUM_QUAD_DOFS_1D];//@dim(NUM_DOFS_1D, NUM_QUAD_1D); - - for (int x = 0; x < NUM_MAX_1D; ++x) - { - for (int id = x; id < NUM_QUAD_DOFS_1D; id += NUM_MAX_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - } - } - - for (int e = eOff; e < (eOff + M2_ELEMENT_BATCH); ++e) - { - if (e < numElements) - { - sync; - { - const int dx = threadIdx.x; - if (dx < NUM_DOFS_1D) - { - double r_x[NUM_DOFS_1D]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - r_x[dy] = gf[l2gMap[ijkN(dx, dy, e,NUM_DOFS_1D)]]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - xy += r_x[dy] * s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = xy; - } - } - } - sync; - { - const int qy = threadIdx.x; - if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double val = 0; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - val += s_xy[ijN(dx, qy,NUM_DOFS_1D)] * s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - } - out[ijkN(qx, qy, e,NUM_QUAD_1D)] = val; - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad3S(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double 
s_dofToQuad[NUM_QUAD_DOFS_1D]; - // Store xy planes in @shared memory - share double s_z[NUM_MAX_2D]; - // Store z axis as registers - double r_qz[NUM_QUAD_1D]; - sync; - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_MAX_1D) + x; - // Fetch Q <--> D maps - if (id < NUM_QUAD_DOFS_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - } - // Initialize our Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_qz[qz] = 0; - } - } - } - - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double val = gf[l2gMap[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]]; - // Calculate D -> Q in the Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_qz[qz] += val * s_dofToQuad[ijN(qz, dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - // Fill xy plane at given z position - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - s_z[ijN(dx, dy,NUM_DOFS_1D)] = r_qz[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double val = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - val += wx * wy * s_z[ijN(dx,dy,NUM_DOFS_1D)]; - } - } - out[ijklN(qx, qy, qz, e,NUM_QUAD_1D)] = val; - } - } - } - } - } -} - - -// ***************************************************************************** -typedef void (*fGridFuncToQuad)(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* gf, - double* restrict out); -// 
***************************************************************************** -void rGridFuncToQuadS(const int DIM, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const int* l2gMap, - const double* gf, - double* __restrict out) -{ - if (DIM==1) { assert(false); } - const int MX_ELEMENT_BATCH = DIM==2?M2_ELEMENT_BATCH:1; - const int grid = ((numElements+MX_ELEMENT_BATCH-1)/MX_ELEMENT_BATCH); - const int b1d = (NUM_QUAD_1D call = - { - // 2D - {0x210,&rGridFuncToQuad2S<1,1,2>}, - {0x211,&rGridFuncToQuad2S<1,2,4>}, - {0x212,&rGridFuncToQuad2S<1,3,6>}, - {0x213,&rGridFuncToQuad2S<1,4,8>}, - {0x214,&rGridFuncToQuad2S<1,5,10>}, - {0x215,&rGridFuncToQuad2S<1,6,12>}, - {0x216,&rGridFuncToQuad2S<1,7,14>}, - {0x217,&rGridFuncToQuad2S<1,8,16>}, - {0x218,&rGridFuncToQuad2S<1,9,18>}, - {0x219,&rGridFuncToQuad2S<1,10,20>}, - {0x21A,&rGridFuncToQuad2S<1,11,22>}, - {0x21B,&rGridFuncToQuad2S<1,12,24>}, - {0x21C,&rGridFuncToQuad2S<1,13,26>}, - {0x21D,&rGridFuncToQuad2S<1,14,28>}, - {0x21E,&rGridFuncToQuad2S<1,15,30>}, - {0x21F,&rGridFuncToQuad2S<1,16,32>}, - // 3D - {0x310,&rGridFuncToQuad3S<1,1,2>}, - {0x311,&rGridFuncToQuad3S<1,2,4>}, - {0x312,&rGridFuncToQuad3S<1,3,6>}, - {0x313,&rGridFuncToQuad3S<1,4,8>}, - {0x314,&rGridFuncToQuad3S<1,5,10>}, - {0x315,&rGridFuncToQuad3S<1,6,12>}, - {0x316,&rGridFuncToQuad3S<1,7,14>}, - {0x317,&rGridFuncToQuad3S<1,8,16>}, - {0x318,&rGridFuncToQuad3S<1,9,18>}, - {0x319,&rGridFuncToQuad3S<1,10,20>}, - {0x31A,&rGridFuncToQuad3S<1,11,22>}, - {0x31B,&rGridFuncToQuad3S<1,12,24>}, - {0x31C,&rGridFuncToQuad3S<1,13,26>}, - {0x31D,&rGridFuncToQuad3S<1,14,28>}, - {0x31E,&rGridFuncToQuad3S<1,15,30>}, - {0x31F,&rGridFuncToQuad3S<1,16,32>}, - }; - if (!call[id]) - { - printf("\n[rGridFuncToQuad] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, numElements,dofToQuad,l2gMap,gf,out); -} diff --git a/cuda/cuda/kernels/share/massAssembleS.cpp 
b/cuda/cuda/kernels/share/massAssembleS.cpp deleted file mode 100644 index 46282651..00000000 --- a/cuda/cuda/kernels/share/massAssembleS.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble2S0(const int numElements, - const int NUM_QUAD, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int idx = blockIdx.x; - const int eOff = idx; - if (eOff < numElements) - { - { - const int e = threadIdx.x; - { - const int qOff = threadIdx.y; - for (int q = qOff; q < NUM_QUAD; q += 1) - { - const double J11 = J[ijklNM(0, 0, q, e,2,NUM_QUAD)]; - const double J12 = J[ijklNM(1, 0, q, e,2,NUM_QUAD)]; - const double J21 = J[ijklNM(0, 1, q, e,2,NUM_QUAD)]; - const double J22 = J[ijklNM(1, 1, q, e,2,NUM_QUAD)]; - - oper[ijN(q,e,NUM_QUAD)] = quadWeights[q] * COEFF * ((J11 * J22) - (J21 * J12)); - } - } - } - } -} - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble3S0(const int numElements, - const int NUM_QUAD, - const double COEFF, - const double* restrict quadWeights, - const double* restrict J, - double* __restrict oper) -{ - const int idx = blockIdx.x; - const int eOff = idx; - if (eOff < numElements) - { - const int e = threadIdx.x; - { - if (e < numElements) - { - const int qOff = threadIdx.y; - { - for (int q = qOff; q < NUM_QUAD; q += 1) - { - const double J11 = J[ijklNM(0, 0, q, e,3,NUM_QUAD)]; - const double J12 = J[ijklNM(1, 0, q, e,3,NUM_QUAD)]; - const double J13 = J[ijklNM(2, 0, q, e,3,NUM_QUAD)]; - const double J21 = J[ijklNM(0, 1, q, e,3,NUM_QUAD)]; - const double J22 = J[ijklNM(1, 1, q, e,3,NUM_QUAD)]; - const double J23 = J[ijklNM(2, 1, q, e,3,NUM_QUAD)]; - const double J31 = J[ijklNM(0, 2, q, e,3,NUM_QUAD)]; - const double J32 = J[ijklNM(1, 2, q, e,3,NUM_QUAD)]; - const double J33 = J[ijklNM(2, 2, q, e,3,NUM_QUAD)]; - - const double detJ = ((J11 * J22 * J33) + (J12 * J23 * J31) + (J13 * J21 * J32) - - (J13 * J22 * J31) - (J12 * J21 * J33) - (J11 * J23 * J32)); - - oper[ijN(q, 
e,NUM_QUAD)] = quadWeights[q] * COEFF * detJ; - } - } - } - } - } -} - -// ***************************************************************************** -static void rMassAssemble2S(const int NUM_QUAD_2D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - dim3 threads(1, 1, 1); - dim3 blocks(numElements, 1, 1); - cuKerGBS(rMassAssemble2S,blocks,threads,numElements,NUM_QUAD_2D,COEFF, - quadWeights,J,oper); -} - -// ***************************************************************************** -static void rMassAssemble3S(const int NUM_QUAD_3D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - dim3 threads(1, 1, 1); - dim3 blocks(numElements, 1, 1); - cuKerGBS(rMassAssemble3S,blocks,threads,numElements,NUM_QUAD_3D,COEFF, - quadWeights,J,oper); -} - -// ***************************************************************************** -void rMassAssembleS(const int dim, - const int NUM_QUAD, - const int numElements, - const double* quadWeights, - const double* J, - const double COEFF, - double* __restrict oper) -{ - assert(false); - if (dim==1) {assert(false);} - if (dim==2) { rMassAssemble2S(NUM_QUAD,numElements,COEFF,quadWeights,J,oper); } - if (dim==3) { rMassAssemble3S(NUM_QUAD,numElements,COEFF,quadWeights,J,oper); } -} diff --git a/cuda/cuda/kernels/share/massMultAddS.cpp b/cuda/cuda/kernels/share/massMultAddS.cpp deleted file mode 100644 index bddbbe43..00000000 --- a/cuda/cuda/kernels/share/massMultAddS.cpp +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rMassMultAdd2S(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_quadToDof[NUM_QUAD_DOFS_1D]; - - // Store xy planes in shared memory - share double s_xy[NUM_QUAD_DOFS_1D]; - share double s_xy2[NUM_QUAD_2D]; - - double r_x[NUM_MAX_1D]; - - const int x = threadIdx.x; - { - for (int id = x; id < NUM_QUAD_DOFS_1D; id += NUM_MAX_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_quadToDof[id] = quadToDof[id]; - } - } - - for (int e = eOff; e < (eOff + M2_ELEMENT_BATCH); ++e) - { - if (e < numElements) - { - { - const int dx = threadIdx.x; - if (dx < NUM_DOFS_1D) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = 0; - } 
- for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - r_x[dy] = solIn[ijkN(dx, dy, e,NUM_DOFS_1D)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - xy += r_x[dy] * s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = xy; - } - } - } - sync; - const int qy = threadIdx.x; - { - if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double s = 0; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - s += s_xy[ijN(dx, qy,NUM_DOFS_1D)] * s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - } - s_xy2[ijN(qx, qy,NUM_QUAD_1D)] = s * oper[ijkN(qx, qy, e,NUM_QUAD_1D)]; - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - s_xy[ijN(dy, qx,NUM_DOFS_1D)] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - r_x[qy] = s_xy2[ijN(qx, qy,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double s = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - s += r_x[qy] * s_quadToDof[ijN(dy, qy,NUM_DOFS_1D)]; - } - s_xy[ijN(dy, qx,NUM_DOFS_1D)] = s; - } - } - } - sync; - const int dx = threadIdx.x; - { - if (dx < NUM_DOFS_1D) - { - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double s = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - s += (s_xy[ijN(dy, qx,NUM_DOFS_1D)] * s_quadToDof[ijN(dx, qx,NUM_DOFS_1D)]); - } - solOut[ijkN(dx, dy, e,NUM_DOFS_1D)] += s; - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rMassMultAdd3S(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double 
s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_quadToDof[NUM_QUAD_DOFS_1D]; - // Store xy planes in @shared memory - share double s_xy[NUM_MAX_2D]; - // Store z axis as registers - double r_z[NUM_QUAD_1D]; - double r_z2[NUM_DOFS_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_MAX_1D) + x; - // Fetch Q <--> D maps - if (id < NUM_QUAD_DOFS_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_quadToDof[id] = quadToDof[id]; - } - // Initialize our Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] = 0; - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - r_z2[dz] = 0; - } - } - } - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double s = solIn[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]; - // Calculate D -> Q in the Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] += s * s_dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - // Fill xy plane at given z position - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - s_xy[ijN(dx, dy,NUM_DOFS_1D)] = r_z[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double s = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - s += wx * wy * s_xy[ijN(dx, dy,NUM_DOFS_1D)]; - } - } - - s *= oper[ijklN(qx, qy, qz,e,NUM_QUAD_1D)]; - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double wz = s_quadToDof[ijN(dz, qz,NUM_DOFS_1D)]; - r_z2[dz] += wz * s; - } - } - } - } - } - // Iterate over xy planes 
to compute solution - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - // Place xy plane in @shared memory - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - s_xy[ijN(qx, qy,NUM_QUAD_1D)] = r_z2[dz]; - } - } - } - // Finalize solution in xy plane - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - double solZ = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = s_quadToDof[ijN(dy, qy,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = s_quadToDof[ijN(dx, qx,NUM_DOFS_1D)]; - solZ += wx * wy * s_xy[ijN(qx, qy,NUM_QUAD_1D)]; - } - } - solOut[ijklN(dx,dy,dz,e,NUM_DOFS_1D)] += solZ; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fMassMultAdd)(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut); - -// ***************************************************************************** -void rMassMultAddS(const int DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* op, - const double* x, - double* __restrict y) -{ - if (DIM==1) { assert(false); } - const int b1d = (NUM_QUAD_1D>1)<=8); - const unsigned int id = (DIM<<16)|((NUM_DOFS_1D-1)<<8)|(NUM_QUAD_1D>>1); - static std::unordered_map call = - { - // 2D - {0x20001,&rMassMultAdd2S<1,2>}, {0x20101,&rMassMultAdd2S<2,2>}, - {0x20102,&rMassMultAdd2S<2,4>}, {0x20202,&rMassMultAdd2S<3,4>}, - {0x20203,&rMassMultAdd2S<3,6>}, {0x20303,&rMassMultAdd2S<4,6>}, - {0x20304,&rMassMultAdd2S<4,8>}, {0x20404,&rMassMultAdd2S<5,8>}, - {0x20405,&rMassMultAdd2S<5,10>}, 
{0x20505,&rMassMultAdd2S<6,10>}, - {0x20506,&rMassMultAdd2S<6,12>}, {0x20606,&rMassMultAdd2S<7,12>}, - {0x20607,&rMassMultAdd2S<7,14>}, {0x20707,&rMassMultAdd2S<8,14>}, - {0x20708,&rMassMultAdd2S<8,16>}, {0x20808,&rMassMultAdd2S<9,16>}, - {0x20809,&rMassMultAdd2S<9,18>}, {0x20909,&rMassMultAdd2S<10,18>}, - {0x2090A,&rMassMultAdd2S<10,20>}, {0x20A0A,&rMassMultAdd2S<11,20>}, - {0x20A0B,&rMassMultAdd2S<11,22>}, {0x20B0B,&rMassMultAdd2S<12,22>}, - {0x20B0C,&rMassMultAdd2S<12,24>}, {0x20C0C,&rMassMultAdd2S<13,24>}, - {0x20C0D,&rMassMultAdd2S<13,26>}, {0x20D0D,&rMassMultAdd2S<14,26>}, - {0x20D0E,&rMassMultAdd2S<14,28>}, {0x20E0E,&rMassMultAdd2S<15,28>}, - {0x20E0F,&rMassMultAdd2S<15,30>}, {0x20F0F,&rMassMultAdd2S<16,30>}, - {0x20F10,&rMassMultAdd2S<16,32>}, {0x21010,&rMassMultAdd2S<17,32>}, - // 3D - {0x30001,&rMassMultAdd3S<1,2>}, {0x30101,&rMassMultAdd3S<2,2>}, - {0x30102,&rMassMultAdd3S<2,4>}, {0x30202,&rMassMultAdd3S<3,4>}, - {0x30203,&rMassMultAdd3S<3,6>}, {0x30303,&rMassMultAdd3S<4,6>}, - {0x30304,&rMassMultAdd3S<4,8>}, {0x30404,&rMassMultAdd3S<5,8>}, - {0x30405,&rMassMultAdd3S<5,10>}, {0x30505,&rMassMultAdd3S<6,10>}, - {0x30506,&rMassMultAdd3S<6,12>}, {0x30606,&rMassMultAdd3S<7,12>}, - {0x30607,&rMassMultAdd3S<7,14>}, {0x30707,&rMassMultAdd3S<8,14>}, - {0x30708,&rMassMultAdd3S<8,16>}, {0x30808,&rMassMultAdd3S<9,16>}, - {0x30809,&rMassMultAdd3S<9,18>}, {0x30909,&rMassMultAdd3S<10,18>}, - {0x3090A,&rMassMultAdd3S<10,20>}, {0x30A0A,&rMassMultAdd3S<11,20>}, - {0x30A0B,&rMassMultAdd3S<11,22>}, {0x30B0B,&rMassMultAdd3S<12,22>}, - {0x30B0C,&rMassMultAdd3S<12,24>}, {0x30C0C,&rMassMultAdd3S<13,24>}, - {0x30C0D,&rMassMultAdd3S<13,26>}, {0x30D0D,&rMassMultAdd3S<14,26>}, - {0x30D0E,&rMassMultAdd3S<14,28>}, {0x30E0E,&rMassMultAdd3S<15,28>}, - {0x30E0F,&rMassMultAdd3S<15,30>}, {0x30F0F,&rMassMultAdd3S<16,30>}, - {0x30F10,&rMassMultAdd3S<16,32>}, {0x31010,&rMassMultAdd3S<17,32>}, - }; - if (!call[id]) - { - printf("\n[rMassMultAddS] id \033[33m0x%X\033[m ",id); - fflush(stdout); 
- } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,dofToQuadD,quadToDof,quadToDofD,op,x,y); -} diff --git a/cuda/cuda/kernels/share/qDataUpdateS.cpp b/cuda/cuda/kernels/share/qDataUpdateS.cpp deleted file mode 100644 index 57fcc20e..00000000 --- a/cuda/cuda/kernels/share/qDataUpdateS.cpp +++ /dev/null @@ -1,725 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../cuda.hpp" - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData2S(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D (stress, soundSpeed) - const double s = -(GAMMA - 1.0) * q_rho * q_e; - q_stress[ijN(0,0,2)] = s; q_stress[ijN(1,0,2)] = 0; - q_stress[ijN(0,1,2)] = 0; q_stress[ijN(1,1,2)] = s; - - const double gradv00 = q_gradv[ijN(0,0,2)]; - const double gradv11 = q_gradv[ijN(1,1,2)]; - const double gradv10 = 0.5 * (q_gradv[ijN(1,0,2)] + q_gradv[ijN(0,1,2)]); - q_gradv[ijN(1,0,2)] = gradv10; - q_gradv[ijN(0,1,2)] = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) - { - minEig = (gradv00 < gradv11) ? gradv00 : gradv11; - } - else - { - const double zeta = (gradv11 - gradv00) / (2.0 * gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta + sqrt(1.0 + zeta*zeta)); - if ((t < 0) != (zeta < 0)) - { - t = -t; - } - - const double c = sqrt(1.0 / (1.0 + t*t)); - const double s = c * t; - t *= gradv10; - - if ((gradv00 - t) <= (gradv11 + t)) - { - minEig = gradv00 - t; - comprDirX = c; - comprDirY = -s; - } - else - { - minEig = gradv11 + t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. 
- const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11)); - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11)); - - const double physDirX = (Jpi_00 * comprDirX) + (Jpi_10 * comprDirY); - const double physDirY = (Jpi_01 * comprDirX) + (Jpi_11 * comprDirY); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY)); - - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(GAMMA * (GAMMA - 1.0) * q_e); - dtEst[ijN(q, el,NUM_QUAD)] = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) - { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) - { - for (int x = 0; x < NUM_DIM; ++x) - { - q_stress[ijN(x,y,2)] += coeff * q_gradv[ijN(x,y,2)]; - } - } - } - const double S00 = q_stress[ijN(0,0,2)]; - const double S10 = q_stress[ijN(1,0,2)]; - const double S01 = q_stress[ijN(0,1,2)]; - const double S11 = q_stress[ijN(1,1,2)]; - - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11)); - - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = 
q_Jw * ((S01 * invJ_00) + (S11 * invJ_01)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11)); - } - } - } -} - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData3S(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int el = blockIdx.x; - if (el < numElements) - { - share double s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_dofToQuadD[NUM_QUAD_DOFS_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_QUAD_1D) + x; - for (int i = id; i < (NUM_DOFS_1D * NUM_QUAD_1D); i += NUM_QUAD_2D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_dofToQuadD[id] = dofToQuadD[id]; - } - } - } - sync; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - { - const int qy = threadIdx.y; - { - const int qx = 0 + threadIdx.x; - const int q = qx + qy*NUM_QUAD_1D + qz*NUM_QUAD_2D; - double gradv[9]; - double q_gradv[9]; - double q_stress[9]; - - // Brute-force convertion of dof -> quad for now - for (int i = 0; i < 9; ++i) - { - gradv[i] = 0; - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double xy[3]; - double Dxy[3]; - double xDy[3]; - for (int vi = 0; vi < 3; ++vi) - { - xy[vi] = Dxy[vi] = xDy[vi] = 0; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double x[3]; - double Dx[3]; - for (int vi = 0; vi < 3; ++vi) - { - x[vi] = Dx[vi] = 0; - } - for (int dx = 0; dx 
< NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - const double wDx = s_dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - const double r_v = v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - x[vi] += wx * r_v; - Dx[vi] += wDx * r_v; - } - } - const double wy = s_dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = s_dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - xy[vi] += wy * x[vi]; - Dxy[vi] += wy * Dx[vi]; - xDy[vi] += wDy * x[vi]; - } - } - const double wz = s_dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = s_dofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - gradv[ijN(vi,0,3)] += wz * Dxy[vi]; - gradv[ijN(vi,1,3)] += wz * xDy[vi]; - gradv[ijN(vi,2,3)] += wDz * xy[vi]; - } - } - - const double invJ_00 = invJ[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_20 = invJ[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_21 = invJ[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_02 = invJ[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_12 = invJ[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_22 = invJ[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,3)] = ((gradv[ijN(0,0,3)] * invJ_00) + (gradv[ijN(1,0, - 3)] * invJ_01) + (gradv[ijN(2,0,3)] * invJ_02)); - q_gradv[ijN(1,0,3)] = ((gradv[ijN(0,0,3)] * invJ_10) + (gradv[ijN(1,0, - 3)] * invJ_11) + (gradv[ijN(2,0,3)] * invJ_12)); - q_gradv[ijN(2,0,3)] = ((gradv[ijN(0,0,3)] * invJ_20) + (gradv[ijN(1,0, - 3)] * invJ_21) + (gradv[ijN(2,0,3)] * invJ_22)); - - q_gradv[ijN(0,1,3)] = ((gradv[ijN(0,1,3)] * invJ_00) + (gradv[ijN(1,1, - 3)] * invJ_01) + (gradv[ijN(2,1,3)] * invJ_02)); - q_gradv[ijN(1,1,3)] = ((gradv[ijN(0,1,3)] * invJ_10) + 
(gradv[ijN(1,1, - 3)] * invJ_11) + (gradv[ijN(2,1,3)] * invJ_12)); - q_gradv[ijN(2,1,3)] = ((gradv[ijN(0,1,3)] * invJ_20) + (gradv[ijN(1,1, - 3)] * invJ_21) + (gradv[ijN(2,1,3)] * invJ_22)); - - q_gradv[ijN(0,2,3)] = ((gradv[ijN(0,2,3)] * invJ_00) + (gradv[ijN(1,2, - 3)] * invJ_01) + (gradv[ijN(2,2,3)] * invJ_02)); - q_gradv[ijN(1,2,3)] = ((gradv[ijN(0,2,3)] * invJ_10) + (gradv[ijN(1,2, - 3)] * invJ_11) + (gradv[ijN(2,2,3)] * invJ_12)); - q_gradv[ijN(2,2,3)] = ((gradv[ijN(0,2,3)] * invJ_20) + (gradv[ijN(1,2, - 3)] * invJ_21) + (gradv[ijN(2,2,3)] * invJ_22)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)] * quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0, e[ijN(q,el,NUM_QUAD)]); - - const double s = -(GAMMA - 1.0) * q_rho * q_e; - q_stress[ijN(0, 0,3)] = s; q_stress[ijN(1, 0,3)] = 0; q_stress[ijN(2, 0,3)] = 0; - q_stress[ijN(0, 1,3)] = 0; q_stress[ijN(1, 1,3)] = s; q_stress[ijN(2, 1,3)] = 0; - q_stress[ijN(0, 2,3)] = 0; q_stress[ijN(1, 2,3)] = 0; q_stress[ijN(2, 2,3)] = s; - - const double gradv00 = q_gradv[ijN(0, 0,3)]; - const double gradv11 = q_gradv[ijN(1, 1,3)]; - const double gradv22 = q_gradv[ijN(2, 2,3)]; - const double gradv10 = 0.5 * (q_gradv[ijN(1, 0,3)] + q_gradv[ijN(0, 1,3)]); - const double gradv20 = 0.5 * (q_gradv[ijN(2, 0,3)] + q_gradv[ijN(0, 2,3)]); - const double gradv21 = 0.5 * (q_gradv[ijN(2, 1,3)] + q_gradv[ijN(1, 2,3)]); - q_gradv[ijN(1, 0,3)] = gradv10; q_gradv[ijN(2, 0,3)] = gradv20; - q_gradv[ijN(0, 1,3)] = gradv10; q_gradv[ijN(2, 1,3)] = gradv21; - q_gradv[ijN(0, 2,3)] = gradv20; q_gradv[ijN(1, 2,3)] = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00 + gradv11 + gradv22) / 3.0; - const double gradv_q00 = (gradv00 - q_); - const double gradv_q11 = (gradv11 - q_); - const double gradv_q22 = (gradv22 - q_); - - const double p1 = ((gradv10 * 
gradv10) + - (gradv20 * gradv20) + - (gradv21 * gradv21)); - const double p2 = ((gradv_q00 * gradv_q00) + - (gradv_q11 * gradv_q11) + - (gradv_q22 * gradv_q22) + - (2.0 * p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv * (gradv - q*I)) - const double r = (0.5 * pinv * pinv * pinv * - ((gradv_q00 * gradv_q11 * gradv_q22) + - (2.0 * gradv10 * gradv21 * gradv20) - - (gradv_q11 * gradv20 * gradv20) - - (gradv_q22 * gradv10 * gradv10) - - (gradv_q00 * gradv21 * gradv21))); - - double phi = 0; - if (r <= -1.0) - { - phi = M_PI / 3.0; - } - else if (r < 1.0) - { - phi = acos(r) / 3.0; - } - - minEig = q_ + (2.0 * p * cos(phi + (2.0 * M_PI / 3.0))); - const double eig3 = q_ + (2.0 * p * cos(phi)); - const double eig2 = 3.0 * q_ - minEig - eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) - { - const double x = q_gradv[i + 3*0] - (i == 0)*eig3; - const double y = q_gradv[i + 3*1] - (i == 1)*eig3; - const double z = q_gradv[i + 3*2] - (i == 2)*eig3; - const double cx = ((x * (gradv00 - eig2)) + - (y * gradv10) + - (z * gradv20)); - const double cy = ((x * gradv10) + - (y * (gradv11 - eig2)) + - (z * gradv21)); - const double cz = ((x * gradv20) + - (y * gradv21) + - (z * (gradv22 - eig2))); - const double cNorm = (cx*cx + cy*cy + cz*cz); - if ((cNorm > 1e-16) && (maxNorm < cNorm)) - { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = cNorm; - } - } - if (maxNorm > 1e-16) - { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. 
- const double J_00 = J[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_20 = J[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_21 = J[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_02 = J[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double J_12 = J[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double J_22 = J[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_20 = invJ0[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_21 = invJ0[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_02 = invJ0[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_12 = invJ0[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_22 = invJ0[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01) + - (J_20 * invJ0_02)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11) + - (J_20 * invJ0_12)); - const double Jpi_20 = ((J_00 * invJ0_20) + (J_10 * invJ0_21) + - (J_20 * invJ0_22)); - - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01) + - (J_21 * invJ0_02)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11) + - (J_21 * invJ0_12)); - const double Jpi_21 = ((J_01 * invJ0_20) + (J_11 * invJ0_21) + - (J_21 * invJ0_22)); - - const double Jpi_02 = ((J_02 * invJ0_00) + (J_12 * invJ0_01) + - (J_22 * invJ0_02)); - const double Jpi_12 = ((J_02 * invJ0_10) + (J_12 * invJ0_11) + - (J_22 * invJ0_12)); - const double Jpi_22 = ((J_02 * invJ0_20) + (J_12 * invJ0_21) + - (J_22 * invJ0_22)); - 
- const double physDirX = ((Jpi_00 * comprDirX) + (Jpi_10 * comprDirY) + - (Jpi_20 * comprDirZ)); - const double physDirY = ((Jpi_01 * comprDirX) + (Jpi_11 * comprDirY) + - (Jpi_21 * comprDirZ)); - const double physDirZ = ((Jpi_02 * comprDirX) + (Jpi_12 * comprDirY) + - (Jpi_22 * comprDirZ)); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY) + - (physDirZ * physDirZ)); - - const double soundSpeed = sqrt(GAMMA * (GAMMA - 1.0) * q_e); - dtEst[ijN(q, el,NUM_QUAD)] = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) - { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - q_stress[ijN(x, y,3)] += coeff * q_gradv[ijN(x, y,3)]; - } - } - } - - const double S00 = q_stress[ijN(0, 0,3)]; - const double S10 = q_stress[ijN(1, 0,3)]; - const double S20 = q_stress[ijN(2, 0,3)]; - const double S01 = q_stress[ijN(0, 1,3)]; - const double S11 = q_stress[ijN(1, 1,3)]; - const double S21 = q_stress[ijN(2, 1,3)]; - const double S02 = q_stress[ijN(0, 2,3)]; - const double S12 = q_stress[ijN(1, 2,3)]; - const double S22 = q_stress[ijN(2, 2,3)]; - - stressJinvT[ijklNM(0, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01) + (S20 * invJ_02)); - stressJinvT[ijklNM(1, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11) + (S20 * invJ_12)); - stressJinvT[ijklNM(2, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_20) + (S10 * invJ_21) + (S20 * invJ_22)); - - stressJinvT[ijklNM(0, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01) + (S21 * invJ_02)); - stressJinvT[ijklNM(1, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11) + (S21 * invJ_12)); - stressJinvT[ijklNM(2, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_20) + (S11 * invJ_21) + (S21 * 
invJ_22)); - - stressJinvT[ijklNM(0, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_00) + (S12 * invJ_01) + (S22 * invJ_02)); - stressJinvT[ijklNM(1, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_10) + (S12 * invJ_11) + (S22 * invJ_12)); - stressJinvT[ijklNM(2, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_20) + (S12 * invJ_21) + (S22 * invJ_22)); - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fUpdateQuadratureDataS)(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// ***************************************************************************** -void rUpdateQuadratureDataS(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int nzones, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int grid = nzones; - const int b1d = (NUM_QUAD_1D call = - { - // 2D - {0x20,&rUpdateQuadratureData2S<2,2*2,2,2>}, - {0x21,&rUpdateQuadratureData2S<2,4*4,4,3>}, - {0x22,&rUpdateQuadratureData2S<2,6*6,6,4>}, - {0x23,&rUpdateQuadratureData2S<2,8*8,8,5>}, - 
{0x24,&rUpdateQuadratureData2S<2,10*10,10,6>}, - {0x25,&rUpdateQuadratureData2S<2,12*12,12,7>}, - {0x26,&rUpdateQuadratureData2S<2,14*14,14,8>}, - {0x27,&rUpdateQuadratureData2S<2,16*16,16,9>}, - {0x28,&rUpdateQuadratureData2S<2,18*18,18,10>}, - {0x29,&rUpdateQuadratureData2S<2,20*20,20,11>}, - {0x2A,&rUpdateQuadratureData2S<2,22*22,22,12>}, - {0x2B,&rUpdateQuadratureData2S<2,24*24,24,13>}, - {0x2C,&rUpdateQuadratureData2S<2,26*26,26,14>}, - {0x2D,&rUpdateQuadratureData2S<2,28*28,28,15>}, - //{0x2E,&rUpdateQuadratureData2S<2,30*30,30,16>}, uses too much shared data - //{0x2F,&rUpdateQuadratureData2S<2,32*32,32,17>}, uses too much shared data - // 3D - {0x30,&rUpdateQuadratureData3S<3,2*2*2,2,2>}, - {0x31,&rUpdateQuadratureData3S<3,4*4*4,4,3>}, - {0x32,&rUpdateQuadratureData3S<3,6*6*6,6,4>}, - {0x33,&rUpdateQuadratureData3S<3,8*8*8,8,5>}, - {0x34,&rUpdateQuadratureData3S<3,10*10*10,10,6>}, - {0x35,&rUpdateQuadratureData3S<3,12*12*12,12,7>}, - {0x36,&rUpdateQuadratureData3S<3,14*14*14,14,8>}, - {0x37,&rUpdateQuadratureData3S<3,16*16*16,16,9>}, - {0x38,&rUpdateQuadratureData3S<3,18*18*18,18,10>}, - {0x39,&rUpdateQuadratureData3S<3,20*20*20,20,11>}, - {0x3A,&rUpdateQuadratureData3S<3,22*22*22,22,12>}, - {0x3B,&rUpdateQuadratureData3S<3,24*24*24,24,13>}, - {0x3C,&rUpdateQuadratureData3S<3,26*26*26,26,14>}, - {0x3D,&rUpdateQuadratureData3S<3,28*28*28,28,15>}, - {0x3E,&rUpdateQuadratureData3S<3,30*30*30,30,16>}, - {0x3F,&rUpdateQuadratureData3S<3,32*32*32,32,17>}, - }; - if (!call[id]) - { - printf("\n[rUpdateQuadratureDataS] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - GAMMA,H0,CFL,USE_VISCOSITY, - nzones,dofToQuad,dofToQuadD,quadWeights, - v,e,rho0DetJ0w,invJ0,J,invJ,detJ, - stressJinvT,dtEst); -} diff --git a/cuda/cuda/linalg/ode.hpp b/cuda/cuda/linalg/ode.hpp deleted file mode 100644 index 9bd7f2d3..00000000 --- a/cuda/cuda/linalg/ode.hpp +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore 
National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_ODE -#define LAGHOS_CUDA_ODE - -namespace mfem -{ - -// *************************************************************************** -class CudaODESolver -{ -protected: - CudaTimeDependentOperator *f; -public: - CudaODESolver() : f(NULL) {} - virtual ~CudaODESolver() {} - virtual void Init(CudaTimeDependentOperator &f) { this->f = &f; } - virtual void Step(CudaVector &x, double &t, double &dt) =0; -}; - -// *************************************************************************** -class CudaForwardEulerSolver : public CudaODESolver -{ -private: - CudaVector dxdt; -public: - void Init(CudaTimeDependentOperator &_f) - { - f = &_f; - dxdt.SetSize(f->Width()); - } - void Step(CudaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, dxdt); - x.Add(dt, dxdt); - t += dt; - } -}; - -// *************************************************************************** -class CudaRK2Solver : public CudaODESolver -{ -private: - double a; - CudaVector dxdt, x1; -public: - CudaRK2Solver(const double _a = 2./3.) 
: a(_a) { } - void Init(CudaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - dxdt.SetSize(n); - x1.SetSize(n); - } - void Step(CudaVector &x, double &t, double &dt) - { - const double b = 0.5/a; - f->SetTime(t); - f->Mult(x, dxdt); - add(x, (1. - b)*dt, dxdt, x1); - x.Add(a*dt, dxdt); - f->SetTime(t + a*dt); - f->Mult(x, dxdt); - add(x1, b*dt, dxdt, x); - t += dt; - } -}; - -// *************************************************************************** -class CudaRK3SSPSolver : public CudaODESolver -{ -private: - CudaVector y, k; -public: - void Init(CudaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - k.SetSize(n); - } - void Step(CudaVector &x, double &t, double &dt) - { - // x0 = x, t0 = t, k0 = dt*f(t0, x0) - f->SetTime(t); - f->Mult(x, k); - // x1 = x + k0, t1 = t + dt, k1 = dt*f(t1, x1) - add(x, dt, k, y); - f->SetTime(t + dt); - f->Mult(y, k); - // x2 = 3/4*x + 1/4*(x1 + k1), t2 = t + 1/2*dt, k2 = dt*f(t2, x2) - y.Add(dt, k); - add(3./4, x, 1./4, y, y); - f->SetTime(t + dt/2); - f->Mult(y, k); - // x3 = 1/3*x + 2/3*(x2 + k2), t3 = t + dt - y.Add(dt, k); - add(1./3, x, 2./3, y, x); - t += dt; - } -}; - -// *************************************************************************** -class CudaRK4Solver : public CudaODESolver -{ -private: - CudaVector y, k, z; -public: - void Init(CudaTimeDependentOperator &_f) - { - f = &_f; - int n = CudaODESolver::f->Width(); - y.SetSize(n); - k.SetSize(n); - z.SetSize(n); - } - - void Step(CudaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k); // k1 - add(x, dt/2, k, y); - add(x, dt/6, k, z); - f->SetTime(t + dt/2); - f->Mult(y, k); // k2 - add(x, dt/2, k, y); - z.Add(dt/3, k); - f->Mult(y, k); // k3 - add(x, dt, k, y); - z.Add(dt/3, k); - f->SetTime(t + dt); - f->Mult(y, k); // k4 - add(z, dt/6, k, x); - t += dt; - } -}; - -// *************************************************************************** -class CudaExplicitRKSolver : public CudaODESolver 
-{ -private: - int s; - const double *a, *b, *c; - CudaVector y, *k; -public: - CudaExplicitRKSolver(int _s, const double *_a, - const double *_b, const double *_c) - { - s = _s; - a = _a; - b = _b; - c = _c; - k = new CudaVector[s]; - } - void Init(CudaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - for (int i = 0; i < s; i++) - { - k[i].SetSize(n); - } - } - void Step(CudaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k[0]); - for (int l = 0, i = 1; i < s; i++) - { - add(x, a[l++]*dt, k[0], y); - for (int j = 1; j < i; j++) - { - y.Add(a[l++]*dt, k[j]); - } - f->SetTime(t + c[i-1]*dt); - f->Mult(y, k[i]); - } - for (int i = 0; i < s; i++) - { - x.Add(b[i]*dt, k[i]); - } - t += dt; - } - ~CudaExplicitRKSolver() - { - delete [] k; - } -}; - -// *************************************************************************** -// *************************************************************************** -static const double RK6_a[28] = -{ - .6e-1, - .1923996296296296296296296296296296296296e-1, - .7669337037037037037037037037037037037037e-1, - .35975e-1, - 0., - .107925, - 1.318683415233148260919747276431735612861, - 0., - -5.042058063628562225427761634715637693344, - 4.220674648395413964508014358283902080483, - -41.87259166432751461803757780644346812905, - 0., - 159.4325621631374917700365669070346830453, - -122.1192135650100309202516203389242140663, - 5.531743066200053768252631238332999150076, - -54.43015693531650433250642051294142461271, - 0., - 207.0672513650184644273657173866509835987, - -158.6108137845899991828742424365058599469, - 6.991816585950242321992597280791793907096, - -.1859723106220323397765171799549294623692e-1, - -54.66374178728197680241215648050386959351, - 0., - 207.9528062553893734515824816699834244238, - -159.2889574744995071508959805871426654216, - 7.018743740796944434698170760964252490817, - -.1833878590504572306472782005141738268361e-1, - -.5119484997882099077875432497245168395840e-3 -}; - 
-static const double RK6_b[8] = -{ - .3438957868357036009278820124728322386520e-1, - 0., - 0., - .2582624555633503404659558098586120858767, - .4209371189673537150642551514069801967032, - 4.405396469669310170148836816197095664891, - -176.4831190242986576151740942499002125029, - 172.3641334014150730294022582711902413315 -}; - -static const double RK6_c[7] = -{ - .6e-1, - .9593333333333333333333333333333333333333e-1, - .1439, - .4973, - .9725, - .9995, - 1., -}; - -class CudaRK6Solver : public CudaExplicitRKSolver -{ -public: - CudaRK6Solver() : CudaExplicitRKSolver(8, RK6_a, RK6_b, RK6_c) { } -}; - -} // mfem - -#endif // LAGHOS_CUDA_ODE diff --git a/cuda/cuda/linalg/operator.hpp b/cuda/cuda/linalg/operator.hpp deleted file mode 100644 index 5f7f9540..00000000 --- a/cuda/cuda/linalg/operator.hpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_CUDA_OPERATOR -#define LAGHOS_CUDA_OPERATOR - -namespace mfem -{ - -// *************************************************************************** -class CudaOperator : public rmemcpy -{ -protected: - int height; - int width; -public: - explicit CudaOperator(int s = 0) { height = width = s; } - CudaOperator(int h, int w) { height = h; width = w; } - inline int Height() const { return height; } - inline int Width() const { return width; } - virtual void Mult(const CudaVector &x, CudaVector &y) const { assert(false); }; - virtual void MultTranspose(const CudaVector &x, CudaVector &y) const { assert(false); } - virtual const CudaOperator *GetProlongation() const { assert(false); return NULL; } - virtual const CudaOperator *GetRestriction() const { assert(false); return NULL; } - virtual void RecoverFEMSolution(const CudaVector &X, - const CudaVector &b, - CudaVector &x) {assert(false);} -}; - - -// *************************************************************************** -class CudaTimeDependentOperator : public CudaOperator -{ -private: - double t; -public: - explicit CudaTimeDependentOperator(int n = 0, - double t_ = 0.0) : CudaOperator(n), t(t_) {} - void SetTime(const double _t) { t = _t; } -}; - -// *************************************************************************** -class CudaSolverOperator : public CudaOperator -{ -public: - bool iterative_mode; - explicit CudaSolverOperator(int s = 0, - bool iter_mode = false) : - CudaOperator(s), - iterative_mode(iter_mode) { } - virtual void SetOperator(const CudaOperator &op) = 0; -}; - -// *************************************************************************** -class CudaRAPOperator : public CudaOperator -{ -private: - const CudaOperator &Rt; - const CudaOperator &A; - const CudaOperator &P; - mutable CudaVector Px; - mutable CudaVector APx; -public: - /// Construct the RAP operator given R^T, A and P. 
- CudaRAPOperator(const CudaOperator &Rt_, const CudaOperator &A_, - const CudaOperator &P_) - : CudaOperator(Rt_.Width(), P_.Width()), Rt(Rt_), A(A_), P(P_), - Px(P.Height()), APx(A.Height()) { } - /// Operator application. - void Mult(const CudaVector & x, CudaVector & y) const - { - P.Mult(x, Px); - A.Mult(Px, APx); - Rt.MultTranspose(APx, y); - } - /// Application of the transpose. - void MultTranspose(const CudaVector & x, CudaVector & y) const - { - Rt.Mult(x, APx); - A.MultTranspose(APx, Px); - P.MultTranspose(Px, y); - } -}; - -} // mfem - -#endif // LAGHOS_CUDA_OPERATOR diff --git a/cuda/cuda/linalg/solvers.cpp b/cuda/cuda/linalg/solvers.cpp deleted file mode 100644 index b243c400..00000000 --- a/cuda/cuda/linalg/solvers.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../cuda.hpp" - -namespace mfem -{ - -// ************************************************************************* -void CudaCGSolver::h_Mult(const CudaVector &b, CudaVector &x) const -{ - int i; - double r0, den, nom, nom0, betanom, alpha, beta; - if (iterative_mode) - { - oper->Mult(x, r); - subtract(b, r, r); // r = b - A x - } - else - { - r = b; - x = 0.0; - } - - if (prec) - { - prec->Mult(r, z); // z = B r - d = z; - } - else - { - d = r; - } - - nom0 = nom = Dot(d, r); - MFEM_ASSERT(IsFinite(nom), "nom = " << nom); - - if (print_level == 1 - || print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom << (print_level == 3 ? " ...\n" : "\n"); - } - - r0 = std::max(nom*rel_tol*rel_tol,abs_tol*abs_tol); - - if (nom <= r0) - { - converged = 1; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - oper->Mult(d, z); // z = A d - - den = Dot(z, d); - MFEM_ASSERT(IsFinite(den), "den = " << den); - - if (print_level >= 0 && den < 0.0) - { - mfem::out << "Negative denominator in step 0 of PCG: " << den << '\n'; - } - - if (den == 0.0) - { - converged = 0; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - // start iteration - converged = 0; - final_iter = max_iter; - for (i = 1; true; ) - { - alpha = nom/den; - add(x, alpha, d, x); // x = x + alpha d - add(r, -alpha, z, r); // r = r - alpha A d - - if (prec) - { - prec->Mult(r, z); // z = B r - betanom = Dot(r, z); - } - else - { - betanom = Dot(r, r); - } - MFEM_ASSERT(IsFinite(betanom), "betanom = " << betanom); - - if (print_level == 1) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - - if (betanom < r0) - { - if (print_level == 2) - { - mfem::out << "Number of PCG iterations: " << i << '\n'; - } - else if (print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - converged = 1; - final_iter = i; - break; - } - - if (++i > 
max_iter) - { - break; - } - - beta = betanom/nom; - if (prec) - { - add(z, beta, d, d); // d = z + beta d - } - else - { - add(r, beta, d, d); - } - - oper->Mult(d, z); // z = A d - den = Dot(d, z); - - MFEM_ASSERT(IsFinite(den), "den = " << den); - if (den <= 0.0) - { - if (print_level >= 0 && Dot(d, d) > 0.0) - mfem::out << "PCG: The operator is not positive definite. (Ad, d) = " - << den << '\n'; - } - nom = betanom; - } - - if (print_level >= 0 && !converged) - { - if (print_level != 1) - { - if (print_level != 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom0 << " ...\n"; - } - mfem::out << " Iteration : " << std::setw(3) << final_iter << " (B r, r) = " - << betanom << '\n'; - } - mfem::out << "PCG: No convergence!" << '\n'; - } - - if (print_level >= 1 || (print_level >= 0 && !converged)) - { - mfem::out << "Average reduction factor = " - << pow (betanom/nom0, 0.5/final_iter) << '\n'; - } - final_norm = sqrt(betanom); -} - -} // mfem diff --git a/cuda/cuda/linalg/solvers.hpp b/cuda/cuda/linalg/solvers.hpp deleted file mode 100644 index 87b7f127..00000000 --- a/cuda/cuda/linalg/solvers.hpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_SOLVERS -#define LAGHOS_CUDA_SOLVERS - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -class CudaIterativeSolver : public CudaSolverOperator -{ -#ifdef MFEM_USE_MPI -private: - int dot_prod_type; // 0 - local, 1 - global over 'comm' - MPI_Comm comm; -#endif -protected: - const CudaOperator *oper; - CudaSolverOperator *prec; - int max_iter, print_level; - double rel_tol, abs_tol; - // stats - mutable int final_iter, converged; - mutable double final_norm; - double Dot(const CudaVector &x, - const CudaVector &y) const - { -#ifndef MFEM_USE_MPI - return (x * y); -#else - if (dot_prod_type == 0) - { - return (x * y); - } - double local_dot = (x * y); - double global_dot; - MPI_Allreduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, comm); - return global_dot; -#endif - } - double Norm(const CudaVector &x) const { return sqrt(Dot(x, x)); } -public: - CudaIterativeSolver(): CudaSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; -#ifdef MFEM_USE_MPI - dot_prod_type = 0; -#endif - } - -#ifdef MFEM_USE_MPI - CudaIterativeSolver(MPI_Comm _comm) - : CudaSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; - dot_prod_type = 1; - comm = _comm; - } -#endif - - void SetRelTol(double rtol) { rel_tol = rtol; } - void SetAbsTol(double atol) { abs_tol = atol; } - void SetMaxIter(int max_it) { max_iter = max_it; } - void SetPrintLevel(int print_lvl) - { -#ifndef MFEM_USE_MPI - 
print_level = print_lvl; -#else - if (dot_prod_type == 0) - { - print_level = print_lvl; - } - else - { - int rank; - MPI_Comm_rank(comm, &rank); - if (rank == 0) - { - print_level = print_lvl; - } - } -#endif - } - int GetNumIterations() const { return final_iter; } - int GetConverged() const { return converged; } - double GetFinalNorm() const { return final_norm; } - /// This should be called before SetOperator - virtual void SetPreconditioner(CudaSolverOperator &pr) - { - prec = ≺ - prec->iterative_mode = false; - } - /// Also calls SetOperator for the preconditioner if there is one - virtual void SetOperator(const CudaOperator &op) - { - oper = &op; - height = op.Height(); - width = op.Width(); - if (prec) - { - prec->SetOperator(*oper); - } - } -}; - -// *************************************************************************** -// Conjugate gradient method -// *************************************************************************** -class CudaCGSolver : public CudaIterativeSolver -{ -protected: - mutable CudaVector r, d, z; - void UpdateVectors() - { - r.SetSize(width); - d.SetSize(width); - z.SetSize(width); - } -public: - CudaCGSolver() { } -#ifdef MFEM_USE_MPI - CudaCGSolver(MPI_Comm _comm) : CudaIterativeSolver(_comm) { } -#endif - virtual void SetOperator(const CudaOperator &op) - { - CudaIterativeSolver::SetOperator(op); - UpdateVectors(); - } - void h_Mult(const CudaVector &b, CudaVector &x) const ; - virtual void Mult(const CudaVector &b, CudaVector &x) const - { - h_Mult(b,x); - } -}; - -} // mfem - -#endif // LAGHOS_CUDA_SOLVERS diff --git a/cuda/cuda/linalg/vector.cpp b/cuda/cuda/linalg/vector.cpp deleted file mode 100644 index ffbe82a4..00000000 --- a/cuda/cuda/linalg/vector.cpp +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. 
-// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../cuda.hpp" - -namespace mfem -{ - -CudaVector::~CudaVector() -{ - if (!own) { return; } - rmalloc::operator delete (data); -} - -// *************************************************************************** -double* CudaVector::alloc(const size_t sz) -{ - return (double*) rmalloc::operator new (sz); -} - -// *************************************************************************** -void CudaVector::SetSize(const size_t sz, const void* ptr) -{ - own=true; - size = sz; - if (!data) { data = alloc(sz); } - if (ptr) { rDtoD(data,ptr,bytes()); } -} - -// *************************************************************************** -CudaVector::CudaVector(const size_t sz):size(sz),data(alloc(sz)),own(true) {} -CudaVector::CudaVector(const size_t sz,double value): - size(sz),data(alloc(sz)),own(true) -{ - *this=value; -} - -CudaVector::CudaVector(const CudaVector& v): - size(0),data(NULL),own(true) { SetSize(v.Size(), v); } - -CudaVector::CudaVector(const CudaVector *v):size(v->size),data(v->data), - own(false) {} - -CudaVector::CudaVector(CudaArray& v):size(v.size()),data(v.ptr()), - own(false) {} - -// Host 2 Device *************************************************************** -CudaVector::CudaVector(const Vector& v):size(v.Size()),data(alloc(size)), - own(true) -{ - assert(v.GetData()); - rmemcpy::rHtoD(data,v.GetData(),size*sizeof(double)); -} - -// Device 2 Host *************************************************************** -CudaVector::operator Vector() -{ - if (!rconfig::Get().Cuda()) { return Vector(data,size); } - double *h_data= (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector 
mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -CudaVector::operator Vector() const -{ - if (!rconfig::Get().Cuda()) { return Vector(data,size); } - double *h_data= (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -// *************************************************************************** -void CudaVector::Print(std::ostream& out, int width) const -{ - double *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i &ess_tdofs, - const double value, - const int N) -{ - vector_set_subvector_const(N, value, data, ess_tdofs.ptr()); -} - - -// *************************************************************************** -double CudaVector::Min() const -{ - return vector_min(Size(),(double*)data); -} - -// *************************************************************************** -void add(const CudaVector& v1, const double alpha, - const CudaVector& v2, CudaVector& out) -{ - vector_xpay(out.Size(),alpha,out.ptr(),v1.ptr(),v2.ptr()); -} - -// ***************************************************************************** -void add(const double alpha, - const CudaVector& v1, - const double beta, - const CudaVector& v2, - CudaVector& out) { assert(false); } - -// *************************************************************************** -void subtract(const CudaVector& v1, - const CudaVector& v2, - CudaVector& out) -{ - vector_xsy(out.Size(),out.ptr(),v1.ptr(),v2.ptr()); -} - -} // mfem diff --git a/cuda/cuda/linalg/vector.hpp b/cuda/cuda/linalg/vector.hpp deleted file mode 100644 index 5053df42..00000000 --- a/cuda/cuda/linalg/vector.hpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_CUDA_VECTOR -#define LAGHOS_CUDA_VECTOR - -namespace mfem -{ - -class CudaVector : public rmalloc -{ -private: - size_t size = 0; - double* data = NULL; - bool own = true; -public: - CudaVector(): size(0),data(NULL),own(true) {} - CudaVector(const CudaVector&); - CudaVector(const CudaVector*); - CudaVector(const size_t); - CudaVector(const size_t,double); - CudaVector(const Vector& v); - CudaVector(CudaArray& v); - operator Vector(); - operator Vector() const; - double* alloc(const size_t); - inline double* ptr() const { return data;} - inline double* GetData() const { return data;} - inline operator double* () { return data; } - inline operator const double* () const { return data; } - void Print(std::ostream& = std::cout, int = 8) const; - void SetSize(const size_t,const void* =NULL); - inline size_t Size() const { return size; } - inline size_t bytes() const { return size*sizeof(double); } - double operator* (const CudaVector& v) const; - CudaVector& operator = (const CudaVector& v); - CudaVector& operator = (const Vector& v); - CudaVector& operator = (double value); - CudaVector& operator -= (const CudaVector& v); - CudaVector& operator += (const CudaVector& v); - CudaVector& 
operator += (const Vector& v); - CudaVector& operator *=(const double d); - CudaVector& Add(const double a, const CudaVector& Va); - void Neg(); - CudaVector* GetRange(const size_t, const size_t) const; - void SetSubVector(const CudaArray &, const double, const int); - double Min() const; - ~CudaVector(); -}; - -// *************************************************************************** -void add(const CudaVector&,const double,const CudaVector&,CudaVector&); -void add(const CudaVector&,const CudaVector&,CudaVector&); -void add(const double,const CudaVector&,const double,const CudaVector&, - CudaVector&); -void subtract(const CudaVector&,const CudaVector&,CudaVector&); - -} - -#endif // LAGHOS_CUDA_VECTOR diff --git a/cuda/laghos.cpp b/cuda/laghos.cpp deleted file mode 100644 index 9ba39d6e..00000000 --- a/cuda/laghos.cpp +++ /dev/null @@ -1,663 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-// -// __ __ -// / / ____ ____ / /_ ____ _____ -// / / / __ `/ __ `/ __ \/ __ \/ ___/ -// / /___/ /_/ / /_/ / / / / /_/ (__ ) -// /_____/\__,_/\__, /_/ /_/\____/____/ -// /____/ -// -// High-order Lagrangian Hydrodynamics Miniapp -// -// CUDA version -// -// Laghos(LAGrangian High-Order Solver) is a miniapp that solves the -// time-dependent Euler equation of compressible gas dynamics in a moving -// Lagrangian frame using unstructured high-order finite element spatial -// discretization and explicit high-order time-stepping. Laghos is based on the -// numerical algorithm described in the following article: -// -// V. Dobrev, Tz. Kolev and R. Rieben, "High-order curvilinear finite element -// methods for Lagrangian hydrodynamics", SIAM Journal on Scientific -// Computing, (34) 2012, pp.B606–B641, https://doi.org/10.1137/120864672. -// -// Sample runs: -// mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/square01_tri.mesh -rs 1 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 2.0 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 0 -tf 0.8 -ok 7 -ot 6 -// mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -// mpirun -np 8 laghos -p 2 -m data/segment01.mesh -rs 5 -tf 0.2 -// mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 2.5 -// mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 2.5 -// -// Test problems: -// p = 0 --> Taylor-Green vortex (smooth problem). -// p = 1 --> Sedov blast. -// p = 2 --> 1D Sod shock tube. -// p = 3 --> Triple point. - - -#include "laghos_solver.hpp" -#include -#include -#include -#include - -using namespace std; -using namespace mfem; -using namespace mfem::hydrodynamics; - -// Choice for the problem setup. -int problem = 0; - -void display_banner(ostream & os); - -int main(int argc, char *argv[]) -{ - // Initialize MPI. 
- MPI_Session mpi(argc, argv); - int myid = mpi.WorldRank(); - - // Print the banner. - if (mpi.Root()) { display_banner(cout); } - - // Parse command-line options. - const char *mesh_file = "../data/square01_quad.mesh"; - int rs_levels = 0; - int rp_levels = 0; - int order_v = 2; - int order_e = 1; - int ode_solver_type = 4; - double t_final = 0.5; - double cfl = 0.5; - double cg_tol = 1e-8; - int cg_max_iter = 300; - int max_tsteps = -1; - bool p_assembly = true; - bool visualization = false; - int vis_steps = 5; - bool visit = false; - bool gfprint = false; - const bool cuda = true; - bool uvm = false; - bool aware = false; - bool share = false; - bool hcpo = false; // do Host Conforming Prolongation Operation - bool sync = false; - - const char *basename = "results/Laghos"; - OptionsParser args(argc, argv); - // Standard Options ********************************************************* - args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); - args.AddOption(&rs_levels, "-rs", "--refine-serial", - "Number of times to refine the mesh uniformly in serial."); - args.AddOption(&rp_levels, "-rp", "--refine-parallel", - "Number of times to refine the mesh uniformly in parallel."); - args.AddOption(&problem, "-p", "--problem", "Problem setup to use."); - args.AddOption(&order_v, "-ok", "--order-kinematic", - "Order (degree) of the kinematic finite element space."); - args.AddOption(&order_e, "-ot", "--order-thermo", - "Order (degree) of the thermodynamic finite element space."); - args.AddOption(&ode_solver_type, "-s", "--ode-solver", - "ODE solver: 1 - Forward Euler,\n\t" - " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6."); - args.AddOption(&t_final, "-tf", "--t-final", - "Final time; start time is 0."); - args.AddOption(&cfl, "-cfl", "--cfl", "CFL-condition number."); - args.AddOption(&cg_tol, "-cgt", "--cg-tol", - "Relative CG tolerance (velocity linear solve)."); - args.AddOption(&cg_max_iter, "-cgm", "--cg-max-steps", - "Maximum number of CG iterations 
(velocity linear solve)."); - args.AddOption(&max_tsteps, "-ms", "--max-steps", - "Maximum number of steps (negative means no restriction)."); - args.AddOption(&p_assembly, "-pa", "--partial-assembly", "-fa", - "--full-assembly", - "Activate 1D tensor-based assembly (partial assembly)."); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or disable GLVis visualization."); - args.AddOption(&vis_steps, "-vs", "--visualization-steps", - "Visualize every n-th timestep."); - args.AddOption(&visit, "-visit", "--visit", "-no-visit", "--no-visit", - "Enable or disable VisIt visualization."); - args.AddOption(&gfprint, "-print", "--print", "-no-print", "--no-print", - "Enable or disable result output (files in mfem format)."); - args.AddOption(&basename, "-k", "--outputfilename", - "Name of the visit dump files"); - // CUDA Options ************************************************************* - args.AddOption(&uvm, "-uvm", "--uvm", "-no-uvm", "--no-uvm", - "Enable or disable Unified Memory."); - args.AddOption(&aware, "-aware", "--aware", "-no-aware", "--no-aware", - "Enable or disable MPI CUDA Aware (GPUDirect)."); - args.AddOption(&hcpo, "-hcpo", "--hcpo", "-not-hcpo", "--no-hcpo", - "Enable or disable Host Conforming Prolongation Operations,\n" - "\twhich transfers ALL the data to the host before communications."); - args.AddOption(&sync, "-sync", "--sync", "-no-sync", "--no-sync", - "Enable or disable Enforced Kernel Synchronization."); - // Not usable Options ******************************************************* - args.AddOption(&share, "-share", "--share", "-no-share", "--no-share", - "Enable or disable SHARE kernels (WIP, not usable)."); - args.Parse(); - if (!args.Good()) - { - if (mpi.Root()) { args.PrintUsage(cout); } - return 1; - } - if (mpi.Root()) { args.PrintOptions(cout); } - - // CUDA set device & options - // ************************************************************************** - 
rconfig::Get().Setup(mpi.WorldRank(),mpi.WorldSize(), - cuda,uvm,aware,share,hcpo,sync,rs_levels); - - // Read the serial mesh from the given mesh file on all processors. - // Refine the mesh in serial to increase the resolution. - Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } - - if (p_assembly && dim == 1) - { - p_assembly = false; - if (mpi.Root()) - { - cout << "Laghos does not support PA in 1D. Switching to FA." << endl; - } - } - - // Parallel partitioning of the mesh. - // ************************************************************************** - ParMesh *pmesh = NULL; - const int num_tasks = mpi.WorldSize(); - const int partitions = floor(pow(num_tasks, 1.0 / dim) + 1e-2); - int *nxyz = new int[dim]; - int product = 1; - for (int d = 0; d < dim; d++) - { - nxyz[d] = partitions; - product *= partitions; - } - if (product == num_tasks) - { - if (myid == 0) - { - printf("\033[32m[laghos] \033[32;1mCartesian\033[m\033[32m partitioning will be used\033[m\n"); - } - int *partitioning = mesh->CartesianPartitioning(nxyz); - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); - delete[] partitioning; - } - else - { - if (myid == 0) - { - printf("\033[32m[laghos] Non-Cartesian partitioning through METIS will be used\033[m\n"); -#ifndef MFEM_USE_METIS - cout << "MFEM was built without METIS. " - << "Adjust the number of tasks to use a Cartesian split." 
<< endl; -#endif - } -#ifndef MFEM_USE_METIS - return 1; -#endif - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh); - } - delete [] nxyz; - delete mesh; - - // ************************************************************************** - // We need at least some elements in each partition for now -#ifdef MFEM_USE_MPI - int global_pmesh_NE; - const int pmesh_NE=pmesh->GetNE(); - MPI_Allreduce(&pmesh_NE,&global_pmesh_NE,1,MPI_INT,MPI_MIN,pmesh->GetComm()); - if (global_pmesh_NE==0) { return printf("[Laghos] ERROR: pmesh->GetNE()==0!"); } - else { printf("\033[32m[laghos] pmesh->GetNE()=%d\033[m\n",global_pmesh_NE); } - assert(pmesh->GetNE()>0); -#endif - - // Refine the mesh further in parallel to increase the resolution. - for (int lev = 0; lev < rp_levels; lev++) { pmesh->UniformRefinement(); } - - // Define the parallel finite element spaces. We use: - // - H1 (Gauss-Lobatto, continuous) for position and velocity. - // - L2 (Bernstein, discontinuous) for specific internal energy. - L2_FECollection L2FEC(order_e, dim, BasisType::Positive); - H1_FECollection H1FEC(order_v, dim); - CudaFiniteElementSpace L2FESpace(pmesh, &L2FEC); - CudaFiniteElementSpace H1FESpace(pmesh, &H1FEC, pmesh->Dimension()); - - // Boundary conditions: all tests use v.n = 0 on the boundary, - // and we assume that the boundaries are straight. - Array essential_tdofs; - { - Array ess_bdr(pmesh->bdr_attributes.Max()), tdofs1d; - for (int d = 0; d < pmesh->Dimension(); d++) - { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[d] = 1; - H1FESpace.GetEssentialTrueDofs(ess_bdr, tdofs1d, d); - essential_tdofs.Append(tdofs1d); - } - } - - // Define the explicit ODE solver used for time integration. 
- CudaODESolver *ode_solver = NULL; - switch (ode_solver_type) - { - case 1: ode_solver = new CudaForwardEulerSolver; break; - case 2: ode_solver = new CudaRK2Solver(0.5); break; - case 3: ode_solver = new CudaRK3SSPSolver; break; - case 4: ode_solver = new CudaRK4Solver; break; - case 6: ode_solver = new CudaRK6Solver; break; - default: - if (myid == 0) - { - cout << "Unknown ODE solver type: " << ode_solver_type << '\n'; - } - delete pmesh; - MPI_Finalize(); - return 3; - } - - HYPRE_Int glob_size_l2 = L2FESpace.GlobalTrueVSize(); - HYPRE_Int glob_size_h1 = H1FESpace.GlobalTrueVSize(); - - if (mpi.Root()) - { - cout << "Number of kinematic (position, velocity) dofs: " - << glob_size_h1 << endl; - cout << "Number of specific internal energy dofs: " - << glob_size_l2 << endl<< endl; - } - - int Vsize_l2 = L2FESpace.GetVSize(); - int Vsize_h1 = H1FESpace.GetVSize(); - - // The monolithic BlockVector stores unknown fields as: - // - 0 -> position - // - 1 -> velocity - // - 2 -> specific internal energy - Array true_offset(4); - true_offset[0] = 0; - true_offset[1] = true_offset[0] + Vsize_h1; - true_offset[2] = true_offset[1] + Vsize_h1; - true_offset[3] = true_offset[2] + Vsize_l2; - CudaVector S(true_offset[3]); - - // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always - // compute the density values given the current mesh position, using the - // property of pointwise mass conservation. - ParGridFunction x_gf(&H1FESpace); - ParGridFunction v_gf(&H1FESpace); - ParGridFunction e_gf(&L2FESpace); - - CudaGridFunction d_x_gf(H1FESpace, S.GetRange(true_offset[0], true_offset[1])); - CudaGridFunction d_v_gf(H1FESpace, S.GetRange(true_offset[1], true_offset[2])); - CudaGridFunction d_e_gf(L2FESpace, S.GetRange(true_offset[2], true_offset[3])); - - // Initialize x_gf using the starting mesh coordinates. This also links the - // mesh positions to the values in x_gf. 
- pmesh->SetNodalGridFunction(&x_gf); - d_x_gf = x_gf; - - // Initialize the velocity. - VectorFunctionCoefficient v_coeff(pmesh->Dimension(), v0); - v_gf.ProjectCoefficient(v_coeff); - d_v_gf = v_gf; - - // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. Then we do an - // L2 projection to the positive basis in which we actually compute. The goal - // is to get a high-order representation of the initial condition. Note that - // this density is a temporary function and it will not be updated during the - // time evolution. - ParGridFunction rho(&L2FESpace); - FunctionCoefficient rho_coeff(hydrodynamics::rho0); - L2_FECollection l2_fec(order_e, pmesh->Dimension()); - CudaFiniteElementSpace l2_fes(pmesh, &l2_fec); - ParGridFunction l2_rho(&l2_fes), l2_e(&l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); - CudaGridFunction d_rho(L2FESpace); - d_rho = rho; - if (problem == 1) - { - // For the Sedov test, we use a delta function at the origin. - DeltaCoefficient e_coeff(0, 0, 0.25); - l2_e.ProjectCoefficient(e_coeff); - } - else - { - FunctionCoefficient e_coeff(e0); - l2_e.ProjectCoefficient(e_coeff); - } - e_gf.ProjectGridFunction(l2_e); - d_e_gf = e_gf; - - Coefficient *material_pcf = new FunctionCoefficient(hydrodynamics::gamma); - - // Additional details, depending on the problem. 
- int source = 0; bool visc=false; - switch (problem) - { - case 0: if (pmesh->Dimension() == 2) { source = 1; } - visc = false; break; - case 1: visc = true; break; - case 2: visc = true; break; - case 3: visc = true; break; - default: MFEM_ABORT("Wrong problem specification!"); - } - - LagrangianHydroOperator oper(S.Size(), H1FESpace, L2FESpace, - essential_tdofs, d_rho, source, cfl, material_pcf, - visc, p_assembly, cg_tol, cg_max_iter); - - socketstream vis_rho, vis_v, vis_e; - char vishost[] = "localhost"; - int visport = 19916; - - ParGridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - if (visualization) - { - // Make sure all MPI ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - vis_rho.precision(8); - vis_v.precision(8); - vis_e.precision(8); - - int Wx = 0, Wy = 0; // window position - const int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets - - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww, Wh); - } - - // Save data for VisIt visualization - VisItDataCollection visit_dc(basename, pmesh); - if (visit) - { - visit_dc.RegisterField("Density", &rho_gf); - visit_dc.RegisterField("Velocity", &v_gf); - visit_dc.RegisterField("Specific Internal Energy", &e_gf); - visit_dc.SetCycle(0); - visit_dc.SetTime(0.0); - visit_dc.Save(); - } - - // Perform time-integration (looping over the time iterations, ti, with a - // time-step dt). The object oper is of type LagrangianHydroOperator that - // defines the Mult() method that used by the time integrators. 
- ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; - bool last_step = false; - int steps = 0; - CudaVector S_old(S); - - for (int ti = 1; !last_step; ti++) - { - if (t + dt >= t_final) - { - dt = t_final - t; - last_step = true; - } - if (steps == max_tsteps) { last_step = true; } - - S_old = S; - t_old = t; - oper.ResetTimeStepEstimate(); - - // S is the vector of dofs, t is the current time, - // and dt is the time step to advance. - ode_solver->Step(S, t, dt); - steps++; - - // Make sure that the mesh corresponds to the new solution state. - x_gf = d_x_gf; - pmesh->NewNodes(x_gf, false); - - // Adaptive time step control. - const double dt_est = oper.GetTimeStepEstimate(S); - if (dt_est < dt) - { - // Repeat (solve again) with a decreased time step - decrease of the - // time estimate suggests appearance of oscillations. - dt *= 0.85; - if (dt < numeric_limits::epsilon()) - { MFEM_ABORT("The time step crashed!"); } - t = t_old; - S = S_old; - oper.ResetQuadratureData(); - if (mpi.Root()) { cout << "Repeating step " << ti << endl; } - ti--; continue; - } - else if (dt_est > 1.25 * dt) { dt *= 1.02; } - - - if (last_step || (ti % vis_steps) == 0) - { - double loc_norm = d_e_gf * d_e_gf, tot_norm; - MPI_Allreduce(&loc_norm, &tot_norm, 1, MPI_DOUBLE, MPI_SUM, - pmesh->GetComm()); - if (mpi.Root()) - { - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(tot_norm) << endl; - } - - // Make sure all ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - if (visualization || visit || gfprint) { oper.ComputeDensity(rho_gf); } - if (visualization) - { - int Wx = 0, Wy = 0; // window position - int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets 
- - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww,Wh); - Wx += offx; - } - - if (visit) - { - visit_dc.SetCycle(ti); - visit_dc.SetTime(t); - visit_dc.Save(); - } - - if (gfprint) - { - ostringstream mesh_name, rho_name, v_name, e_name; - mesh_name << basename << "_" << ti - << "_mesh." << setfill('0') << setw(6) << myid; - rho_name << basename << "_" << ti - << "_rho." << setfill('0') << setw(6) << myid; - v_name << basename << "_" << ti - << "_v." << setfill('0') << setw(6) << myid; - e_name << basename << "_" << ti - << "_e." << setfill('0') << setw(6) << myid; - - ofstream mesh_ofs(mesh_name.str().c_str()); - mesh_ofs.precision(8); - pmesh->Print(mesh_ofs); - mesh_ofs.close(); - - ofstream rho_ofs(rho_name.str().c_str()); - rho_ofs.precision(8); - rho_gf.Save(rho_ofs); - rho_ofs.close(); - - ofstream v_ofs(v_name.str().c_str()); - v_ofs.precision(8); - v_gf.Save(v_ofs); - v_ofs.close(); - - ofstream e_ofs(e_name.str().c_str()); - e_ofs.precision(8); - e_gf.Save(e_ofs); - e_ofs.close(); - } - } - } - - switch (ode_solver_type) - { - case 2: steps *= 2; break; - case 3: steps *= 3; break; - case 4: steps *= 4; break; - case 6: steps *= 6; - } - oper.PrintTimingData(mpi.Root(), steps); - - if (visualization) - { - vis_v.close(); - vis_e.close(); - } - - // Free the used memory. 
- delete ode_solver; - delete pmesh; - delete material_pcf; - return 0; -} - -namespace mfem -{ - -namespace hydrodynamics -{ - -double rho0(const Vector &x) -{ - switch (problem) - { - case 0: return 1.0; - case 1: return 1.0; - case 2: if (x(0) < 0.5) { return 1.0; } - else { return 0.1; } - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.0; } - else { return 0.125; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -double gamma(const Vector &x) -{ - switch (problem) - { - case 0: return 5./3.; - case 1: return 1.4; - case 2: return 1.4; - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.4; } - else { return 1.5; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -void v0(const Vector &x, Vector &v) -{ - switch (problem) - { - case 0: - v(0) = sin(M_PI*x(0)) * cos(M_PI*x(1)); - v(1) = -cos(M_PI*x(0)) * sin(M_PI*x(1)); - if (x.Size() == 3) - { - v(0) *= cos(M_PI*x(2)); - v(1) *= cos(M_PI*x(2)); - v(2) = 0.0; - } - break; - case 1: v = 0.0; break; - case 2: v = 0.0; break; - case 3: v = 0.0; break; - default: MFEM_ABORT("Bad number given for problem id!"); - } -} - -double e0(const Vector &x) -{ - switch (problem) - { - case 0: - { - const double denom = 2.0 / 3.0; // (5/3 - 1) * density. - double val; - if (x.Size() == 2) - { - val = 1.0 + (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) / 4.0; - } - else - { - val = 100.0 + ((cos(2*M_PI*x(2)) + 2) * - (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) - 2) / 16.0; - } - return val/denom; - } - case 1: return 0.0; // This case in initialized in main(). 
- case 2: if (x(0) < 0.5) { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - else { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - case 3: if (x(0) > 1.0) { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - else { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -} // namespace hydrodynamics - -} // namespace mfem - -void display_banner(ostream & os) -{ - os << endl - << " __ __ " << endl - << " / / ____ ____ / /_ ____ _____ " << endl - << " / / / __ `/ __ `/ __ \\/ __ \\/ ___/ " << endl - << " / /___/ /_/ / /_/ / / / / /_/ (__ ) " << endl - << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl - << " /____/ " << endl << endl; -} diff --git a/cuda/laghos_assembly.cpp b/cuda/laghos_assembly.cpp deleted file mode 100644 index 1b7520c3..00000000 --- a/cuda/laghos_assembly.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project (17-SC-20-SC) -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -QuadratureData::QuadratureData(int dim, - int nzones, - int nqp) -{ Setup(dim, nzones, nqp); } - - -void QuadratureData::Setup(int dim, - int nzones, - int nqp) -{ - rho0DetJ0w.SetSize(nqp * nzones); - stressJinvT.SetSize(dim * dim * nqp * nzones); - dtEst.SetSize(nqp * nzones); -} - -void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect) -{ - const int ip_cnt = integ_rule.GetNPoints(); - Vector shape(fe.GetDof()); - Vector rho0DetJ0w = quad_data.rho0DetJ0w; - elvect.SetSize(fe.GetDof()); - elvect = 0.0; - for (int q = 0; q < ip_cnt; q++) - { - fe.CalcShape(integ_rule.IntPoint(q), shape); - shape *= rho0DetJ0w(Tr.ElementNo*ip_cnt + q); - elvect += shape; - } -} - -// ***************************************************************************** -CudaMassOperator::CudaMassOperator(CudaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : CudaOperator(fes_.GetTrueVSize()), - fes(fes_), - integ_rule(integ_rule_), - ess_tdofs_count(0), - bilinearForm(&fes), - quad_data(quad_data_), - x_gf(fes), - y_gf(fes) {} - -// ***************************************************************************** -CudaMassOperator::~CudaMassOperator() -{ -} - -// ***************************************************************************** -void CudaMassOperator::Setup() -{ - dim=fes.GetMesh()->Dimension(); - nzones=fes.GetMesh()->GetNE(); - CudaMassIntegrator &massInteg = *(new CudaMassIntegrator()); - massInteg.SetIntegrationRule(integ_rule); - massInteg.SetOperator(quad_data->rho0DetJ0w); - bilinearForm.AddDomainIntegrator(&massInteg); - bilinearForm.Assemble(); - bilinearForm.FormOperator(Array(), massOperator); -} - -// ************************************************************************* -void CudaMassOperator::SetEssentialTrueDofs(Array 
&dofs) -{ - ess_tdofs_count = dofs.Size(); - if (ess_tdofs.Size()==0) - { -#ifdef MFEM_USE_MPI - int global_ess_tdofs_count; - const MPI_Comm comm = fes.GetParMesh()->GetComm(); - MPI_Allreduce(&ess_tdofs_count,&global_ess_tdofs_count, - 1, MPI_INT, MPI_SUM, comm); - assert(global_ess_tdofs_count>0); - ess_tdofs.allocate(global_ess_tdofs_count); -#else - assert(ess_tdofs_count>0); - ess_tdofs.allocate(ess_tdofs_count); -#endif - } - else { assert(ess_tdofs_count<=ess_tdofs.Size()); } - assert(ess_tdofs.ptr()); - if (ess_tdofs_count == 0) { return; } - assert(ess_tdofs_count>0); - assert(dofs.GetData()); - rHtoD(ess_tdofs.ptr(),dofs.GetData(),ess_tdofs_count*sizeof(int)); -} - -// ***************************************************************************** -void CudaMassOperator::EliminateRHS(CudaVector &b) -{ - if (ess_tdofs_count > 0) - { - b.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - -// ************************************************************************* -void CudaMassOperator::Mult(const CudaVector &x, CudaVector &y) const -{ - distX = x; - if (ess_tdofs_count) - { - distX.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } - massOperator->Mult(distX, y); - if (ess_tdofs_count) - { - y.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - - -// ***************************************************************************** -// * CudaForceOperator -// ***************************************************************************** -CudaForceOperator::CudaForceOperator(CudaFiniteElementSpace &h1fes_, - CudaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule_, - const QuadratureData *quad_data_) - : CudaOperator(l2fes_.GetTrueVSize(), h1fes_.GetTrueVSize()), - dim(h1fes_.GetMesh()->Dimension()), - nzones(h1fes_.GetMesh()->GetNE()), - h1fes(h1fes_), - l2fes(l2fes_), - integ_rule(integ_rule_), - quad_data(quad_data_), - gVecL2(l2fes.GetLocalDofs() * nzones), - gVecH1(h1fes.GetVDim() * h1fes.GetLocalDofs() * nzones) { } - -// 
***************************************************************************** -CudaForceOperator::~CudaForceOperator() {} - -// ************************************************************************* -void CudaForceOperator::Setup() -{ - h1D2Q = CudaDofQuadMaps::Get(h1fes, integ_rule); - l2D2Q = CudaDofQuadMaps::Get(l2fes, integ_rule); -} - -// ************************************************************************* -void CudaForceOperator::Mult(const CudaVector &vecL2, - CudaVector &vecH1) const -{ - l2fes.GlobalToLocal(vecL2, gVecL2); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - if (rconfig::Get().Share()) - rForceMultS(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->dofToQuad, - h1D2Q->quadToDof, - h1D2Q->quadToDofD, - quad_data->stressJinvT, - gVecL2, - gVecH1); - else - rForceMult(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->dofToQuad, - h1D2Q->quadToDof, - h1D2Q->quadToDofD, - quad_data->stressJinvT, - gVecL2, - gVecH1); - h1fes.LocalToGlobal(gVecH1, vecH1); -} - -// ************************************************************************* -void CudaForceOperator::MultTranspose(const CudaVector &vecH1, - CudaVector &vecL2) const -{ - h1fes.GlobalToLocal(vecH1, gVecH1); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - if (rconfig::Get().Share()) - rForceMultTransposeS(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->quadToDof, - h1D2Q->dofToQuad, - 
h1D2Q->dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - else - rForceMultTranspose(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->quadToDof, - h1D2Q->dofToQuad, - h1D2Q->dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - l2fes.LocalToGlobal(gVecL2, vecL2); -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/cuda/laghos_assembly.hpp b/cuda/laghos_assembly.hpp deleted file mode 100644 index 45afca47..00000000 --- a/cuda/laghos_assembly.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_ASSEMBLY -#define MFEM_LAGHOS_ASSEMBLY - -#include "mfem.hpp" -#include "cuda/cuda.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -// Container for all data needed at quadrature points. -struct QuadratureData -{ - // TODO: use QuadratureFunctions? - - // Reference to physical Jacobian for the initial mesh. 
These are computed - // only at time zero and stored here. - CudaVector Jac0inv; - - // Quadrature data used for full/partial assembly of the force operator. At - // each quadrature point, it combines the stress, inverse Jacobian, - // determinant of the Jacobian and the integration weight. It must be - // recomputed in every time step. - CudaVector stressJinvT; - CudaDofQuadMaps *dqMaps; - CudaGeometry *geom; - - // Quadrature data used for full/partial assembly of the mass matrices. At - // time zero, we compute and store (rho0 * det(J0) * qp_weight) at each - // quadrature point. Note the at any other time, we can compute - // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass - // conservation. - CudaVector rho0DetJ0w; - - - // Initial length scale. This represents a notion of local mesh size. We - // assume that all initial zones have similar size. - double h0; - - // Estimate of the minimum time step over all quadrature points. This is - // recomputed at every time step to achieve adaptive time stepping. - double dt_est; - CudaVector dtEst; - - QuadratureData(int dim, int nzones, int quads_per_zone); - - void Setup(int dim, int nzones, int quads_per_zone); -}; - -// This class is used only for visualization. It assembles (rho, phi) in each -// zone, which is used by LagrangianHydroOperator::ComputeDensity to do an L2 -// projection of the density. 
-class DensityIntegrator : public LinearFormIntegrator -{ -private: - const QuadratureData &quad_data; - const IntegrationRule &integ_rule; -public: - DensityIntegrator(const QuadratureData &qd, - const IntegrationRule &ir) : quad_data(qd), - integ_rule(ir) {} - - void AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect); - - void AssembleRHSElementVect(const FiniteElement &el, - FaceElementTransformations &Tr, - Vector &elvect) {assert(false);} - -}; - -// ***************************************************************************** -// * CudaMassOperator -// ***************************************************************************** -class CudaMassOperator : public CudaOperator -{ -private: - int dim; - int nzones; - CudaFiniteElementSpace &fes; - const IntegrationRule &integ_rule; - unsigned int ess_tdofs_count; - CudaArray ess_tdofs; - CudaBilinearForm bilinearForm; - CudaOperator *massOperator; - QuadratureData *quad_data; - // For distributing X - mutable CudaVector distX; - mutable CudaGridFunction x_gf, y_gf; -public: - CudaMassOperator(CudaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_); - ~CudaMassOperator(); - void Setup(); - void SetEssentialTrueDofs(Array &dofs); - // Can be used for both velocity and specific internal energy. For the case - // of velocity, we only work with one component at a time. - void Mult(const CudaVector &x, CudaVector &y) const; - void EliminateRHS(CudaVector &b); - void ComputeDiagonal2D(Vector &diag) const; - void ComputeDiagonal3D(Vector &diag) const; -}; - -// Performs partial assembly, which corresponds to (and replaces) the use of the -// LagrangianHydroOperator::Force global matrix. 
-class CudaForceOperator : public CudaOperator -{ -private: - const int dim; - const int nzones; - const CudaFiniteElementSpace &h1fes, &l2fes; - const IntegrationRule &integ_rule; - const QuadratureData *quad_data; - const CudaDofQuadMaps *l2D2Q, *h1D2Q; - mutable CudaVector gVecL2, gVecH1; -public: - CudaForceOperator(CudaFiniteElementSpace &h1fes_, - CudaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule, - const QuadratureData *quad_data_); - void Setup(); - void Mult(const CudaVector &vecL2, CudaVector &vecH1) const; - void MultTranspose(const CudaVector &vecH1, CudaVector &vecL2) const; - ~CudaForceOperator(); -}; - -// Scales by the inverse diagonal of the MassPAOperator. -class DiagonalSolver : public Solver -{ -private: - Vector diag; - FiniteElementSpace &FESpace; -public: - DiagonalSolver(FiniteElementSpace &fes): Solver(fes.GetVSize()), - diag(), - FESpace(fes) { } - - void SetDiagonal(Vector &d) - { - const Operator *P = FESpace.GetProlongationMatrix(); - diag.SetSize(P->Width()); - P->MultTranspose(d, diag); - } - - virtual void Mult(const Vector &x, Vector &y) const - { - for (int i = 0; i < x.Size(); i++) { y(i) = x(i) / diag(i); } - } - virtual void SetOperator(const Operator &op) { } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS_ASSEMBLY diff --git a/cuda/laghos_solver.cpp b/cuda/laghos_solver.cpp deleted file mode 100644 index 360d3dff..00000000 --- a/cuda/laghos_solver.cpp +++ /dev/null @@ -1,453 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_solver.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x, int y, int w, int h, bool vec) -{ - ParMesh &pmesh = *gf.ParFESpace()->GetParMesh(); - MPI_Comm comm = pmesh.GetComm(); - - int num_procs, myid; - MPI_Comm_size(comm, &num_procs); - MPI_Comm_rank(comm, &myid); - - bool newly_opened = false; - int connection_failed; - - do - { - if (myid == 0) - { - if (!sock.is_open() || !sock) - { - sock.open(vishost, visport); - sock.precision(8); - newly_opened = true; - } - sock << "solution\n"; - } - - pmesh.PrintAsOne(sock); - gf.SaveAsOne(sock); - - if (myid == 0 && newly_opened) - { - sock << "window_title '" << title << "'\n" - << "window_geometry " - << x << " " << y << " " << w << " " << h << "\n" - << "keys maaAcl"; - if ( vec ) { sock << "vvv"; } - sock << endl; - } - - if (myid == 0) - { - connection_failed = !sock && !newly_opened; - } - MPI_Bcast(&connection_failed, 1, MPI_INT, 0, comm); - } - while (connection_failed); -} - -// *************************************************************************** -// * LagrangianHydroOperator -// *************************************************************************** -LagrangianHydroOperator::LagrangianHydroOperator(int size, - CudaFiniteElementSpace &h1_fes, - 
CudaFiniteElementSpace &l2_fes, - Array &essential_tdofs, - CudaGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, - bool visc, bool pa, - double cgt, int cgiter) - : CudaTimeDependentOperator(size), - H1FESpace(h1_fes), L2FESpace(l2_fes), - H1compFESpace(h1_fes.GetParMesh(), h1_fes.FEColl(),1), - ess_tdofs(essential_tdofs), - dim(h1_fes.GetMesh()->Dimension()), - nzones(h1_fes.GetMesh()->GetNE()), - l2dofs_cnt(l2_fes.GetFE(0)->GetDof()), - h1dofs_cnt(h1_fes.GetFE(0)->GetDof()), - source_type(source_type_), cfl(cfl_), - use_viscosity(visc), p_assembly(pa), cg_rel_tol(cgt), cg_max_iter(cgiter), - material_pcf(material_), - integ_rule(IntRules.Get(h1_fes.GetMesh()->GetElementBaseGeometry(0), - 3*h1_fes.GetOrder(0) + l2_fes.GetOrder(0) - 1)), - quad_data(dim, nzones, integ_rule.GetNPoints()), - quad_data_is_current(false), - VMassPA(H1compFESpace, integ_rule, &quad_data), - EMassPA(L2FESpace, integ_rule, &quad_data), - VMassPA_prec(H1FESpace), - ForcePA(H1FESpace, L2FESpace, integ_rule, &quad_data), - CG_VMass(H1FESpace.GetParMesh()->GetComm()), - CG_EMass(L2FESpace.GetParMesh()->GetComm()), - timer(), - v(),e(), - rhs(H1FESpace.GetVSize()), - B(H1compFESpace.GetTrueVSize()),X(H1compFESpace.GetTrueVSize()), - one(L2FESpace.GetVSize(),1.0), - e_rhs(L2FESpace.GetVSize()), - rhs_c(H1compFESpace.GetVSize()), - v_local(H1FESpace.GetVDim() * H1FESpace.GetLocalDofs()*nzones), - e_quad() -{ - // Initial local mesh size (assumes similar cells). 
- double loc_area = 0.0, glob_area; - int loc_z_cnt = nzones, glob_z_cnt; - ParMesh *pm = H1FESpace.GetParMesh(); - for (int i = 0; i < nzones; i++) { loc_area += pm->GetElementVolume(i); } - MPI_Allreduce(&loc_area, &glob_area, 1, MPI_DOUBLE, MPI_SUM, pm->GetComm()); - MPI_Allreduce(&loc_z_cnt, &glob_z_cnt, 1, MPI_INT, MPI_SUM, pm->GetComm()); - switch (pm->GetElementBaseGeometry(0)) - { - case Geometry::SEGMENT: - quad_data.h0 = glob_area / glob_z_cnt; break; - case Geometry::SQUARE: - quad_data.h0 = sqrt(glob_area / glob_z_cnt); break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * glob_area / glob_z_cnt); break; - case Geometry::CUBE: - quad_data.h0 = pow(glob_area / glob_z_cnt, 1.0/3.0); break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * glob_area / glob_z_cnt, 1.0/3.0); break; - default: MFEM_ABORT("Unknown zone type!"); - } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); - - quad_data.dqMaps = CudaDofQuadMaps::Get(H1FESpace,integ_rule); - quad_data.geom = CudaGeometry::Get(H1FESpace,integ_rule); - quad_data.Jac0inv = quad_data.geom->invJ; - - CudaVector rhoValues; // used in rInitQuadratureData - rho0.ToQuad(integ_rule, rhoValues); - - if (dim==1) { assert(false); } - const int NUM_QUAD = integ_rule.GetNPoints(); - - rInitQuadratureData(NUM_QUAD, - nzones, - rhoValues, - quad_data.geom->detJ, - quad_data.dqMaps->quadWeights, - quad_data.rho0DetJ0w); - - // Needs quad_data.rho0DetJ0w - ForcePA.Setup(); - VMassPA.Setup(); - EMassPA.Setup(); - - { - // Setup the preconditioner of the velocity mass operator. - //Vector d; - //#warning ComputeDiagonal - //(dim == 2) ? 
VMassPA.ComputeDiagonal2D(d) : VMassPA.ComputeDiagonal3D(d); - //VMassPA_prec.SetDiagonal(d); - } - - CG_VMass.SetOperator(VMassPA); - CG_VMass.SetRelTol(cg_rel_tol); - CG_VMass.SetAbsTol(0.0); - CG_VMass.SetMaxIter(cg_max_iter); - CG_VMass.SetPrintLevel(-1); - - CG_EMass.SetOperator(EMassPA); - CG_EMass.iterative_mode = false; - CG_EMass.SetRelTol(1e-8); - CG_EMass.SetAbsTol(1e-8 * numeric_limits::epsilon()); - CG_EMass.SetMaxIter(200); - CG_EMass.SetPrintLevel(-1); -} - -// ***************************************************************************** -LagrangianHydroOperator::~LagrangianHydroOperator() {} - -// ***************************************************************************** -void LagrangianHydroOperator::Mult(const CudaVector &S, CudaVector &dS_dt) const -{ - dS_dt = 0.0; - - // Make sure that the mesh positions correspond to the ones in S. This is - // needed only because some mfem time integrators don't update the solution - // vector at every intermediate stage (hence they don't change the mesh). - Vector h_x = CudaVector(S.GetRange(0, H1FESpace.GetVSize())); - ParGridFunction x(&H1FESpace, h_x.GetData()); - H1FESpace.GetParMesh()->NewNodes(x, false); - - UpdateQuadratureData(S); - - // The monolithic BlockVector stores the unknown fields as follows: - // - Position - // - Velocity - // - Specific Internal Energy - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - - v = S.GetRange(VsizeH1, VsizeH1); - e = S.GetRange(2*VsizeH1, VsizeL2); - - CudaVector dx = dS_dt.GetRange(0, VsizeH1); - CudaVector dv = dS_dt.GetRange(VsizeH1, VsizeH1); - CudaVector de = dS_dt.GetRange(2*VsizeH1, VsizeL2); - - // Set dx_dt = v (explicit) - dx = v; - - // Solve for velocity. - timer.sw_force.Start(); - ForcePA.Mult(one, rhs); - timer.sw_force.Stop(); - rhs.Neg(); - - // Partial assembly solve for each velocity component. 
- const int size = H1compFESpace.GetVSize(); - - for (int c = 0; c < dim; c++) - { - rhs_c = rhs.GetRange(c*size, size); - CudaVector dv_c = dv.GetRange(c*size, size); - Array c_tdofs; - Array ess_bdr(H1FESpace.GetMesh()->bdr_attributes.Max()); - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., - // we must enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[c] = 1; - // Essential true dofs as if there's only one component. - H1compFESpace.GetEssentialTrueDofs(ess_bdr, c_tdofs); - - dv_c = 0.0; - - H1compFESpace.GetProlongationOperator()->MultTranspose(rhs_c, B); - H1compFESpace.GetRestrictionOperator()->Mult(dv_c, X); - - VMassPA.SetEssentialTrueDofs(c_tdofs); - VMassPA.EliminateRHS(B); - - timer.sw_cgH1.Start(); - CG_VMass.Mult(B, X); - timer.sw_cgH1.Stop(); - timer.H1cg_iter += CG_VMass.GetNumIterations(); - //printf("\n[H1cg_iter] %d",timer.H1cg_iter); - H1compFESpace.GetProlongationOperator()->Mult(X, dv_c); - } - - - // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; - if (source_type == 1) // 2D Taylor-Green. 
- { - e_source = new LinearForm(&L2FESpace); - assert(L2FESpace.FEColl()); - TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); - e_source->AddDomainIntegrator(d); - e_source->Assemble(); - } - Array l2dofs; - { - timer.sw_force.Start(); - ForcePA.MultTranspose(v, e_rhs); - timer.sw_force.Stop(); - } - - if (e_source) { e_rhs += *e_source; } - - { - timer.sw_cgL2.Start(); - CG_EMass.Mult(e_rhs, de); - timer.sw_cgL2.Stop(); - timer.L2cg_iter += CG_EMass.GetNumIterations(); - } - delete e_source; - quad_data_is_current = false; -} - -double LagrangianHydroOperator::GetTimeStepEstimate(const CudaVector &S) const -{ - UpdateQuadratureData(S); - double glob_dt_est; - MPI_Allreduce(&quad_data.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, - H1FESpace.GetParMesh()->GetComm()); - return glob_dt_est; -} - -void LagrangianHydroOperator::ResetTimeStepEstimate() const -{ - quad_data.dt_est = numeric_limits::infinity(); -} - -void LagrangianHydroOperator::ComputeDensity(ParGridFunction &rho) -{ - rho.SetSpace(&L2FESpace); - DenseMatrix Mrho(l2dofs_cnt); - Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); - Array dofs(l2dofs_cnt); - DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data,integ_rule); - for (int i = 0; i < nzones; i++) - { - di.AssembleRHSElementVect(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), Mrho); - inv.Factor(); - inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(i, dofs); - rho.SetSubVector(dofs, rho_z); - } -} - -void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps) -{ - double my_rt[5], rt_max[5]; - my_rt[0] = timer.sw_cgH1.RealTime(); - my_rt[1] = timer.sw_cgL2.RealTime(); - my_rt[2] = timer.sw_force.RealTime(); - my_rt[3] = timer.sw_qdata.RealTime(); - my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3]; - MPI_Reduce(my_rt, rt_max, 5, MPI_DOUBLE, MPI_MAX, 0, 
H1FESpace.GetComm()); - - HYPRE_Int mydata[2], alldata[2]; - mydata[0] = timer.L2cg_iter; - mydata[1] = timer.quad_tstep; - MPI_Reduce(mydata, alldata, 2, HYPRE_MPI_INT, MPI_SUM, 0, H1FESpace.GetComm()); - - if (IamRoot) - { - const HYPRE_Int H1gsize = H1FESpace.GlobalTrueVSize(), - L2gsize = L2FESpace.GlobalTrueVSize(); - using namespace std; - cout << endl; - cout << "CG (H1) total time: " << rt_max[0] << endl; - cout << "CG (H1) rate (megadofs="<GetOrder()+1; - - ElementTransformation *T = H1FESpace.GetElementTransformation(0); - const IntegrationPoint &ip = integ_rule.IntPoint(0); - const double gamma = material_pcf->Eval(*T, ip); - if (rconfig::Get().Share()) - rUpdateQuadratureDataS(gamma, - quad_data.h0, - cfl, - use_viscosity, - dim, - NUM_QUAD, - NUM_QUAD_1D, - NUM_DOFS_1D, - nzones, - quad_data.dqMaps->dofToQuad, - quad_data.dqMaps->dofToQuadD, - quad_data.dqMaps->quadWeights, - v_local, - e_quad, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom->J, - quad_data.geom->invJ, - quad_data.geom->detJ, - quad_data.stressJinvT, - quad_data.dtEst); - else - rUpdateQuadratureData(gamma, - quad_data.h0, - cfl, - use_viscosity, - dim, - NUM_QUAD, - NUM_QUAD_1D, - NUM_DOFS_1D, - nzones, - quad_data.dqMaps->dofToQuad, - quad_data.dqMaps->dofToQuadD, - quad_data.dqMaps->quadWeights, - v_local, - e_quad, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom->J, - quad_data.geom->invJ, - quad_data.geom->detJ, - quad_data.stressJinvT, - quad_data.dtEst); - - quad_data.dt_est = quad_data.dtEst.Min(); - quad_data_is_current = true; - timer.sw_qdata.Stop(); - timer.quad_tstep += nzones; -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/cuda/laghos_solver.hpp b/cuda/laghos_solver.hpp deleted file mode 100644 index 9168c99d..00000000 --- a/cuda/laghos_solver.hpp +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_SOLVER -#define MFEM_LAGHOS_SOLVER - -#include "mfem.hpp" -#include "cuda/cuda.hpp" - -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -/// Visualize the given parallel grid function, using a GLVis server on the -/// specified host and port. Set the visualization window title, and optionally, -/// its geometry. -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x = 0, int y = 0, int w = 400, int h = 400, - bool vec = false); - - -// These are defined in laghos.cpp -double rho0(const Vector &); -void v0(const Vector &, Vector &); -double e0(const Vector &); -double gamma(const Vector &); - -struct TimingData -{ - // Total times for all major computations: - // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. 
- StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; - - // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solves. - // #dofs * #(CG iterations) for the L2 CG solve. - // #quads * #(RK sub steps) for the quadrature data computations. - int H1cg_iter, L2cg_iter, quad_tstep; - - TimingData() - : H1cg_iter(0), L2cg_iter(0), quad_tstep(0) {} -}; - -// Given a solutions state (x, v, e), this class performs all necessary -// computations to evaluate the new slopes (dx_dt, dv_dt, de_dt). -class LagrangianHydroOperator : public CudaTimeDependentOperator -{ -protected: - CudaFiniteElementSpace &H1FESpace; - CudaFiniteElementSpace &L2FESpace; - mutable CudaFiniteElementSpace H1compFESpace; - - Array &ess_tdofs; - - const int dim, nzones, l2dofs_cnt, h1dofs_cnt, source_type; - const double cfl; - const bool use_viscosity, p_assembly; - const double cg_rel_tol; - const int cg_max_iter; - Coefficient *material_pcf; - - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current; - - // Force matrix that combines the kinematic and thermodynamic spaces. It is - // assembled in each time step and then it's used to compute the final - // right-hand sides for momentum and specific internal energy. - mutable CudaMassOperator VMassPA, EMassPA; - mutable DiagonalSolver VMassPA_prec; - mutable CudaForceOperator ForcePA; - - // Linear solver for energy. 
- //CudaCGSolver locCG; - CudaCGSolver CG_VMass,CG_EMass; - - mutable TimingData timer; - - // Device vectors we want to keep - mutable CudaVector v,e,rhs,B,X; - const CudaVector one; - mutable CudaVector e_rhs; - mutable CudaVector rhs_c; - mutable CudaVector v_local,e_quad; - - virtual void ComputeMaterialProperties(int nvalues, const double gamma[], - const double rho[], const double e[], - double p[], double cs[]) const - { - for (int v = 0; v < nvalues; v++) - { - p[v] = (gamma[v] - 1.0) * rho[v] * e[v]; - cs[v] = sqrt(gamma[v] * (gamma[v]-1.0) * e[v]); - } - } - - void UpdateQuadratureData(const CudaVector &S) const; - -public: - LagrangianHydroOperator(int size, CudaFiniteElementSpace &h1_fes, - CudaFiniteElementSpace &l2_fes, - Array &essential_tdofs, CudaGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, bool visc, bool pa, - double cgt, int cgiter); - - // Solve for dx_dt, dv_dt and de_dt. - virtual void Mult(const CudaVector &S, CudaVector &dS_dt) const; - - // Calls UpdateQuadratureData to compute the new quad_data.dt_est. - double GetTimeStepEstimate(const CudaVector &S) const; - void ResetTimeStepEstimate() const; - void ResetQuadratureData() const { quad_data_is_current = false; } - - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. 
- void ComputeDensity(ParGridFunction &rho); - - void PrintTimingData(bool IamRoot, int steps); - - ~LagrangianHydroOperator(); -}; - -class TaylorCoefficient : public Coefficient -{ - virtual double Eval(ElementTransformation &T, - const IntegrationPoint &ip) - { - Vector x(2); - T.Transform(ip, x); - return 3.0 / 8.0 * M_PI * ( cos(3.0*M_PI*x(0)) * cos(M_PI*x(1)) - - cos(M_PI*x(0)) * cos(3.0*M_PI*x(1)) ); - } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS diff --git a/cuda/makefile b/cuda/makefile deleted file mode 100644 index 89afcd0d..00000000 --- a/cuda/makefile +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -# reserved. See files LICENSE and NOTICE for details. -# -# This file is part of CEED, a collection of benchmarks, miniapps, software -# libraries and APIs for efficient high-order finite element and spectral -# element discretizations for exascale applications. For more information and -# source code availability see http://github.com/ceed. -# -# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -# a collaborative effort of two U.S. Department of Energy organizations (Office -# of Science and the National Nuclear Security Administration) responsible for -# the planning and preparation of a capable exascale ecosystem, including -# software, applications, hardware, advanced system engineering and early -# testbed platforms, in support of the nation's exascale computing imperative. 
- -NV_ARCH ?= -arch=sm_61 -CUDA_DIR ?= /usr/local/cuda - -MPI_HOME ?= /usr/local/mpi -MPI_INCFLAGS = -I$(MPI_HOME)/include - -NVCC_CXXFLAGS = -x=cu -std=c++11 -m64 --restrict -Xcompiler -Wall $(NV_ARCH) -NVCC_LIBS = -Wl,-rpath -Wl,$(CUDA_DIR)/lib64 -L$(CUDA_DIR)/lib64 \ - -lcuda -lcudart -lcudadevrt -lnvToolsExt - -define LAGHOS_HELP_MSG - -Laghos makefile targets: - - make - make status/info - make install - make clean - make distclean - make style - -Examples: - -make -j 4 - Build Laghos using the current configuration options from MFEM. - (Laghos requires the MFEM finite element library, and uses its compiler and - linker options in its build process.) -make status - Display information about the current configuration. -make install PREFIX= - Install the Laghos executable in . -make clean - Clean the Laghos executable, library and object files. -make distclean - In addition to "make clean", remove the local installation directory and some - run-time generated files. -make style - Format the Laghos C++ source files using the Artistic Style (astyle) settings - from MFEM. - -endef - -# Default installation location -PREFIX = ./bin -INSTALL = /usr/bin/install - -# Use the MFEM build directory -MFEM_DIR = ../../mfem -CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) - -# Use the compiler used by MFEM. Get the compiler and the options for compiling -# and linking from MFEM's config.mk. (Skip this if the target does not require -# building.) 
-MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) - -include $(CONFIG_MK) -endif - -CXX = nvcc -CPPFLAGS = $(MFEM_CPPFLAGS) -CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - -LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) \ - $(MPI_INCFLAGS) $(NVCC_CXXFLAGS) $(NVCC_INCFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) $(NVCC_LIBS) - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif - -LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) - -SOURCE_FILES = laghos.cpp laghos_assembly.cpp laghos_solver.cpp \ - cuda/linalg/solvers.cpp \ - cuda/linalg/vector.cpp \ - cuda/kernels/share/qDataUpdateS.cpp \ - cuda/kernels/share/gridFuncToQuadS.cpp \ - cuda/kernels/share/forceS.cpp \ - cuda/kernels/share/massMultAddS.cpp \ - cuda/kernels/share/massAssembleS.cpp \ - cuda/kernels/force/force.cpp \ - cuda/kernels/geom/initGeom.cpp \ - cuda/kernels/quad/gridFuncToQuad.cpp \ - cuda/kernels/quad/qDataUpdate.cpp \ - cuda/kernels/quad/qDataInit.cpp \ - cuda/kernels/maps/globalToLocal.cpp \ - cuda/kernels/maps/mapping.cpp \ - cuda/kernels/maps/localToGlobal.cpp \ - cuda/kernels/mass/multAdd.cpp \ - cuda/kernels/mass/assemble.cpp \ - cuda/kernels/blas/vector_map_dofs.cpp \ - cuda/kernels/blas/vector_vec_sub.cpp \ - cuda/kernels/blas/vector_dot.cpp \ - cuda/kernels/blas/vector_clear_dofs.cpp \ - cuda/kernels/blas/vector_xsy.cpp \ - cuda/kernels/blas/vector_xpay.cpp \ - cuda/kernels/blas/vector_axpy.cpp \ - cuda/kernels/blas/vector_op_eq.cpp \ - cuda/kernels/blas/vector_get_subvector.cpp \ - cuda/kernels/blas/vector_vec_add.cpp \ - cuda/kernels/blas/vector_min.cpp 
\ - cuda/kernels/blas/vector_set_subvector.cpp \ - cuda/kernels/blas/vector_set_subvector_const.cpp \ - cuda/kernels/blas/vector_vec_mul.cpp \ - cuda/kernels/blas/vector_neg.cpp \ - cuda/fem/bilinearform.cpp \ - cuda/fem/cuGridfunc.cpp \ - cuda/fem/restrict.cpp \ - cuda/fem/fespace.cpp \ - cuda/fem/bilininteg.cpp \ - cuda/fem/conform.cpp \ - cuda/fem/prolong.cpp \ - cuda/general/memcpy.cpp \ - cuda/general/commd.cpp \ - cuda/general/table.cpp \ - cuda/config/config.cpp - -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = laghos_solver.hpp laghos_assembly.hpp \ - cuda/linalg/solvers.hpp \ - cuda/linalg/operator.hpp \ - cuda/linalg/ode.hpp \ - cuda/linalg/vector.hpp \ - cuda/kernels/cuda.hpp \ - cuda/kernels/include/forall.hpp \ - cuda/kernels/include/kernels.hpp \ - cuda/kernels/include/offsets.hpp \ - cuda/fem/prolong.hpp \ - cuda/fem/cuGridfunc.hpp \ - cuda/fem/conform.hpp \ - cuda/fem/restrict.hpp \ - cuda/fem/bilininteg.hpp \ - cuda/fem/fespace.hpp \ - cuda/fem/bilinearform.hpp \ - cuda/general/commd.hpp \ - cuda/general/table.hpp \ - cuda/general/malloc.hpp \ - cuda/general/memcpy.hpp \ - cuda/general/array.hpp \ - cuda/config/config.hpp \ - cuda/cuda.hpp \ - -# Targets - -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec - -.SUFFIXES: .c .cpp .o -.cpp.o: - cd $( -- `tar xzvf hypre-2.11.2.tar.gz` -- ` cd hypre-2.11.2/src` -- `./configure --disable-fortran --with-MPI --with-MPI-include=$MPI_HOME/include --with-MPI-lib-dirs=$MPI_HOME/lib` -- `make -j` -- `cd ../..` - -### Metis -- -- `tar xzvf metis-5.1.0.tar.gz` -- `cd metis-5.1.0` -- ``make config prefix=`pwd` `` -- `make && make install` -- `cd ..` - -### MFEM -- `git clone git@github.com:mfem/mfem.git` -- `cd mfem` -- `git checkout laghos-v2.0` -- ``make config MFEM_USE_MPI=YES HYPRE_DIR=`pwd`/../hypre-2.11.2/src/hypre MFEM_USE_METIS_5=YES METIS_DIR=`pwd`/../metis-5.1.0`` -- `make status` to verify that all the include 
paths are correct -- `make -j` -- `cd ..` - -### HIP Laghos -- `git clone git@github.com:CEED/Laghos.git` -- `cd Laghos/cuda` -- edit the `makefile`, set HIP\_ARCH to the desired architecture and the absolute paths to HIP\_DIR, MFEM\_DIR, MPI\_HOME -- `make` to build the HIP version - -## Running - -The HIP version can run the same sample test runs as the official benchmark -version of Laghos. - -### Options -- -m : Mesh file to use -- -ok : Order (degree) of the kinematic finite element space -- -rs : Number of times to refine the mesh uniformly in serial -- -p : Problem setup to use, Sedov problem is '1' -- -cfl : CFL-condition number -- -ms : Maximum number of steps (negative means no restriction) -- -aware: Enable or disable MPI HIP Aware - -## Verification of Results - -To make sure the results are correct, we tabulate reference final iterations -(`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: - -1. `mpirun -np 4 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -pa` -2. `mpirun -np 4 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75 -pa` -3. `mpirun -np 4 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -pa -cfl 0.05` -4. `mpirun -np 4 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -pa -cfl 0.08` - -| `run` | `step` | `dt` | `e` | -| ----- | ------ | ---- | --- | -| 1. | 333 | 0.000008 | 49.6955373330 | -| 2. | 1036 | 0.000093 | 3390.9635544029 | -| 3. | 1570 | 0.000768 | 46.2901037375 | -| 4. | 486 | 0.000864 | 135.1267396160 | - -An implementation is considered valid if the final energy values are all within -round-off distance from the above reference values. - -## Contact - -You can reach the Laghos team by emailing laghos@llnl.gov or by leaving a -comment in the [issue tracker](https://github.com/CEED/Laghos/issues). 
- -## Copyright - -The following copyright applies to each file in the CEED software suite, -unless otherwise stated in the file: - -> Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the -> Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights reserved. - -See files LICENSE and NOTICE in the top-level directory for details. diff --git a/hip/hip/config/config.cpp b/hip/hip/config/config.cpp deleted file mode 100644 index 4e193e9f..00000000 --- a/hip/hip/config/config.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" -#include -#include - -namespace mfem -{ - - -// *************************************************************************** -void computeCapabilityOfTheDevice(const int mpi_rank, - const hipDevice_t hipDevice, - const int device) -{ - char name[128]; - int major, minor; - hipDeviceGetName(name, 128, hipDevice); - hipDeviceComputeCapability(&major, &minor, device); - printf("\033[32m[laghos] Rank_%d => Device_%d (%s:sm_%d.%d)\033[m\n", - mpi_rank, device, name, major, minor); -} - -// *************************************************************************** -static bool isTux(void) -{ - char hostname[1024]; - hostname[1023] = '\0'; - gethostname(hostname, 1023); - if (strncmp("tux", hostname, 3)==0) { return true; } - return false; -} - -// *************************************************************************** -__attribute__((unused)) -static void printDevProp(hipDeviceProp_t devProp) -{ - printf("Major revision number: %d\n", devProp.major); - printf("Minor revision number: %d\n", devProp.minor); - printf("Name: %s\n", devProp.name); - printf("Total global memory: %lu\n", devProp.totalGlobalMem); - printf("Total shared memory per block: %lu\n", devProp.sharedMemPerBlock); - printf("Total registers per block: %d\n", devProp.regsPerBlock); - printf("Warp size: %d\n", devProp.warpSize); - printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock); - for (int i = 0; i < 3; ++i) - { - printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]); - } - for (int i = 0; i < 3; ++i) - { - printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]); - } - printf("Clock rate: %d\n", devProp.clockRate); - printf("Total constant memory: %lu\n", devProp.totalConstMem); - printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount); -} - -// *************************************************************************** -// * Setup -// 
*************************************************************************** -void rconfig::Setup(const int _mpi_rank, - const int _mpi_size, - const bool _hip, - const bool _aware, - const bool _share, - const bool _hcpo, - const bool _sync, - const int rs_levels) -{ - mpi_rank=_mpi_rank; - mpi_size=_mpi_size; - - // Look if we are on a Tux machine - const bool tux = isTux(); - if (tux && Root()) - { - printf("\033[32m[laghos] \033[1mTux\033[m\n"); - } - - // On Tux machines, look for MPS - // mps = tux?isNvidiaCudaMpsDaemonRunning():false; - mps = false; - if (tux && Mps() && Root()) - { - printf("\033[32m[laghos] \033[32;1mMPS daemon\033[m\033[m\n"); - } - if (tux && !Mps() && Root()) - { - printf("\033[32m[laghos] \033[31;1mNo MPS daemon\033[m\n"); - } - - // Get the number of devices with compute capability greater or equal to 2.0 - // Can be changed wuth HIP_VISIBLE_DEVICES - hipGetDeviceCount(&gpu_count); - hip=_hip; - aware=_aware; - share=_share; - hcpo=_hcpo; - sync=_sync; - - // LAGHOS_DEBUG warning output -#if defined(LAGHOS_DEBUG) - if (Root()) - { - printf("\033[32m[laghos] \033[31;1mLAGHOS_DEBUG\033[m\n"); - } -#endif - - // Check for Enforced Kernel Synchronization - if (Sync() && Root()) - { - printf("\033[32m[laghos] \033[31;1mEnforced Kernel Synchronization!\033[m\n"); - } - - // Check if MPI is HIP aware - if (Root()) - printf("\033[32m[laghos] MPI %s HIP aware\033[m\n", - aware?"\033[1mIS":"is \033[31;1mNOT\033[32m"); - - if (Root()) - { - printf("\033[32m[laghos] HIP device count: %i\033[m\n", gpu_count); - } - - // Initializes the driver API - // Must be called before any other function from the driver API - // Currently, the Flags parameter must be 0. 
- const unsigned int Flags = 0; // parameter must be 0 - hipInit(Flags); - - // Returns properties for the selected device - const int device = Mps()?0:(mpi_rank%gpu_count); - // Check if we have enough devices for all ranks - assert(deviceGetVSize(),fes->GetVSize()), - mesh(fes->GetMesh()), - trialFes(fes), - testFes(fes), - localX(mesh->GetNE() * trialFes->GetLocalDofs() * trialFes->GetVDim()), - localY(mesh->GetNE() * testFes->GetLocalDofs() * testFes->GetVDim()) {} - -// *************************************************************************** -HipBilinearForm::~HipBilinearForm() { } - -// *************************************************************************** -// Adds new Domain Integrator. -void HipBilinearForm::AddDomainIntegrator(HipIntegrator* i) -{ - AddIntegrator(i, DomainIntegrator); -} - -// Adds new Boundary Integrator. -void HipBilinearForm::AddBoundaryIntegrator(HipIntegrator* i) -{ - AddIntegrator(i, BoundaryIntegrator); -} - -// Adds new interior Face Integrator. -void HipBilinearForm::AddInteriorFaceIntegrator(HipIntegrator* i) -{ - AddIntegrator(i, InteriorFaceIntegrator); -} - -// Adds new boundary Face Integrator. 
-void HipBilinearForm::AddBoundaryFaceIntegrator(HipIntegrator* i) -{ - AddIntegrator(i, BoundaryFaceIntegrator); -} - -// Adds Integrator based on HipIntegratorType -void HipBilinearForm::AddIntegrator(HipIntegrator* i, - const HipIntegratorType itype) -{ - assert(i); - i->SetupIntegrator(*this, itype); - integrators.push_back(i); -} - -// *************************************************************************** -void HipBilinearForm::Assemble() -{ - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->Assemble(); - } -} - -// *************************************************************************** -void HipBilinearForm::FormLinearSystem(const Array& constraintList, - HipVector& x, HipVector& b, - HipOperator*& Aout, - HipVector& X, HipVector& B, - int copy_interior) -{ - FormOperator(constraintList, Aout); - InitRHS(constraintList, x, b, Aout, X, B, copy_interior); -} - -// *************************************************************************** -void HipBilinearForm::FormOperator(const Array& constraintList, - HipOperator*& Aout) -{ - const HipOperator* trialP = trialFes->GetProlongationOperator(); - const HipOperator* testP = testFes->GetProlongationOperator(); - HipOperator *rap = this; - if (trialP) { rap = new HipRAPOperator(*testP, *this, *trialP); } - Aout = new HipConstrainedOperator(rap, constraintList, rap!=this); -} - -// *************************************************************************** -void HipBilinearForm::InitRHS(const Array& constraintList, - const HipVector& x, const HipVector& b, - HipOperator* A, - HipVector& X, HipVector& B, - int copy_interior) -{ - const HipOperator* P = trialFes->GetProlongationOperator(); - const HipOperator* R = trialFes->GetRestrictionOperator(); - if (P) - { - // Variational restriction with P - B.SetSize(P->Width()); - P->MultTranspose(b, B); - X.SetSize(R->Height()); - R->Mult(x, X); - } - else - { - // rap, X and B point to the same 
data as this, x and b - X.SetSize(x.Size(),x); - B.SetSize(b.Size(),b); - } - HipConstrainedOperator* cA = static_cast(A); - if (cA) - { - cA->EliminateRHS(X, B); - } - else - { - mfem_error("HipBilinearForm::InitRHS expects an HipConstrainedOperator"); - } -} - -// *************************************************************************** -void HipBilinearForm::Mult(const HipVector& x, HipVector& y) const -{ - trialFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultAdd(localX, localY); - } - testFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void HipBilinearForm::MultTranspose(const HipVector& x, HipVector& y) const -{ - testFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultTransposeAdd(localX, localY); - } - trialFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void HipBilinearForm::RecoverFEMSolution(const HipVector& X, - const HipVector& b, - HipVector& x) -{ - const HipOperator *P = this->GetProlongation(); - if (P) - { - // Apply conforming prolongation - x.SetSize(P->Height()); - P->Mult(X, x); - } - // Otherwise X and x point to the same data -} - - -// *************************************************************************** -// * HipConstrainedOperator -// *************************************************************************** -HipConstrainedOperator::HipConstrainedOperator(HipOperator* A_, - const Array& constraintList_, - bool own_A_) : - HipOperator(A_->Height(), A_->Width()) -{ - Setup(A_, constraintList_, own_A_); -} - -void HipConstrainedOperator::Setup(HipOperator* A_, - const Array& constraintList_, - bool own_A_) -{ - A = A_; - own_A = own_A_; - constraintIndices = 
constraintList_.Size(); - if (constraintIndices) - { - constraintList.allocate(constraintIndices); - } - z.SetSize(height); - w.SetSize(height); -} - -void HipConstrainedOperator::EliminateRHS(const HipVector& x, - HipVector& b) const -{ - w = 0.0; - A->Mult(w, z); - b -= z; -} - -void HipConstrainedOperator::Mult(const HipVector& x, HipVector& y) const -{ - if (constraintIndices == 0) - { - A->Mult(x, y); - return; - } - z = x; - A->Mult(z, y); -} - -} // mfem diff --git a/hip/hip/fem/bilinearform.hpp b/hip/hip/fem/bilinearform.hpp deleted file mode 100644 index 3308c670..00000000 --- a/hip/hip/fem/bilinearform.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_BILINEARFORM -#define LAGHOS_HIP_BILINEARFORM - -namespace mfem -{ - -// *************************************************************************** -// * HipIntegratorType -// *************************************************************************** -enum HipIntegratorType -{ - DomainIntegrator = 0, - BoundaryIntegrator = 1, - InteriorFaceIntegrator = 2, - BoundaryFaceIntegrator = 3, -}; - -class HipIntegrator; - -// *************************************************************************** -// * HipBilinearForm -// *************************************************************************** -class HipBilinearForm : public HipOperator -{ - friend class HipIntegrator; -protected: - typedef std::vector IntegratorVector; - mutable Mesh* mesh; - mutable HipFiniteElementSpace* trialFes; - mutable HipFiniteElementSpace* testFes; - IntegratorVector integrators; - mutable HipVector localX, localY; -public: - HipBilinearForm(HipFiniteElementSpace*); - ~HipBilinearForm(); - Mesh& GetMesh() const { return *mesh; } - HipFiniteElementSpace& GetTrialFESpace() const { return *trialFes;} - HipFiniteElementSpace& GetTestFESpace() const { return *testFes;} - // ************************************************************************* - void AddDomainIntegrator(HipIntegrator*); - void AddBoundaryIntegrator(HipIntegrator*); - void AddInteriorFaceIntegrator(HipIntegrator*); - void AddBoundaryFaceIntegrator(HipIntegrator*); - void AddIntegrator(HipIntegrator*, const HipIntegratorType); - // ************************************************************************* - virtual void Assemble(); - void FormLinearSystem(const Array& constraintList, - HipVector& x, HipVector& b, - HipOperator*& Aout, - HipVector& X, HipVector& B, - int copy_interior = 0); - void FormOperator(const Array& constraintList, HipOperator*& Aout); - void InitRHS(const Array& constraintList, - const HipVector& x, const HipVector& b, - HipOperator* Aout, - HipVector& X, HipVector& B, - int 
copy_interior = 0); - virtual void Mult(const HipVector& x, HipVector& y) const; - virtual void MultTranspose(const HipVector& x, HipVector& y) const; - void RecoverFEMSolution(const HipVector&, const HipVector&, HipVector&); -}; - - -// *************************************************************************** -// * Constrained Operator -// *************************************************************************** -class HipConstrainedOperator : public HipOperator -{ -protected: - HipOperator *A; - bool own_A; - HipArray constraintList; - int constraintIndices; - mutable HipVector z, w; -public: - HipConstrainedOperator(HipOperator*, const Array&, bool = false); - void Setup(HipOperator*, const Array&, bool = false); - void EliminateRHS(const HipVector&, HipVector&) const; - virtual void Mult(const HipVector&, HipVector&) const; - virtual ~HipConstrainedOperator() {} -}; - -} // mfem - -#endif // LAGHOS_HIP_BILINEARFORM diff --git a/hip/hip/fem/bilininteg.cpp b/hip/hip/fem/bilininteg.cpp deleted file mode 100644 index 17db4568..00000000 --- a/hip/hip/fem/bilininteg.cpp +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../hip.hpp" - -namespace mfem -{ - -// ***************************************************************************** -static HipGeometry *geom=NULL; - -// *************************************************************************** -// * ~ HipGeometry -// *************************************************************************** -HipGeometry::~HipGeometry() -{ - free(geom->meshNodes); - free(geom->J); - free(geom->invJ); - free(geom->detJ); - delete[] geom; -} - -// ***************************************************************************** -// * HipGeometry Get: use this one to fetch nodes from vector Sx -// ***************************************************************************** -HipGeometry* HipGeometry::Get(HipFiniteElementSpace& fes, - const IntegrationRule& ir, - const HipVector& Sx) -{ - const Mesh *mesh = fes.GetMesh(); - const mfem::GridFunction *nodes = mesh->GetNodes(); - const FiniteElementSpace *fespace = nodes->FESpace(); - const FiniteElement *fe = fespace->GetFE(0); - const int dims = fe->GetDim(); - const int numDofs = fe->GetDof(); - const int numQuad = ir.GetNPoints(); - const int elements = fespace->GetNE(); - const int ndofs = fespace->GetNDofs(); - const HipDofQuadMaps* maps = HipDofQuadMaps::GetSimplexMaps(*fe, ir); - rNodeCopyByVDim(elements,numDofs,ndofs,dims,geom->eMap,Sx,geom->meshNodes); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - - -// ***************************************************************************** -HipGeometry* HipGeometry::Get(HipFiniteElementSpace& fes, - const IntegrationRule& ir) -{ - Mesh& mesh = *(fes.GetMesh()); - const bool geom_to_allocate = - (!geom) || rconfig::Get().GeomNeedsUpdate(mesh.GetSequence()); - if (geom_to_allocate) { geom=new HipGeometry(); } - if (!mesh.GetNodes()) { mesh.SetCurvature(1, false, -1, Ordering::byVDIM); } - GridFunction& nodes = *(mesh.GetNodes()); - const 
FiniteElementSpace& fespace = *(nodes.FESpace()); - const FiniteElement& fe = *(fespace.GetFE(0)); - const int dims = fe.GetDim(); - const int elements = fespace.GetNE(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - const bool orderedByNODES = (fespace.GetOrdering() == Ordering::byNODES); - - if (orderedByNODES) { ReorderByVDim(nodes); } - const int asize = dims*numDofs*elements; - Array meshNodes(asize); - const Table& e2dTable = fespace.GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - Array eMap(numDofs*elements); - { - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < numDofs; ++d) - { - const int lid = d+numDofs*e; - const int gid = elementMap[lid]; - eMap[lid]=gid; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int xoffset = v+dims*gid; - meshNodes[moffset] = nodes[xoffset]; - } - } - } - } - if (geom_to_allocate) - { - geom->meshNodes.allocate(dims, numDofs, elements); - geom->eMap.allocate(numDofs, elements); - } - { - geom->meshNodes = meshNodes; - geom->eMap = eMap; - } - if (orderedByNODES) { ReorderByNodes(nodes); } - if (geom_to_allocate) - { - geom->J.allocate(dims, dims, numQuad, elements); - geom->invJ.allocate(dims, dims, numQuad, elements); - geom->detJ.allocate(numQuad, elements); - } - - const HipDofQuadMaps* maps = HipDofQuadMaps::GetSimplexMaps(fe, ir); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - -// *************************************************************************** -void HipGeometry::ReorderByVDim(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k=0; - for (int d = 0; d < ndofs; d++) - for (int v = 0; v < vdim; v++) - { - temp[k++] = 
data[d+v*ndofs]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// *************************************************************************** -void HipGeometry::ReorderByNodes(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k = 0; - for (int j = 0; j < ndofs; j++) - for (int i = 0; i < vdim; i++) - { - temp[j+i*ndofs] = data[k++]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// ***************************************************************************** -// * HipDofQuadMaps -// ***************************************************************************** -static std::map AllDofQuadMaps; - -// *************************************************************************** -HipDofQuadMaps::~HipDofQuadMaps() {} - -// ***************************************************************************** -void HipDofQuadMaps::delHipDofQuadMaps() -{ - for (std::map::iterator itr = AllDofQuadMaps.begin(); - itr != AllDofQuadMaps.end(); - itr++) - { - delete itr->second; - } -} - -// ***************************************************************************** -HipDofQuadMaps* HipDofQuadMaps::Get(const HipFiniteElementSpace& fespace, - const IntegrationRule& ir, - const bool transpose) -{ - return Get(*fespace.GetFE(0),*fespace.GetFE(0),ir,transpose); -} - -HipDofQuadMaps* HipDofQuadMaps::Get(const HipFiniteElementSpace& - trialFESpace, - const HipFiniteElementSpace& testFESpace, - const IntegrationRule& ir, - const bool transpose) -{ - return Get(*trialFESpace.GetFE(0),*testFESpace.GetFE(0),ir,transpose); -} - -HipDofQuadMaps* HipDofQuadMaps::Get(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - return GetTensorMaps(trialFE, 
testFE, ir, transpose); -} - -// ***************************************************************************** -HipDofQuadMaps* HipDofQuadMaps::GetTensorMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& trialTFE = - dynamic_cast(trialFE); - const TensorBasisElement& testTFE = - dynamic_cast(testFE); - std::stringstream ss; - ss << "TensorMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " BT1:" << trialTFE.GetBasisType() - << " BT2:" << testTFE.GetBasisType() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - // Otherwise, build them - HipDofQuadMaps *maps = new HipDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const HipDofQuadMaps* trialMaps = GetD2QTensorMaps(trialFE, ir); - const HipDofQuadMaps* testMaps = GetD2QTensorMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// ***************************************************************************** -HipDofQuadMaps* HipDofQuadMaps::GetD2QTensorMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& tfe = dynamic_cast(fe); - const Poly_1D::Basis& basis = tfe.GetBasis1D(); - const int order = fe.GetOrder(); - const int dofs = order + 1; - const int dims = fe.GetDim(); - const IntegrationRule& ir1D = IntRules.Get(Geometry::SEGMENT, ir.GetOrder()); - const int quadPoints = ir1D.GetNPoints(); - const int quadPoints2D = quadPoints*quadPoints; - const int quadPoints3D = quadPoints2D*quadPoints; - const int quadPointsND = ((dims == 1) ? 
quadPoints : - ((dims == 2) ? quadPoints2D : quadPoints3D)); - std::stringstream ss ; - ss << "D2QTensorMap:" - << " order:" << order - << " dofs:" << dofs - << " dims:" << dims - << " quadPoints:"<hash = hash; - - maps->dofToQuad.allocate(quadPoints, dofs,1,1,transpose); - maps->dofToQuadD.allocate(quadPoints, dofs,1,1,transpose); - double* quadWeights1DData = NULL; - if (transpose) - { - // Initialize quad weights only for transpose - maps->quadWeights.allocate(quadPointsND); - quadWeights1DData = ::new double[quadPoints]; - } - mfem::Vector d2q(dofs); - mfem::Vector d2qD(dofs); - Array dofToQuad(quadPoints*dofs); - Array dofToQuadD(quadPoints*dofs); - for (int q = 0; q < quadPoints; ++q) - { - const IntegrationPoint& ip = ir1D.IntPoint(q); - basis.Eval(ip.x, d2q, d2qD); - if (transpose) - { - quadWeights1DData[q] = ip.weight; - } - for (int d = 0; d < dofs; ++d) - { - dofToQuad[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2q[d]; - dofToQuadD[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2qD[d]; - } - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - if (transpose) - { - Array quadWeights(quadPointsND); - for (int q = 0; q < quadPointsND; ++q) - { - const int qx = q % quadPoints; - const int qz = q / quadPoints2D; - const int qy = (q - qz*quadPoints2D) / quadPoints; - double w = quadWeights1DData[qx]; - if (dims > 1) - { - w *= quadWeights1DData[qy]; - } - if (dims > 2) - { - w *= quadWeights1DData[qz]; - } - quadWeights[q] = w; - } - maps->quadWeights = quadWeights; - ::delete [] quadWeights1DData; - } - assert(maps); - return maps; -} - -// ***************************************************************************** -HipDofQuadMaps* HipDofQuadMaps::GetSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - return GetSimplexMaps(fe, fe, ir, transpose); -} - -// ***************************************************************************** -HipDofQuadMaps* 
HipDofQuadMaps::GetSimplexMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - std::stringstream ss; - ss << "SimplexMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - HipDofQuadMaps *maps = new HipDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const HipDofQuadMaps* trialMaps = GetD2QSimplexMaps(trialFE, ir); - const HipDofQuadMaps* testMaps = GetD2QSimplexMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// ***************************************************************************** -HipDofQuadMaps* HipDofQuadMaps::GetD2QSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const int dims = fe.GetDim(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - std::stringstream ss ; - ss << "D2QSimplexMap:" - << " Dim:" << dims - << " numDofs:" << numDofs - << " numQuad:" << numQuad - << " transpose:" << (transpose?"T":"F"); - std::string hash = ss.str(); - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - HipDofQuadMaps* maps = new HipDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - // Initialize the dof -> quad mapping - maps->dofToQuad.allocate(numQuad, numDofs,1,1,transpose); - maps->dofToQuadD.allocate(dims, numQuad, numDofs,1,transpose); - if (transpose) // Initialize quad weights only for transpose - { - maps->quadWeights.allocate(numQuad); - } - Vector d2q(numDofs); - DenseMatrix d2qD(numDofs, dims); - Array 
quadWeights(numQuad); - Array dofToQuad(numQuad*numDofs); - Array dofToQuadD(dims*numQuad*numDofs); - for (int q = 0; q < numQuad; ++q) - { - const IntegrationPoint& ip = ir.IntPoint(q); - if (transpose) - { - quadWeights[q] = ip.weight; - } - fe.CalcShape(ip, d2q); - fe.CalcDShape(ip, d2qD); - for (int d = 0; d < numDofs; ++d) - { - const double w = d2q[d]; - dofToQuad[maps->dofToQuad.dim()[0]*q + - maps->dofToQuad.dim()[1]*d] = w; - for (int dim = 0; dim < dims; ++dim) - { - const double wD = d2qD(d, dim); - dofToQuadD[maps->dofToQuadD.dim()[0]*dim + - maps->dofToQuadD.dim()[1]*q + - maps->dofToQuadD.dim()[2]*d] = wD; - } - } - } - if (transpose) - { - maps->quadWeights = quadWeights; - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - return maps; -} - - -// ***************************************************************************** -// * Base Integrator -// ***************************************************************************** -void HipIntegrator::SetIntegrationRule(const IntegrationRule& ir_) -{ - ir = &ir_; -} - -const IntegrationRule& HipIntegrator::GetIntegrationRule() const -{ - assert(ir); - return *ir; -} - -void HipIntegrator::SetupIntegrator(HipBilinearForm& bform_, - const HipIntegratorType itype_) -{ - mesh = &(bform_.GetMesh()); - trialFESpace = &(bform_.GetTrialFESpace()); - testFESpace = &(bform_.GetTestFESpace()); - itype = itype_; - if (ir == NULL) { assert(false); } - maps = HipDofQuadMaps::Get(*trialFESpace,*testFESpace,*ir); - mapsTranspose = HipDofQuadMaps::Get(*testFESpace,*trialFESpace,*ir); - Setup(); -} - -HipGeometry* HipIntegrator::GetGeometry() -{ - return HipGeometry::Get(*trialFESpace, *ir); -} - - -// ***************************************************************************** -// * Mass Integrator -// ***************************************************************************** -void HipMassIntegrator::SetupIntegrationRule() -{ - assert(false); -} - -// 
***************************************************************************** -void HipMassIntegrator::Assemble() -{ - if (op.Size()) { return; } - assert(false); -} - -// ***************************************************************************** -void HipMassIntegrator::SetOperator(HipVector& v) { op = v; } - -// *************************************************************************** -void HipMassIntegrator::MultAdd(HipVector& x, HipVector& y) -{ - const int dim = mesh->Dimension(); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir->GetOrder()).GetNPoints(); - const int dofs1D = trialFESpace->GetFE(0)->GetOrder() + 1; - if (rconfig::Get().Share()) - rMassMultAddS(dim, - dofs1D, - quad1D, - mesh->GetNE(), - maps->dofToQuad, - maps->dofToQuadD, - maps->quadToDof, - maps->quadToDofD, - op,x,y); - else - rMassMultAdd(dim, - dofs1D, - quad1D, - mesh->GetNE(), - maps->dofToQuad, - maps->dofToQuadD, - maps->quadToDof, - maps->quadToDofD, - op,x,y); -} - -} // namespace mfem - diff --git a/hip/hip/fem/bilininteg.hpp b/hip/hip/fem/bilininteg.hpp deleted file mode 100644 index 68632628..00000000 --- a/hip/hip/fem/bilininteg.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_BILININTEG -#define LAGHOS_HIP_BILININTEG - -namespace mfem -{ - -// *************************************************************************** -// * HipGeometry -// *************************************************************************** -class HipGeometry -{ -public: - ~HipGeometry(); - HipArray eMap; - HipArray meshNodes; - HipArray J, invJ, detJ; - static HipGeometry* Get(HipFiniteElementSpace&, - const IntegrationRule&); - static HipGeometry* Get(HipFiniteElementSpace&, - const IntegrationRule&, - const HipVector&); - static void ReorderByVDim(GridFunction& nodes); - static void ReorderByNodes(GridFunction& nodes); -}; - -// *************************************************************************** -// * HipDofQuadMaps -// *************************************************************************** -class HipDofQuadMaps -{ -private: - std::string hash; -public: - HipArray dofToQuad, dofToQuadD; // B - HipArray quadToDof, quadToDofD; // B^T - HipArray quadWeights; -public: - ~HipDofQuadMaps(); - static void delHipDofQuadMaps(); - static HipDofQuadMaps* Get(const HipFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* Get(const HipFiniteElementSpace&, - const HipFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* Get(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* GetTensorMaps(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* GetD2QTensorMaps(const FiniteElement&, 
- const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static HipDofQuadMaps* GetD2QSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); -}; - -// *************************************************************************** -// * Base Integrator -// *************************************************************************** -class HipIntegrator -{ -protected: - Mesh* mesh = NULL; - HipFiniteElementSpace* trialFESpace = NULL; - HipFiniteElementSpace* testFESpace = NULL; - HipIntegratorType itype; - const IntegrationRule* ir = NULL; - HipDofQuadMaps* maps; - HipDofQuadMaps* mapsTranspose; -private: -public: - virtual std::string GetName() = 0; - void SetIntegrationRule(const IntegrationRule& ir_); - const IntegrationRule& GetIntegrationRule() const; - virtual void SetupIntegrationRule() = 0; - virtual void SetupIntegrator(HipBilinearForm& bform_, - const HipIntegratorType itype_); - virtual void Setup() = 0; - virtual void Assemble() = 0; - virtual void MultAdd(HipVector& x, HipVector& y) = 0; - virtual void MultTransposeAdd(HipVector&, HipVector&) {assert(false);} - HipGeometry* GetGeometry(); -}; - -// *************************************************************************** -// * Mass Integrator -// *************************************************************************** -class HipMassIntegrator : public HipIntegrator -{ -private: - HipVector op; -public: - HipMassIntegrator() {} - virtual ~HipMassIntegrator() {} - virtual std::string GetName() {return "MassIntegrator";} - virtual void SetupIntegrationRule(); - virtual void Setup() {} - virtual void Assemble(); - void SetOperator(HipVector& v); - virtual void MultAdd(HipVector& x, HipVector& y); -}; - -} // mfem - -#endif // 
LAGHOS_HIP_BILININTEG diff --git a/hip/hip/fem/conform.cpp b/hip/hip/fem/conform.cpp deleted file mode 100644 index 4f20a720..00000000 --- a/hip/hip/fem/conform.cpp +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * HipConformingProlongationOperator -// *************************************************************************** -HipConformingProlongationOperator::HipConformingProlongationOperator -(ParFiniteElementSpace &pfes): HipOperator(pfes.GetVSize(), - pfes.GetTrueVSize()), - external_ldofs(), - d_external_ldofs(Height()-Width()), // size can be 0 here - gc(new HipCommD(pfes)), - kMaxTh(0) -{ - Array ldofs; - Table &group_ldof = gc->GroupLDofTable(); - external_ldofs.Reserve(Height()-Width()); - for (int gr = 1; gr < group_ldof.Size(); gr++) - { - if (!gc->GetGroupTopology().IAmMaster(gr)) - { - ldofs.MakeRef(group_ldof.GetRow(gr), group_ldof.RowSize(gr)); - external_ldofs.Append(ldofs); - } - } - external_ldofs.Sort(); - const int HmW=Height()-Width(); - if (HmW>0) - { - d_external_ldofs=external_ldofs; - } - assert(external_ldofs.Size() == Height()-Width()); - const int m = external_ldofs.Size(); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - const int size = end-j; - if (size>kMaxTh) { kMaxTh=size; } - j = end+1; - } -} - -// 
*************************************************************************** -// * ~HipConformingProlongationOperator -// *************************************************************************** -HipConformingProlongationOperator::~HipConformingProlongationOperator() -{ - delete gc; -} - -// *************************************************************************** -// * HIP Error Status Check -// *************************************************************************** -void hipLastCheck() -{ - hipError_t hipStatus = hipGetLastError(); - if (hipStatus != hipSuccess) - exit(fprintf(stderr, "\n\t\033[31;1m[hipLastCheck] failed: %s\033[m\n", - hipGetErrorString(hipStatus))); -} - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_Mult(double *y, const double *x, - const int *external_ldofs, const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j+k]=x[j-i+k]; - } -} -static __global__ -void k_Mult2(double *y, const double *x, const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j+k]=x[j-i+k]; -} - -// *************************************************************************** -// * Device Mult -// *************************************************************************** -void HipConformingProlongationOperator::d_Mult(const HipVector &x, - HipVector &y) const -{ - const double *d_xdata = x.GetData(); - const int in_layout = 2; // 2 - input is ltdofs array - gc->d_BcastBegin(const_cast(d_xdata), in_layout); - double *d_ydata = y.GetData(); - int j = 0; - const int m = 
external_ldofs.Size(); - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; - hipLaunchKernelGGL((k_Mult), dim3((m+kTpB-1)/kTpB), dim3(kTpB), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m); - hipLastCheck(); - } - else - { - assert((m/maxXThDim)==0); - assert(kMaxThd_BcastEnd(d_ydata, out_layout); -} - - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_MultTranspose(double *y, const double *x, - const int *external_ldofs, const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j-i+k]=x[j+k]; - } -} - -static __global__ -void k_MultTranspose2(double *y, const double *x, - const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j-i+k]=x[j+k]; -} - -// *************************************************************************** -// * Device MultTranspose -// *************************************************************************** -void HipConformingProlongationOperator::d_MultTranspose(const HipVector &x, - HipVector &y) const -{ - const double *d_xdata = x.GetData(); - gc->d_ReduceBegin(d_xdata); - double *d_ydata = y.GetData(); - int j = 0; - const int m = external_ldofs.Size(); - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; - hipLaunchKernelGGL((k_MultTranspose), dim3((m+kTpB-1)/kTpB),dim3(kTpB), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m); - hipLastCheck(); - } - else - { - // const int TpB = rconfig::Get().MaxXThreadsDim(); - 
assert(kMaxThd_ReduceEnd(d_ydata, out_layout, GroupCommunicator::Sum); -} - -// *************************************************************************** -// * Host Mult -// *************************************************************************** -void HipConformingProlongationOperator::h_Mult(const Vector &x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - const int in_layout = 2; // 2 - input is ltdofs array - gc->BcastBegin(const_cast(xdata), in_layout); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - std::copy(xdata+j-i, xdata+end-i, ydata+j); - j = end+1; - } - std::copy(xdata+j-m, xdata+Width(), ydata+j); - const int out_layout = 0; // 0 - output is ldofs array - gc->BcastEnd(ydata, out_layout); -} - -// *************************************************************************** -// * Host MultTranspose -// *************************************************************************** -void HipConformingProlongationOperator::h_MultTranspose(const Vector &x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - gc->ReduceBegin(xdata); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - std::copy(xdata+j, xdata+end, ydata+j-i); - j = end+1; - } - std::copy(xdata+j, xdata+Height(), ydata+j-m); - const int out_layout = 2; // 2 - output is an array on all ltdofs - gc->ReduceEnd(ydata, out_layout, GroupCommunicator::Sum); -} - -} // namespace mfem diff --git a/hip/hip/fem/conform.hpp b/hip/hip/fem/conform.hpp deleted file mode 100644 index 04c1effc..00000000 --- a/hip/hip/fem/conform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_CONFORM_PROLONGATION_OP -#define LAGHOS_HIP_CONFORM_PROLONGATION_OP - -namespace mfem -{ - -// *************************************************************************** -// * HipConformingProlongationOperator -// ************************************************************************** -class HipConformingProlongationOperator : public HipOperator -{ -protected: - Array external_ldofs; - HipArray d_external_ldofs; - HipCommD *gc; - int kMaxTh; -public: - HipConformingProlongationOperator(ParFiniteElementSpace &); - ~HipConformingProlongationOperator(); - void d_Mult(const HipVector &x, HipVector &y) const; - void d_MultTranspose(const HipVector &x, HipVector &y) const; - void h_Mult(const Vector &x, Vector &y) const; - void h_MultTranspose(const Vector &x, Vector &y) const; -}; - -} // mfem - -#endif // LAGHOS_HIP_CONFORM_PROLONGATION_OP diff --git a/hip/hip/fem/fespace.cpp b/hip/hip/fem/fespace.cpp deleted file mode 100644 index afdb447e..00000000 --- a/hip/hip/fem/fespace.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. 
See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -///////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2018,2019 Advanced Micro Devices, Inc. -///////////////////////////////////////////////////////////////////////////////// -#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * HipFiniteElementSpace -// *************************************************************************** -HipFiniteElementSpace::HipFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_, - Ordering::Type ordering_) - :ParFiniteElementSpace(static_cast(mesh),fec,vdim_,ordering_), - globalDofs(GetNDofs()), - localDofs(GetFE(0)->GetDof()), - offsets(globalDofs+1), - indices(localDofs, GetNE()), - map(localDofs, GetNE()) -{ - const FiniteElement *fe = GetFE(0); - const TensorBasisElement* el = dynamic_cast(fe); - const Array &dof_map = el->GetDofMap(); - const bool dof_map_is_identity = (dof_map.Size()==0); - - const Table& e2dTable = GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - const int elements = GetNE(); - Array h_offsets(globalDofs+1); - // We'll be keeping a count of how many local nodes point to its global dof - for (int i = 0; i <= globalDofs; ++i) - { - h_offsets[i] = 0; - } - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int gid = elementMap[localDofs*e + d]; - ++h_offsets[gid + 1]; - } - } - // Aggregate to find offsets for each global dof - for (int i = 1; i <= globalDofs; ++i) - { - h_offsets[i] += h_offsets[i - 1]; - } - - Array 
h_indices(localDofs*elements); - Array h_map(localDofs*elements); - // For each global dof, fill in all local nodes that point to it - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int did = dof_map_is_identity?d:dof_map[d]; - const int gid = elementMap[localDofs*e + did]; - const int lid = localDofs*e + d; - h_indices[h_offsets[gid]++] = lid; - h_map[lid] = gid; - } - } - - // We shifted the offsets vector by 1 by using it as a counter - // Now we shift it back. - for (int i = globalDofs; i > 0; --i) - { - h_offsets[i] = h_offsets[i - 1]; - } - h_offsets[0] = 0; - - offsets = h_offsets; - indices = h_indices; - map = h_map; - - const SparseMatrix* R = GetRestrictionMatrix(); assert(R); - const HipConformingProlongationOperator *P = - new HipConformingProlongationOperator(*this); - - const int mHeight = R->Height(); - const int* I = R->GetI(); - const int* J = R->GetJ(); - int trueCount = 0; - for (int i = 0; i < mHeight; ++i) - { - trueCount += ((I[i + 1] - I[i]) == 1); - } - - Array h_reorderIndices(2*trueCount); - for (int i = 0, trueIdx=0; i < mHeight; ++i) - { - if ((I[i + 1] - I[i]) == 1) - { - h_reorderIndices[trueIdx++] = J[I[i]]; - h_reorderIndices[trueIdx++] = i; - } - } - - reorderIndices = ::new HipArray(2*trueCount); - *reorderIndices = h_reorderIndices; - - restrictionOp = new HipRestrictionOperator(R->Height(), - R->Width(), - reorderIndices); - prolongationOp = new HipProlongationOperator(P); -} - -// *************************************************************************** -HipFiniteElementSpace::~HipFiniteElementSpace() -{ - ::delete reorderIndices; -} - -// *************************************************************************** -bool HipFiniteElementSpace::hasTensorBasis() const -{ - assert(dynamic_cast(GetFE(0))); - return true; -} - -// *************************************************************************** -void HipFiniteElementSpace::GlobalToLocal(const HipVector& globalVec, - HipVector& 
localVec) const -{ - const int vdim = GetVDim(); - const int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rGlobalToLocal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - globalVec, - localVec); -} - -// *************************************************************************** -void HipFiniteElementSpace::LocalToGlobal(const HipVector& localVec, - HipVector& globalVec) const -{ - const int vdim = GetVDim(); - const int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rLocalToGlobal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - localVec, - globalVec); -} - -} // namespace mfem diff --git a/hip/hip/fem/fespace.hpp b/hip/hip/fem/fespace.hpp deleted file mode 100644 index c4ef0002..00000000 --- a/hip/hip/fem/fespace.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-///////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2018,2019 Advanced Micro Devices, Inc. -///////////////////////////////////////////////////////////////////////////////// -#ifndef LAGHOS_HIP_FESPACE -#define LAGHOS_HIP_FESPACE - -namespace mfem -{ - -// *************************************************************************** -// * HipFiniteElementSpace -// ************************************************************************** -class HipFiniteElementSpace : public ParFiniteElementSpace -{ -private: - int globalDofs, localDofs; - HipArray offsets; - HipArray indices, *reorderIndices; - HipArray map; - HipOperator *restrictionOp, *prolongationOp; -public: - HipFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_ = 1, - Ordering::Type ordering_ = Ordering::byNODES); - ~HipFiniteElementSpace(); - // ************************************************************************* - bool hasTensorBasis() const; - int GetLocalDofs() const { return localDofs; } - const HipOperator* GetRestrictionOperator() { return restrictionOp; } - const HipOperator* GetProlongationOperator() { return prolongationOp; } - const HipArray& GetLocalToGlobalMap() const { return map; } - // ************************************************************************* - void GlobalToLocal(const HipVector&, HipVector&) const; - void LocalToGlobal(const HipVector&, HipVector&) const; -}; - -} // mfem - -#endif // LAGHOS_HIP_FESPACE diff --git a/hip/hip/fem/hipGridfunc.cpp b/hip/hip/fem/hipGridfunc.cpp deleted file mode 100644 index 72bbcd8c..00000000 --- a/hip/hip/fem/hipGridfunc.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. 
For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -void HipGridFunction::ToQuad(const IntegrationRule& ir, - HipVector& quadValues) -{ - const FiniteElement& fe = *(fes.GetFE(0)); - const int dim = fe.GetDim(); - const int vdim = fes.GetVDim(); - const int elements = fes.GetNE(); - const int numQuad = ir.GetNPoints(); - const HipDofQuadMaps* maps = HipDofQuadMaps::Get(fes, ir); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir.GetOrder()).GetNPoints(); - const int dofs1D =fes.GetFE(0)->GetOrder() + 1; - quadValues.SetSize(numQuad * elements); - if (rconfig::Get().Share()) - { - rGridFuncToQuadS(dim,vdim,dofs1D,quad1D,elements, - maps->dofToQuad, - fes.GetLocalToGlobalMap(), - ptr(), - quadValues); - } - else - rGridFuncToQuad(dim,vdim,dofs1D,quad1D,elements, - maps->dofToQuad, - fes.GetLocalToGlobalMap(), - ptr(), - quadValues); -} - -} // mfem diff --git a/hip/hip/fem/hipGridfunc.hpp b/hip/hip/fem/hipGridfunc.hpp deleted file mode 100644 index 46839850..00000000 --- a/hip/hip/fem/hipGridfunc.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_GRIDFUNC -#define LAGHOS_HIP_GRIDFUNC - -namespace mfem -{ - -class HipGridFunction : public HipVector -{ -public: - const HipFiniteElementSpace& fes; -public: - - HipGridFunction(const HipFiniteElementSpace& f): - HipVector(f.GetVSize()),fes(f) {} - - HipGridFunction(const HipFiniteElementSpace& f,const HipVector* v): - HipVector(v), fes(f) {} - - void ToQuad(const IntegrationRule&,HipVector&); - - HipGridFunction& operator=(const HipVector& v) - { - HipVector::operator=(v); - return *this; - } - HipGridFunction& operator=(const Vector& v) - { - HipVector::operator=(v); - return *this; - } -}; - -} // mfem - -#endif // LAGHOS_HIP_GRIDFUNC diff --git a/hip/hip/fem/prolong.cpp b/hip/hip/fem/prolong.cpp deleted file mode 100644 index e72b10a7..00000000 --- a/hip/hip/fem/prolong.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * HipProlongationOperator -// *************************************************************************** -HipProlongationOperator::HipProlongationOperator -(const HipConformingProlongationOperator* Op): - HipOperator(Op->Height(), Op->Width()),pmat(Op) {} - -// *************************************************************************** -void HipProlongationOperator::Mult(const HipVector& x, - HipVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_Mult(x, y); - return; - } - const Vector hostX=x;//D2H - Vector hostY(y.Size()); - pmat->h_Mult(hostX, hostY); - y=hostY;//H2D -} - -// *************************************************************************** -void HipProlongationOperator::MultTranspose(const HipVector& x, - HipVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_MultTranspose(x, y); - return; - } - const Vector hostX=x; - Vector hostY(y.Size()); - pmat->h_MultTranspose(hostX, hostY); - y=hostY;//H2D -} - -} // namespace mfem diff --git a/hip/hip/fem/prolong.hpp b/hip/hip/fem/prolong.hpp deleted file mode 100644 index 0df0f781..00000000 --- a/hip/hip/fem/prolong.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_PROLONG_OP -#define LAGHOS_HIP_PROLONG_OP - -namespace mfem -{ - -// *************************************************************************** -// * HipProlongationOperator -// *************************************************************************** -class HipProlongationOperator : public HipOperator -{ -protected: - const HipConformingProlongationOperator* pmat = NULL; -public: - HipProlongationOperator(const HipConformingProlongationOperator*); - void Mult(const HipVector& x, HipVector& y) const; - void MultTranspose(const HipVector& x, HipVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_HIP_PROLONG_OP diff --git a/hip/hip/fem/restrict.cpp b/hip/hip/fem/restrict.cpp deleted file mode 100644 index 044c3045..00000000 --- a/hip/hip/fem/restrict.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * HipRestrictionOperator -// *************************************************************************** -void HipRestrictionOperator::Mult(const HipVector& x, - HipVector& y) const -{ - rExtractSubVector(entries, indices->ptr(), x, y); -} - -} // namespace mfem diff --git a/hip/hip/fem/restrict.hpp b/hip/hip/fem/restrict.hpp deleted file mode 100644 index 4eda79e2..00000000 --- a/hip/hip/fem/restrict.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_RESTRICT_OP -#define LAGHOS_HIP_RESTRICT_OP - -namespace mfem -{ - -// *************************************************************************** -// * HipRestrictionOperator -// *************************************************************************** -class HipRestrictionOperator : public HipOperator -{ -protected: - int entries; - const HipArray *indices; -public: - HipRestrictionOperator(const int h, const int w, - const HipArray *idx): - HipOperator(h,w), - entries(idx->size()>>1), - indices(idx) {} - void Mult(const HipVector& x, HipVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_HIP_RESTRICT_OP diff --git a/hip/hip/general/array.hpp b/hip/hip/general/array.hpp deleted file mode 100644 index b230f1d4..00000000 --- a/hip/hip/general/array.hpp +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_ARRAY -#define LAGHOS_HIP_ARRAY - -namespace mfem -{ - -template class HipArray; - -// Partial Specializations for xyz==TRUE ************************************* -template class HipArray : public rmalloc -{ -private: - T* data = NULL; - size_t sz,d[4]; -public: - HipArray():data(NULL),sz(0),d{0,0,0,0} {} - HipArray(const size_t x) {allocate(x);} - HipArray(const size_t x,const size_t y) {allocate(x,y);} - HipArray(const HipArray &r) {assert(false);} - HipArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - ~HipArray() {rmalloc::operator delete (data);} - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline const T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const HipArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - data=(T*) rmalloc::operator new (sz); - } - inline T& operator[](const size_t x) { return data[x]; } - inline T& operator()(const size_t x, const size_t y) - { - return data[x + d[0]*y]; - } - inline T& operator()(const size_t x, const size_t y, const size_t z) - { - return data[x + d[0]*(y + d[1]*z)]; - } - void Print(std::ostream& out= std::cout, int width = 8) const - { - T *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i class HipArray : public rmalloc -{ -private: - static const int DIM = 4; - T* data = NULL; - size_t sz,d[DIM]; -public: - HipArray():data(NULL),sz(0),d{0,0,0,0} {} - HipArray(const size_t d0) 
{allocate(d0);} - HipArray(const HipArray &r) {assert(false);} - ~HipArray() {rmalloc::operator delete (data);} - HipArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const HipArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - assert(sz>0); - data=(T*) rmalloc::operator new (sz); -#define xsw(a,b) a^=b^=a^=b - if (transposed) { xsw(d[0],d[1]); } - for (size_t i=1,b=d[0]; i static __global__ -void k_CopyGroupToBuffer(T *buf,const T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - buf[j]=data[idx]; -} - -// *************************************************************************** -// *************************************************************************** -template static -T *d_CopyGroupToBuffer_k(const T *d_ldata,T *d_buf, - const HipTable &d_dofs, - const int group) -{ - const int ndofs = d_dofs.RowSize(group); - const int *dofs = d_dofs.GetRow(group); - hipLaunchKernelGGL((k_CopyGroupToBuffer), dim3(ndofs), dim3(1), 0, 0, d_buf,d_ldata,dofs); - return d_buf + ndofs; -} - -// *************************************************************************** -// * d_CopyGroupToBuffer -// *************************************************************************** -template -T *HipCommD::d_CopyGroupToBuffer(const T *d_ldata, T *d_buf, - int 
group, int layout) const -{ - if (layout==2) // master - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ltdof,group); - } - if (layout==0) // slave - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ldof,group); - } - assert(false); - return 0; -} - -// *************************************************************************** -// * k_CopyGroupFromBuffer -// *************************************************************************** -template static __global__ -void k_CopyGroupFromBuffer(const T *buf,T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - data[idx]=buf[j]; -} - -// *************************************************************************** -// * d_CopyGroupFromBuffer -// *************************************************************************** -template -const T *HipCommD::d_CopyGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout) const -{ - assert(layout==0); - const int ndofs = d_group_ldof.RowSize(group); - const int *dofs = d_group_ldof.GetRow(group); - hipLaunchKernelGGL((k_CopyGroupFromBuffer), dim3(ndofs), dim3(1), 0, 0, - d_buf,d_ldata,dofs); - return d_buf + ndofs; -} - -// *************************************************************************** -// * kAtomicAdd -// *************************************************************************** -template -static __global__ void kAtomicAdd(T* adrs, const int* dofs,T *value) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[i]; - adrs[idx] += value[i]; -} -template __global__ void kAtomicAdd(int*, const int*, int*); -template __global__ void kAtomicAdd(double*, const int*, double*); - -// *************************************************************************** -// * ReduceGroupFromBuffer -// *************************************************************************** -template -const T *HipCommD::d_ReduceGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout, 
- void (*Op)(OpData)) const -{ - OpData opd; - opd.ldata = d_ldata; - opd.nldofs = group_ldof.RowSize(group); - opd.nb = 1; - opd.buf = const_cast(d_buf); - opd.ldofs = const_cast(d_group_ltdof.GetRow(group)); - assert(opd.nb == 1); - hipLaunchKernelGGL((kAtomicAdd), dim3(opd.nldofs), dim3(1), 0, 0, - opd.ldata,opd.ldofs,opd.buf); - return d_buf + opd.nldofs; -} - - -// *************************************************************************** -// * d_BcastBegin -// *************************************************************************** -template -void HipCommD::d_BcastBegin(T *d_ldata, int layout) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - - assert(layout==2); - // const int rnk = rconfig::Get().Rank(); - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_send_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_send_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - assert(layout==2); - d_buf = d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 2); - buf += d_buf - d_buf_ini; - } - if (!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - - // make sure the device has finished - if (rconfig::Get().Aware()) - { - hipStreamSynchronize(0);//*rconfig::Get().Stream()); - } - - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - 
gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - request_counter++; - } - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 1; // 1 - locked for Bcast - num_requests = request_counter; -} - -// *************************************************************************** -// * d_BcastEnd -// *************************************************************************** -template -void HipCommD::d_BcastEnd(T *d_ldata, int layout) -{ - if (comm_lock == 0) { return; } - // const int rnk = rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). 
- assert(comm_lock == 1); - // copy the received data from the buffer to d_ldata, as it arrives - int idx; - while (MPI_Waitany(num_requests, requests, &idx, MPI_STATUS_IGNORE), - idx != MPI_UNDEFINED) - { - int nbr = request_marker[idx]; - if (nbr == -1) { continue; } // skip send requests - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_CopyGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * d_ReduceBegin -// *************************************************************************** -template -void HipCommD::d_ReduceBegin(const T *d_ldata) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - // const int rnk = rconfig::Get().Rank(); - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_recv_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_recv_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - d_buf = d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 0); - buf += d_buf - d_buf_ini; - } - if 
(!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - // make sure the device has finished - if (rconfig::Get().Aware()) - { - hipStreamSynchronize(0);//*rconfig::Get().Stream()); - } - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - request_counter++; - } - - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 2; - num_requests = request_counter; -} - -// *************************************************************************** -// * d_ReduceEnd -// *************************************************************************** -template -void HipCommD::d_ReduceEnd(T *d_ldata, int layout, - void (*Op)(OpData)) -{ - if (comm_lock == 0) { return; } - // const int rnk = rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). 
- assert(comm_lock == 2); - MPI_Waitall(num_requests, requests, MPI_STATUSES_IGNORE); - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - assert(d_group_buf); - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_ReduceGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout, Op); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * instantiate HipCommD::Bcast and Reduce for doubles -// *************************************************************************** -template void HipCommD::d_BcastBegin(double*, int); -template void HipCommD::d_BcastEnd(double*, int); -template void HipCommD::d_ReduceBegin(const double *); -template void HipCommD::d_ReduceEnd(double*,int, - void (*)(OpData)); - -} // namespace mfem diff --git a/hip/hip/general/commd.hpp b/hip/hip/general/commd.hpp deleted file mode 100644 index dca08478..00000000 --- a/hip/hip/general/commd.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_COMM_D -#define LAGHOS_HIP_COMM_D - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -// * First communicator, buf goes on the device -// *************************************************************************** -class HipCommD : public GroupCommunicator, public rmemcpy -{ -private: - HipTable d_group_ldof; - HipTable d_group_ltdof; - void *d_group_buf; - int comm_lock; - int num_requests; -public: - HipCommD(ParFiniteElementSpace&); - ~HipCommD(); - - template T *d_CopyGroupToBuffer(const T*,T*,int,int) const; - template - const T *d_CopyGroupFromBuffer(const T*, T*,int, int) const; - template - const T *d_ReduceGroupFromBuffer(const T*,T*,int,int, - void (*)(OpData)) const; - - template void d_BcastBegin(T*,int); - template void d_BcastEnd(T*, int); - - template void d_ReduceBegin(const T*); - template void d_ReduceEnd(T*,int,void (*)(OpData)); -}; - - -} // mfem - -#endif // LAGHOS_HIP_COMM_D diff --git a/hip/hip/general/malloc.hpp b/hip/hip/general/malloc.hpp deleted file mode 100644 index c0e32dd7..00000000 --- a/hip/hip/general/malloc.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_MALLOC -#define LAGHOS_HIP_MALLOC - -namespace mfem -{ - -// *************************************************************************** -template struct rmalloc: public rmemcpy -{ - - // ************************************************************************* - inline void* operator new (size_t n, bool lock_page = false) - { - if (!rconfig::Get().Hip()) { return ::new T[n]; } - void *ptr; - - if (lock_page) { hipHostMalloc(&ptr, n*sizeof(T), hipHostMallocMapped); } - else { hipMalloc((void**)&ptr, n*sizeof(T)); } - - return ptr; - } - - // *************************************************************************** - inline void operator delete (void *ptr) - { - if (!rconfig::Get().Hip()) - { - if (ptr) - { - ::delete[] static_cast(ptr); - } - } - else - { - hipFree(ptr); // or hipHostFree if page_locked was used - } - ptr = nullptr; - } -}; - -} // mfem - -#endif // LAGHOS_HIP_MALLOC diff --git a/hip/hip/general/memcpy.cpp b/hip/hip/general/memcpy.cpp deleted file mode 100644 index df728b55..00000000 --- a/hip/hip/general/memcpy.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -namespace mfem -{ - -// ************************************************************************* -void* rmemcpy::rHtoH(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - std::memcpy(dest,src,bytes); - return dest; -} - -// ************************************************************************* -void* rmemcpy::rHtoD(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - hipMemcpy(dest,src,bytes,hipMemcpyHostToDevice); - - return dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoH(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - hipMemcpy(dest,src,bytes,hipMemcpyDeviceToHost); - - return 
dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoD(void *dest, const void *src, - std::size_t bytes, const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - if (!async) - { - hipMemcpy(dest,src,bytes,hipMemcpyDeviceToDevice); - } - else - { - const hipStream_t s = *rconfig::Get().Stream(); - hipMemcpyAsync(dest,src,bytes,hipMemcpyDeviceToDevice,s); - } - - return dest; -} - -} // mfem diff --git a/hip/hip/general/memcpy.hpp b/hip/hip/general/memcpy.hpp deleted file mode 100644 index 8a50a3c3..00000000 --- a/hip/hip/general/memcpy.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_MEMCPY -#define LAGHOS_HIP_MEMCPY - -namespace mfem -{ - -// *************************************************************************** -struct rmemcpy -{ - static void* rHtoH(void*, const void*, std::size_t, const bool =false); - static void* rHtoD(void*, const void*, std::size_t, const bool =false); - static void* rDtoH(void*, const void*, std::size_t, const bool =false); - static void* rDtoD(void*, const void*, std::size_t, const bool =false); -}; - -} // mfem - -#endif // LAGHOS_HIP_MEMCPY diff --git a/hip/hip/general/table.cpp b/hip/hip/general/table.cpp deleted file mode 100644 index 5dfc4961..00000000 --- a/hip/hip/general/table.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../hip.hpp" - -namespace mfem -{ - -// *************************************************************************** -HipTable::HipTable(const Table &table) -{ - size = table.Size(); - assert(size > 0); - const int nnz = table.GetI()[size]; - I = new int[size+1]; - J = (int*) operator new (nnz); - rHtoH(I,table.GetI(),sizeof(int)*(size+1)); - if (nnz>0) - { - assert(table.GetJ()); - rHtoD(J,table.GetJ(),sizeof(int)*nnz); - } -} - -} // mfem diff --git a/hip/hip/general/table.hpp b/hip/hip/general/table.hpp deleted file mode 100644 index bab06cfd..00000000 --- a/hip/hip/general/table.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_TABLE -#define LAGHOS_HIP_TABLE - -namespace mfem -{ - -class HipTable : public rmalloc -{ -private: - int size = 0; - int *I = NULL; - int *J = NULL; -public: - HipTable(const Table&); - inline int Size() {return size;} - int RowSize(int i) const { return I[i+1]-I[i]; } - const int *GetRow(int i) const { return J+I[i]; } - int *GetRow(int i) { return J+I[i]; } -}; - -} // mfem - -#endif // LAGHOS_HIP_TABLE diff --git a/hip/hip/hip.hpp b/hip/hip/hip.hpp deleted file mode 100644 index 5d923314..00000000 --- a/hip/hip/hip.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef LAGHOS_HIP -#define LAGHOS_HIP - -// stdincs ********************************************************************* -#include -#include -#include - -// HIP ************************************************************************ -#include - -// MFEM/fem ******************************************************************* -#include "fem/gridfunc.hpp" -#include "general/communication.hpp" -#include "fem/pfespace.hpp" - -// LAGHOS/hip/config ********************************************************** -#include "./config/config.hpp" - -// LAGHOS/hip/general ********************************************************* -#include "./general/memcpy.hpp" -#include "./general/malloc.hpp" -#include "./general/array.hpp" -#include "./general/table.hpp" -#include "./general/commd.hpp" - -// LAGHOS/hip/linalg ********************************************************** -#include "./linalg/vector.hpp" -#include "./linalg/operator.hpp" -#include "./linalg/ode.hpp" -#include "./linalg/solvers.hpp" - -// LAGHOS/hip/kernels ********************************************************* -#include "./kernels/include/kernels.hpp" - -// LAGHOS/hip/fem ************************************************************* -#include "./fem/conform.hpp" -#include "./fem/prolong.hpp" -#include "./fem/restrict.hpp" -#include "./fem/fespace.hpp" -#include "./fem/bilinearform.hpp" -#include "./fem/hipGridfunc.hpp" -#include "./fem/bilininteg.hpp" - 
-#endif // LAGHOS_HIP - diff --git a/hip/hip/kernels/blas/vector_axpy.cpp b/hip/hip/kernels/blas/vector_axpy.cpp deleted file mode 100644 index c84f9b25..00000000 --- a/hip/hip/kernels/blas/vector_axpy.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_axpy0(const int N, - const double alpha, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] += alpha * v1[i]; } -} - - -// ***************************************************************************** -void vector_axpy(const int N, - const double alpha, - double* __restrict v0, - const double* __restrict v1) -{ - hipKer(vector_axpy,N,alpha,v0,v1); -} diff --git a/hip/hip/kernels/blas/vector_clear_dofs.cpp b/hip/hip/kernels/blas/vector_clear_dofs.cpp deleted file mode 100644 index ebe187bf..00000000 --- a/hip/hip/kernels/blas/vector_clear_dofs.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_clear_dofs0(const int N, - double* __restrict v0, - const int* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[v1[i]] = 0.0; } -} - -// ***************************************************************************** -void vector_clear_dofs(const int N, - double* __restrict v0, - const int* __restrict v1) -{ - hipKer(vector_clear_dofs,N,v0,v1); -} diff --git a/hip/hip/kernels/blas/vector_dot.cpp b/hip/hip/kernels/blas/vector_dot.cpp deleted file mode 100644 index f143d6da..00000000 --- a/hip/hip/kernels/blas/vector_dot.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -#define HIP_BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void hipKernelDot(const size_t N, double *gdsr, - const double *x, const double *y) -{ - __shared__ double s_dot[HIP_BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_dot[tid] = x[n] * y[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_dot[tid] += s_dot[dualTid]; - } - if (tid==0) { gdsr[bid] = s_dot[0]; } -} - -// ***************************************************************************** -double hipVectorDot(const size_t N, const double *x, const double *y) -{ - const size_t tpb = HIP_BLOCKSIZE; - const size_t blockSize = HIP_BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t dot_sz = (N%tpb)==0? (N/tpb) : (1+N/tpb); - const size_t bytes = dot_sz*sizeof(double); - static double *h_dot = NULL; - if (!h_dot) { h_dot = (double*)calloc(dot_sz,sizeof(double)); } - static void* gdsr = (void*) NULL; - if (!gdsr) { hipMalloc(&gdsr,bytes); } - hipLaunchKernelGGL((hipKernelDot),dim3(gridSize),dim3(blockSize), 0, 0, - N, (double*)gdsr, x, y); - hipMemcpy((void*)h_dot,(void*)gdsr,bytes, hipMemcpyDeviceToHost); - double dot = 0.0; - for (size_t i=0; i= 0 ? 
v1[dof_i] : -v1[-dof_i-1]; - } -} - -// ***************************************************************************** -void vector_get_subvector(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - hipKer(vector_get_subvector,N,v0,v1,v2); -} - diff --git a/hip/hip/kernels/blas/vector_map_dofs.cpp b/hip/hip/kernels/blas/vector_map_dofs.cpp deleted file mode 100644 index e7523e14..00000000 --- a/hip/hip/kernels/blas/vector_map_dofs.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_map_dofs0(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) - { - const int idx = v2[i]; - v0[idx] = v1[idx]; - } -} - -// ***************************************************************************** -void vector_map_dofs(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* v2) -{ - hipKer(vector_map_dofs,N,v0,v1,v2); -} diff --git a/hip/hip/kernels/blas/vector_min.cpp b/hip/hip/kernels/blas/vector_min.cpp deleted file mode 100644 index dfab4a18..00000000 --- a/hip/hip/kernels/blas/vector_min.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -#define HIP_BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void hipKernelMin(const size_t N, double *gdsr, const double *x) -{ - __shared__ double s_min[HIP_BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_min[tid] = x[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_min[tid] = fmin(s_min[tid],s_min[dualTid]); - } - if (tid==0) { gdsr[bid] = s_min[0]; } -} - -// ***************************************************************************** -double hipVectorMin(const size_t N, const double *x) -{ - const size_t tpb = HIP_BLOCKSIZE; - const size_t blockSize = HIP_BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t min_sz = (N%tpb)==0? 
(N/tpb) : (1+N/tpb); - const size_t bytes = min_sz*sizeof(double); - static double *h_min = NULL; - if (!h_min) { h_min = (double*)calloc(min_sz,sizeof(double)); } - static void* gdsr = NULL; - if (!gdsr) { hipMalloc(&gdsr,bytes); } - hipLaunchKernelGGL((hipKernelMin), dim3(gridSize), dim3(blockSize), 0, 0, - N, (double*)gdsr, x); - hipMemcpy((void*)h_min,(void*)gdsr,bytes,hipMemcpyDeviceToHost); - double min = HUGE_VAL; - for (size_t i=0; i= 0; - const int idx = tst?dof_i:-dof_i-1; - const double value = tst?v1[i]:-v1[i]; - v0[idx]=value; - } -} - -// ***************************************************************************** -void vector_set_subvector(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - hipKer(vector_set_subvector,N,v0,v1,v2); -} diff --git a/hip/hip/kernels/blas/vector_set_subvector_const.cpp b/hip/hip/kernels/blas/vector_set_subvector_const.cpp deleted file mode 100644 index bde2667c..00000000 --- a/hip/hip/kernels/blas/vector_set_subvector_const.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_set_subvector_const0(const int N, - const double value, - double* __restrict data, - const int* __restrict tdofs) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= N) { return; } - const int dof_i = tdofs[i]; - data[dof_i] = value; - if (dof_i >= 0) - { - data[dof_i] = value; - } - else - { - data[-dof_i-1] = -value; - } -} - -// ***************************************************************************** -void vector_set_subvector_const(const int N, - const double value, - double* __restrict data, - const int* __restrict tdofs) -{ - hipKer(vector_set_subvector_const,N,value,data,tdofs); -} diff --git a/hip/hip/kernels/blas/vector_vec_add.cpp b/hip/hip/kernels/blas/vector_vec_add.cpp deleted file mode 100644 index 53ab1109..00000000 --- a/hip/hip/kernels/blas/vector_vec_add.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_add0(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] += v1[i]; } -} - -// ***************************************************************************** -void vector_vec_add(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - hipKer(vector_vec_add,N,v0,v1); -} diff --git a/hip/hip/kernels/blas/vector_vec_mul.cpp b/hip/hip/kernels/blas/vector_vec_mul.cpp deleted file mode 100644 index 07051a87..00000000 --- a/hip/hip/kernels/blas/vector_vec_mul.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_mul0(const int N, - double* __restrict v0, - const double d) -{ - const int i = blockDim.x*blockIdx.x+threadIdx.x; - if (i < N) { v0[i]*=d; } -} - -// ***************************************************************************** -void vector_vec_mul(const int N, - double* __restrict v0, - const double d) -{ - hipKer(vector_vec_mul,N,v0,d); -} diff --git a/hip/hip/kernels/blas/vector_vec_sub.cpp b/hip/hip/kernels/blas/vector_vec_sub.cpp deleted file mode 100644 index 70f0393e..00000000 --- a/hip/hip/kernels/blas/vector_vec_sub.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_vec_sub0(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] -= v1[i]; } -} - -// ***************************************************************************** -void vector_vec_sub(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - hipKer(vector_vec_sub,N,v0,v1); -} diff --git a/hip/hip/kernels/blas/vector_xpay.cpp b/hip/hip/kernels/blas/vector_xpay.cpp deleted file mode 100644 index f7b2c661..00000000 --- a/hip/hip/kernels/blas/vector_xpay.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_xpay0(const int N, - const double c0, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] = v1[i] + (c0 * v2[i]); } -} - -// ***************************************************************************** -void vector_xpay(const int N, - const double c0, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - hipKer(vector_xpay,N,c0,v0,v1,v2); -} diff --git a/hip/hip/kernels/blas/vector_xsy.cpp b/hip/hip/kernels/blas/vector_xsy.cpp deleted file mode 100644 index 18e3c231..00000000 --- a/hip/hip/kernels/blas/vector_xsy.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void vector_xsy0(const int N, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { v0[i] = v1[i]-v2[i]; } -} - -// ***************************************************************************** -void vector_xsy(const int N, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - hipKer(vector_xsy,N,v0,v1,v2); -} diff --git a/hip/hip/kernels/force/force.cpp b/hip/hip/kernels/force/force.cpp deleted file mode 100644 index b47eb5cb..00000000 --- a/hip/hip/kernels/force/force.cpp +++ /dev/null @@ -1,654 +0,0 @@ - -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -template kernel -static void rForceMult2D(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijkN(dx,dy,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int c = 0; c < 2; ++c) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] = 0.0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dxy[H1_DOFS_1D]; - double xy[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] = 0.0; - xy[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double esx = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * 
stressJinvT[ijklmNM(0,c,qx,qy, - el,NUM_DIM,NUM_QUAD_1D)]; - const double esy = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(1,c,qx,qy, - el,NUM_DIM,NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - xy[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] += wy* Dxy[dx] + wDy*xy[dx]; - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -static void rForceMultTranspose2D(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double vStress[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - double v_Dxy[NUM_QUAD_2D]; - double v_xDy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - v_Dxy[i] = v_xDy[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double v_x[NUM_QUAD_1D]; - double v_Dx[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] = v_Dx[qx] = 0; - } - - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - v_Dx[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = 
H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] += v_Dx[qx] * wy; - v_xDy[ijN(qx,qy,NUM_QUAD_1D)] += v_x[qx] * wDy; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijN(qx,qy,NUM_QUAD_1D)] += - ((v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)]) + - (v_xDy[ijN(qx,qy,NUM_QUAD_1D)] * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)])); - } - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijN(qx,qy,NUM_QUAD_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rForceMult3D(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double e_xyz[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - e_xyz[i] = 0; - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 
0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijklN(dx,dy,dz,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = L2DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)] += wz * e_xy[ijN(qx,qy,NUM_QUAD_1D)]; - } - } - } - } - for (int c = 0; c < 3; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double Dxy_x[H1_DOFS_1D * H1_DOFS_1D]; - double xDy_y[H1_DOFS_1D * H1_DOFS_1D]; - double xy_z[H1_DOFS_1D * H1_DOFS_1D] ; - for (int d = 0; d < (H1_DOFS_1D * H1_DOFS_1D); ++d) - { - Dxy_x[d] = xDy_y[d] = xy_z[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dx_x[H1_DOFS_1D]; - double x_y[H1_DOFS_1D]; - double x_z[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dx_x[dx] = x_y[dx] = x_z[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_e = e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - const double esx = r_e * stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - const double esy = r_e * stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - const double esz = r_e * stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { 
- Dx_x[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - x_y[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - x_z[dx] += esz * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy_x[ijN(dx,dy,H1_DOFS_1D)] += Dx_x[dx] * wy; - xDy_y[ijN(dx,dy,H1_DOFS_1D)] += x_y[dx] * wDy; - xy_z[ijN(dx,dy,H1_DOFS_1D)] += x_z[dx] * wy; - } - } - } - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - const double wz = H1QuadToDof[ijN(dz,qz,H1_DOFS_1D)]; - const double wDz = H1QuadToDofD[ijN(dz,qz,H1_DOFS_1D)]; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] += - ((Dxy_x[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xDy_y[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xy_z[ijN(dx,dy,H1_DOFS_1D)] * wDz)); - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -static void rForceMultTranspose3D(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double vStress[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - double Dxy_x[NUM_QUAD_2D]; - double xDy_y[NUM_QUAD_2D]; - double xy_z[NUM_QUAD_2D] ; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - Dxy_x[i] = xDy_y[i] = xy_z[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double Dx_x[NUM_QUAD_1D]; - double 
x_y[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] = x_y[qx] = 0; - } - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - x_y[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dxy_x[ijN(qx,qy,NUM_QUAD_1D)] += Dx_x[qx] * wy; - xDy_y[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wDy; - xy_z[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wy; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = H1DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = H1DofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)] += - ((Dxy_x[ijN(qx,qy,NUM_QUAD_1D)]*wz *stressJinvT[ijklmnNM(0,c,qx,qy,qz,el, - NUM_DIM,NUM_QUAD_1D)]) + - (xDy_y[ijN(qx,qy,NUM_QUAD_1D)]*wz *stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]) + - (xy_z[ijN(qx,qy,NUM_QUAD_1D)] *wDz*stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)])); - } - } - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double e_xy[L2_DOFS_1D * L2_DOFS_1D]; - for (int d = 0; d < (L2_DOFS_1D * L2_DOFS_1D); ++d) - { - e_xy[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - for (int dx = 0; dx < 
L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_xy[ijN(dx,dy,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - const double w = L2QuadToDof[ijN(dz,qz,L2_DOFS_1D)]; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] += w * e_xy[ijN(dx,dy,L2_DOFS_1D)]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fForceMult)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - -// ***************************************************************************** -void rForceMult(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (nzones+blck-1)/blck; - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - static std::unordered_map call = - { - {0x20,&rForceMult2D<2,2,2,1,2>}, - {0x21,&rForceMult2D<2,3,4,2,3>}, - {0x22,&rForceMult2D<2,4,6,3,4>}, - {0x23,&rForceMult2D<2,5,8,4,5>}, - {0x24,&rForceMult2D<2,6,10,5,6>}, - {0x25,&rForceMult2D<2,7,12,6,7>}, - {0x26,&rForceMult2D<2,8,14,7,8>}, - {0x27,&rForceMult2D<2,9,16,8,9>}, - 
{0x28,&rForceMult2D<2,10,18,9,10>}, - {0x29,&rForceMult2D<2,11,20,10,11>}, - {0x2A,&rForceMult2D<2,12,22,11,12>}, - {0x2B,&rForceMult2D<2,13,24,12,13>}, - {0x2C,&rForceMult2D<2,14,26,13,14>}, - {0x2D,&rForceMult2D<2,15,28,14,15>}, - {0x2E,&rForceMult2D<2,16,30,15,16>}, - {0x2F,&rForceMult2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMult3D<3,2,2,1,2>}, - {0x31,&rForceMult3D<3,3,4,2,3>}, - {0x32,&rForceMult3D<3,4,6,3,4>}, - {0x33,&rForceMult3D<3,5,8,4,5>}, - {0x34,&rForceMult3D<3,6,10,5,6>}, - {0x35,&rForceMult3D<3,7,12,6,7>}, - {0x36,&rForceMult3D<3,8,14,7,8>}, - {0x37,&rForceMult3D<3,9,16,8,9>}, - {0x38,&rForceMult3D<3,10,18,9,10>}, - {0x39,&rForceMult3D<3,11,20,10,11>}, - {0x3A,&rForceMult3D<3,12,22,11,12>}, - {0x3B,&rForceMult3D<3,13,24,12,13>}, - {0x3C,&rForceMult3D<3,14,26,13,14>}, - {0x3D,&rForceMult3D<3,15,28,14,15>}, - {0x3E,&rForceMult3D<3,16,30,15,16>}, - {0x3F,&rForceMult3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMult] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,e,v); -} - -// ***************************************************************************** -typedef void (*fForceMultTranspose)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rForceMultTranspose(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (nzones+blck-1)/blck; 
- assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rForceMultTranspose2D<2,2,2,1,2>}, - {0x21,&rForceMultTranspose2D<2,3,4,2,3>}, - {0x22,&rForceMultTranspose2D<2,4,6,3,4>}, - {0x23,&rForceMultTranspose2D<2,5,8,4,5>}, - {0x24,&rForceMultTranspose2D<2,6,10,5,6>}, - {0x25,&rForceMultTranspose2D<2,7,12,6,7>}, - {0x26,&rForceMultTranspose2D<2,8,14,7,8>}, - {0x27,&rForceMultTranspose2D<2,9,16,8,9>}, - {0x28,&rForceMultTranspose2D<2,10,18,9,10>}, - {0x29,&rForceMultTranspose2D<2,11,20,10,11>}, - {0x2A,&rForceMultTranspose2D<2,12,22,11,12>}, - {0x2B,&rForceMultTranspose2D<2,13,24,12,13>}, - {0x2C,&rForceMultTranspose2D<2,14,26,13,14>}, - {0x2D,&rForceMultTranspose2D<2,15,28,14,15>}, - {0x2E,&rForceMultTranspose2D<2,16,30,15,16>}, - {0x2F,&rForceMultTranspose2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMultTranspose3D<3,2,2,1,2>}, - {0x31,&rForceMultTranspose3D<3,3,4,2,3>}, - {0x32,&rForceMultTranspose3D<3,4,6,3,4>}, - {0x33,&rForceMultTranspose3D<3,5,8,4,5>}, - {0x34,&rForceMultTranspose3D<3,6,10,5,6>}, - {0x35,&rForceMultTranspose3D<3,7,12,6,7>}, - {0x36,&rForceMultTranspose3D<3,8,14,7,8>}, - {0x37,&rForceMultTranspose3D<3,9,16,8,9>}, - {0x38,&rForceMultTranspose3D<3,10,18,9,10>}, - {0x39,&rForceMultTranspose3D<3,11,20,10,11>}, - {0x3A,&rForceMultTranspose3D<3,12,22,11,12>}, - {0x3B,&rForceMultTranspose3D<3,13,24,12,13>}, - {0x3C,&rForceMultTranspose3D<3,14,26,13,14>}, - {0x3D,&rForceMultTranspose3D<3,15,28,14,15>}, - {0x3E,&rForceMultTranspose3D<3,16,30,15,16>}, - {0x3F,&rForceMultTranspose3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMultTranspose] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,v,e); -} - diff --git 
a/hip/hip/kernels/geom/initGeom.cpp b/hip/hip/kernels/geom/initGeom.cpp deleted file mode 100644 index 8533349c..00000000 --- a/hip/hip/kernels/geom/initGeom.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -kernel -void rNodeCopyByVDim0(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < elements) - { - for (int dof = 0; dof < numDofs; ++dof) - { - const int lid = dof+numDofs*e; - const int gid = eMap[lid]; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int voffset = gid+v*ndofs; - nodes[moffset] = Sx[voffset]; - } - } - } -} - -// ***************************************************************************** -void rNodeCopyByVDim(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes) -{ - hipKer(rNodeCopyByVDim,elements,numDofs,ndofs,dims,eMap,Sx,nodes); -} - - -// ***************************************************************************** -template kernel -void rIniGeom1D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double s_nodes[NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[d] = nodes[ijkN(0,d,e,NUM_QUAD)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijN(q,d,NUM_DOFS)]; - J11 += wx * s_nodes[d]; - } - J[ijN(q,e,NUM_QUAD)] = J11; - invJ[ijN(q, e,NUM_QUAD)] = 1.0 / J11; - detJ[ijN(q, e,NUM_QUAD)] = J11; - } - } -} - -// ***************************************************************************** -template kernel -void rIniGeom2D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, 
- double* restrict invJ, - double* restrict detJ) -{ - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double s_nodes[2 * NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d +=NUM_QUAD) - { - s_nodes[ijN(0,d,2)] = nodes[ijkNM(0,d,el,2,NUM_DOFS)]; - s_nodes[ijN(1,d,2)] = nodes[ijkNM(1,d,el,2,NUM_DOFS)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; - double J21 = 0; double J22 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0,q,d,2,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1,q,d,2,NUM_QUAD)]; - const double x = s_nodes[ijN(0,d,2)]; - const double y = s_nodes[ijN(1,d,2)]; - J11 += (wx * x); J12 += (wx * y); - J21 += (wy * x); J22 += (wy * y); - } - const double r_detJ = (J11 * J22)-(J12 * J21); - J[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, el,2,NUM_QUAD)] = J12; - J[ijklNM(0, 1, q, el,2,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J22; - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J22 * r_idetJ; - invJ[ijklNM(1, 0, q, el,2,NUM_QUAD)] = -J12 * r_idetJ; - invJ[ijklNM(0, 1, q, el,2,NUM_QUAD)] = -J21 * r_idetJ; - invJ[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J11 * r_idetJ; - detJ[ijN(q, el,NUM_QUAD)] = r_detJ; - } - } -} - -// ***************************************************************************** -template kernel -void rIniGeom3D(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double s_nodes[3*NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[ijN(0,d,3)] = nodes[ijkNM(0, d, e,3,NUM_DOFS)]; - s_nodes[ijN(1,d,3)] = nodes[ijkNM(1, d, e,3,NUM_DOFS)]; - s_nodes[ijN(2,d,3)] = nodes[ijkNM(2, d, e,3,NUM_DOFS)]; 
- } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; double J13 = 0; - double J21 = 0; double J22 = 0; double J23 = 0; - double J31 = 0; double J32 = 0; double J33 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0, q, d,3,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1, q, d,3,NUM_QUAD)]; - const double wz = dofToQuadD[ijkNM(2, q, d,3,NUM_QUAD)]; - const double x = s_nodes[ijN(0, d,3)]; - const double y = s_nodes[ijN(1, d,3)]; - const double z = s_nodes[ijN(2, d,3)]; - J11 += (wx * x); J12 += (wx * y); J13 += (wx * z); - J21 += (wy * x); J22 += (wy * y); J23 += (wy * z); - J31 += (wz * x); J32 += (wz * y); J33 += (wz * z); - } - const double r_detJ = ((J11 * J22 * J33) + (J12 * J23 * J31) + - (J13 * J21 * J32) - - (J13 * J22 * J31)-(J12 * J21 * J33)-(J11 * J23 * J32)); - J[ijklNM(0, 0, q, e,3,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, e,3,NUM_QUAD)] = J12; - J[ijklNM(2, 0, q, e,3,NUM_QUAD)] = J13; - J[ijklNM(0, 1, q, e,3,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, e,3,NUM_QUAD)] = J22; - J[ijklNM(2, 1, q, e,3,NUM_QUAD)] = J23; - J[ijklNM(0, 2, q, e,3,NUM_QUAD)] = J31; - J[ijklNM(1, 2, q, e,3,NUM_QUAD)] = J32; - J[ijklNM(2, 2, q, e,3,NUM_QUAD)] = J33; - - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J22 * J33)-(J23 * J32)); - invJ[ijklNM(1, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J32 * J13)-(J33 * J12)); - invJ[ijklNM(2, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J12 * J23)-(J13 * J22)); - - invJ[ijklNM(0, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J23 * J31)-(J21 * J33)); - invJ[ijklNM(1, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J33 * J11)-(J31 * J13)); - invJ[ijklNM(2, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J13 * J21)-(J11 * J23)); - - invJ[ijklNM(0, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J21 * J32)-(J22 * J31)); - invJ[ijklNM(1, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J31 * J12)-(J32 * J11)); - invJ[ijklNM(2, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J11 * J22)-(J12 * J21)); - detJ[ijN(q, e,NUM_QUAD)] = 
r_detJ; - } - } -} - -// ***************************************************************************** -typedef void (*fIniGeom)(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ); - - -// ***************************************************************************** -void rIniGeom(const int DIM, - const int NUM_DOFS, - const int NUM_QUAD, - const int numElements, - const double* dofToQuadD, - const double* nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int dofs1D = IROOT(DIM,NUM_DOFS); - const unsigned int quad1D = IROOT(DIM,NUM_QUAD); - const unsigned int id = (DIM<<4)|(dofs1D-2); - assert(LOG2(DIM)<=4); - assert(LOG2(dofs1D-2)<=4); - if (quad1D!=2*(dofs1D-1)) - { - printf("\033[31;1m[rIniGeom] order ERROR: -ok=p -ot=p-1, p in [1,16] (%d,%d)\033[m\n", - quad1D,dofs1D); - return exit(1); - } - assert(quad1D==2*(dofs1D-1)); - static std::unordered_map call = - { - // 2D - {0x20,&rIniGeom2D<2*2,(2*2-2)*(2*2-2)>}, - {0x21,&rIniGeom2D<3*3,(3*2-2)*(3*2-2)>}, - {0x22,&rIniGeom2D<4*4,(4*2-2)*(4*2-2)>}, - {0x23,&rIniGeom2D<5*5,(5*2-2)*(5*2-2)>}, - {0x24,&rIniGeom2D<6*6,(6*2-2)*(6*2-2)>}, - {0x25,&rIniGeom2D<7*7,(7*2-2)*(7*2-2)>}, - {0x26,&rIniGeom2D<8*8,(8*2-2)*(8*2-2)>}, - {0x27,&rIniGeom2D<9*9,(9*2-2)*(9*2-2)>}, - {0x28,&rIniGeom2D<10*10,(10*2-2)*(10*2-2)>}, - {0x29,&rIniGeom2D<11*11,(11*2-2)*(11*2-2)>}, - {0x2A,&rIniGeom2D<12*12,(12*2-2)*(12*2-2)>}, - {0x2B,&rIniGeom2D<13*13,(13*2-2)*(13*2-2)>}, - {0x2C,&rIniGeom2D<14*14,(14*2-2)*(14*2-2)>}, - {0x2D,&rIniGeom2D<15*15,(15*2-2)*(15*2-2)>}, - {0x2E,&rIniGeom2D<16*16,(16*2-2)*(16*2-2)>}, - {0x2F,&rIniGeom2D<17*17,(17*2-2)*(17*2-2)>}, - // 3D - {0x30,&rIniGeom3D<2*2*2,2*2*2>}, - {0x31,&rIniGeom3D<3*3*3,4*4*4>}, - {0x32,&rIniGeom3D<4*4*4,6*6*6>}, - {0x33,&rIniGeom3D<5*5*5,8*8*8>}, - 
{0x34,&rIniGeom3D<6*6*6,10*10*10>}, - {0x35,&rIniGeom3D<7*7*7,12*12*12>}, - {0x36,&rIniGeom3D<8*8*8,14*14*14>}, - {0x37,&rIniGeom3D<9*9*9,16*16*16>}, - {0x38,&rIniGeom3D<10*10*10,18*18*18>}, - {0x39,&rIniGeom3D<11*11*11,20*20*20>}, - {0x3A,&rIniGeom3D<12*12*12,22*22*22>}, - {0x3B,&rIniGeom3D<13*13*13,24*24*24>}, - {0x3C,&rIniGeom3D<14*14*14,26*26*26>}, - {0x3D,&rIniGeom3D<15*15*15,28*28*28>}, - {0x3E,&rIniGeom3D<16*16*16,30*30*30>}, - {0x3F,&rIniGeom3D<17*17*17,32*32*32>}, - }; - if (!call[id]) - { - printf("\n[rIniGeom] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuadD,nodes,J,invJ,detJ); -} diff --git a/hip/hip/kernels/hip.hpp b/hip/hip/kernels/hip.hpp deleted file mode 100644 index d1795d68..00000000 --- a/hip/hip/kernels/hip.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_KERNELS_HIP -#define LAGHOS_HIP_KERNELS_HIP - -// ***************************************************************************** -#include -#include -#include -#include -#include -#include -#include - -// ***************************************************************************** -#define LOG2(X) ((unsigned) (8*sizeof(unsigned long long)-__builtin_clzll((X)))) -#define ISQRT(N) sqrt(static_cast(N)) -#define ICBRT(N) cbrt(static_cast(N)) -#define IROOT(D,N) ((D==1)?N:(D==2)?ISQRT(N):(D==3)?ICBRT(N):0) - -// ***************************************************************************** -#include - -// ***************************************************************************** -#include "../config/config.hpp" -#include "../general/memcpy.hpp" -#include "../general/malloc.hpp" - -// ***************************************************************************** -#include "include/forall.hpp" -#include "include/offsets.hpp" -#include "include/kernels.hpp" - -#endif // LAGHOS_HIP_KERNELS_HIP diff --git a/hip/hip/kernels/include/forall.hpp b/hip/hip/kernels/include/forall.hpp deleted file mode 100644 index fca4cbe8..00000000 --- a/hip/hip/kernels/include/forall.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_KERNELS_FORALL -#define LAGHOS_HIP_KERNELS_FORALL - -// ***************************************************************************** -#define HIP_BLOCK_SIZE 256 - -#define ELEMENT_BATCH 10 -#define M2_ELEMENT_BATCH 32 - -// ***************************************************************************** -#define kernel __global__ -#define share __shared__ -#define sync __syncthreads(); -// ***************************************************************************** -#define hipKer(name,end,...) \ - hipLaunchKernelGGL((name ## 0), dim3((end+HIP_BLOCK_SIZE-1)/HIP_BLOCK_SIZE), \ - dim3(HIP_BLOCK_SIZE), 0, 0, end,__VA_ARGS__) -#define hipKerGBS(name,grid,block,end,...) hipLaunchKernelGGL((name ## 0), dim3(grid), dim3(block), 0, 0, end,__VA_ARGS__) -#define call0(id,grid,blck,...) hipLaunchKernelGGL((call[id]), dim3(grid), dim3(blck), 0, 0, __VA_ARGS__) - -#endif // LAGHOS_HIP_KERNELS_FORALL diff --git a/hip/hip/kernels/include/kernels.hpp b/hip/hip/kernels/include/kernels.hpp deleted file mode 100644 index 3ac56fff..00000000 --- a/hip/hip/kernels/include/kernels.hpp +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_KERNELS -#define LAGHOS_HIP_KERNELS - -// ***************************************************************************** -#define restrict __restrict__ - -// **** BLAS1 ****************************************************************** -void vector_neg(const int, double* restrict); -void vector_op_eq(const int, const double, double* restrict); -void vector_xpay(const int, const double, - double* restrict, const double* restrict, - const double* restrict); -void vector_xsy(const int, double* restrict, - const double* restrict, const double* restrict); -void vector_axpy(const int, const double, double* restrict, - const double* restrict); -void vector_map_dofs(const int, double* restrict, - const double* restrict, const int* restrict); -void vector_clear_dofs(const int, double* restrict, const int* restrict); -void vector_vec_sub(const int, double* restrict, const double* restrict); -void vector_vec_add(const int, double* restrict, const double* restrict); -void vector_vec_mul(const int, double* restrict, const double); -void vector_set_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_get_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_set_subvector_const(const int, const double, double* restrict, - const int* restrict); -double vector_dot(const int, const double* restrict, const double* restrict); -double vector_min(const int, const double* restrict); - -// 
***************************************************************************** -void reduceMin(int, const double*, double*); -void reduceSum(int, const double*, const double*, double*); - -// ***************************************************************************** -void rGridFuncToQuad(const int, const int, const int, - const int, const int, - const double* restrict, const int* restrict, - const double* restrict, double* restrict); - -void rGridFuncToQuadS(const int, const int, const int, - const int, const int, - const double* restrict, const int* restrict, - const double* restrict, double* restrict); - -// mapping ********************************************************************* -void rSetSubVector(const int, const int* restrict, - const double* restrict, double* restrict); - -void rMapSubVector(const int, const int* restrict, - const double* restrict, double* restrict); - -void rExtractSubVector(const int ries, const int* restrict, - const double* restrict, double* restrict); - -// kQuadratureData ************************************************************* -void rInitQuadratureData(const int, const int, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -void rUpdateQuadratureData(const double, const double, const double, - const bool, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict, double* restrict); -void rUpdateQuadratureDataS(const double, const double, const double, - const bool, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* 
restrict, - const double* restrict, const double* restrict, - double* restrict, double* restrict); - -// kForce ********************************************************************** -void rForceMult(const int, const int, const int, const int, const int, - const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); -void rForceMultS(const int, const int, const int, const int, const int, - const int, const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -void rForceMultTranspose(const int, const int, const int, const int, - const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); -void rForceMultTransposeS(const int, const int, const int, - const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, double* restrict); - -// ***************************************************************************** -void rNodeCopyByVDim(const int, const int, const int, const int, - const int* restrict, const double* restrict, - double* restrict); - -// ***************************************************************************** -void rIniGeom(const int, const int, const int, const int, - const double* restrict, const double* restrict, - double* restrict, double* restrict, double* restrict); - -// ***************************************************************************** -void rGlobalToLocal(const int, const bool, const int, const int, - const int* restrict, const int* restrict, - const double* restrict, double* restrict); - -void rLocalToGlobal(const int, const bool, const int, - const int, const int* restrict, const int* restrict, - const double* restrict, double* restrict); - -// 
***************************************************************************** -void rMassMultAdd(const int, const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict); - -void rMassMultAddS(const int, const int, const int, const int, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - const double* restrict, const double* restrict, - double* restrict); - -#endif // LAGHOS_HIP_KERNELS diff --git a/hip/hip/kernels/include/offsets.hpp b/hip/hip/kernels/include/offsets.hpp deleted file mode 100644 index 29e31e09..00000000 --- a/hip/hip/kernels/include/offsets.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_KERNEL_OFFSETS -#define LAGHOS_HIP_KERNEL_OFFSETS - -// N-Offsets ******************************************************************* -#define ijN(i,j,N) (i)+(N)*(j) -#define ijkN(i,j,k,N) (i)+(N)*((j)+(N)*(k)) -#define ijklN(i,j,k,l,N) (i)+(N)*((j)+(N)*((k)+(N)*(l))) - -// N,M-Offsets ***************************************************************** -#define ijNMt(i,j,N,M,t) (t)?((i)+(N)*(j)):((j)+(M)*(i)) -#define ijkNM(i,j,k,N,M) (i)+(N)*((j)+(M)*(k)) -#define _ijkNM(i,j,k,N,M) (j)+(N)*((k)+(M)*(i)) -#define ijklNM(i,j,k,l,N,M) (i)+(N)*((j)+(N)*((k)+(M)*(l))) -#define _ijklNM(i,j,k,l,N,M) (j)+(N)*((k)+(N)*((l)+(M)*(i))) -#define ijklmNM(i,j,k,l,m,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*(m)))) -#define _ijklmNM(i,j,k,l,m,N,M) (j)+(N)*((k)+(N)*((l)+(N)*((m)+(M)*(i)))) -#define ijklmnNM(i,j,k,l,m,n,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*((m)+(M)*(n))))) - -#endif // LAGHOS_HIP_KERNEL_OFFSETS diff --git a/hip/hip/kernels/maps/globalToLocal.cpp b/hip/hip/kernels/maps/globalToLocal.cpp deleted file mode 100644 index c63ced81..00000000 --- a/hip/hip/kernels/maps/globalToLocal.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void rGlobalToLocal0(const int globalEntries, - const int NUM_VDIM, - const bool VDIM_ORDERING, - const int localEntries, - const int* __restrict offsets, - const int* __restrict indices, - const double* __restrict globalX, - double* __restrict localX) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < globalEntries) - { - const int offset = offsets[i]; - const int nextOffset = offsets[i+1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - const double dofValue = globalX[g_offset]; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - localX[l_offset] = dofValue; - } - } - } -} - -// ***************************************************************************** -void rGlobalToLocal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* __restrict offsets, - const int* __restrict indices, - const double* __restrict globalX, - double* __restrict localX) -{ - hipKer(rGlobalToLocal,globalEntries,NUM_VDIM,VDIM_ORDERING, - localEntries,offsets,indices,globalX,localX); -} diff --git a/hip/hip/kernels/maps/localToGlobal.cpp b/hip/hip/kernels/maps/localToGlobal.cpp deleted file mode 100644 index bfe5ef74..00000000 --- a/hip/hip/kernels/maps/localToGlobal.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void rLocalToGlobal0(const int globalEntries, - const int NUM_VDIM, - const bool VDIM_ORDERING, - const int localEntries, - const int* offsets, - const int* indices, - const double* localX, - double* __restrict globalX) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < globalEntries) - { - const int offset = offsets[i]; - const int nextOffset = offsets[i + 1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - double dofValue = 0; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - dofValue += localX[l_offset]; - } - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - globalX[g_offset] = dofValue; - } - } -} - -// ***************************************************************************** -void rLocalToGlobal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* 
offsets, - const int* indices, - const double* localX, - double* __restrict globalX) -{ - hipKer(rLocalToGlobal,globalEntries,NUM_VDIM,VDIM_ORDERING, - localEntries,offsets,indices,localX,globalX); -} diff --git a/hip/hip/kernels/maps/mapping.cpp b/hip/hip/kernels/maps/mapping.cpp deleted file mode 100644 index 5bc660bf..00000000 --- a/hip/hip/kernels/maps/mapping.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void rSetSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { out[indices[i]] = in[i]; } -} - -// ***************************************************************************** -void rSetSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - hipKer(rSetSubVector,N,indices,in,out); -} - -// ***************************************************************************** -extern "C" kernel -void rMapSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) - { - const int fromIdx = indices[2*i + 0]; - const int toIdx = indices[2*i + 1]; - out[toIdx] = in[fromIdx]; - } -} - -// ***************************************************************************** -void rMapSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - hipKer(rMapSubVector,N,indices,in,out); -} - -// ***************************************************************************** -extern "C" kernel -void rExtractSubVector0(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < N) { out[i] = in[indices[i]]; } -} - -// ***************************************************************************** -void rExtractSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - hipKer(rExtractSubVector,N,indices,in,out); -} diff --git a/hip/hip/kernels/mass/assemble.cpp b/hip/hip/kernels/mass/assemble.cpp deleted file mode 100644 index e680877a..00000000 --- a/hip/hip/kernels/mass/assemble.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National 
Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble2D0(const int numElements, - const int NUM_QUAD_2D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - for (int q = 0; q < NUM_QUAD_2D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,2,NUM_QUAD_2D)]; - const double J12 = J[ijklNM(1,0,q,e,2,NUM_QUAD_2D)]; - const double J21 = J[ijklNM(0,1,q,e,2,NUM_QUAD_2D)]; - const double J22 = J[ijklNM(1,1,q,e,2,NUM_QUAD_2D)]; - const double detJ = ((J11 * J22)-(J21 * J12)); - oper[ijN(q,e,NUM_QUAD_2D)] = quadWeights[q] * COEFF * detJ; - } - } -} - -// ***************************************************************************** -static void rMassAssemble2D(const int numElements, - const int NUM_QUAD_2D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - 
hipKer(rMassAssemble2D,numElements,NUM_QUAD_2D,COEFF,quadWeights,J,oper); -} - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble3D0(const int numElements, - const int NUM_QUAD_3D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - for (int q = 0; q < NUM_QUAD_3D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,3,NUM_QUAD_3D)]; - const double J12 = J[ijklNM(1,0,q,e,3,NUM_QUAD_3D)]; - const double J13 = J[ijklNM(2,0,q,e,3,NUM_QUAD_3D)]; - const double J21 = J[ijklNM(0,1,q,e,3,NUM_QUAD_3D)]; - const double J22 = J[ijklNM(1,1,q,e,3,NUM_QUAD_3D)]; - const double J23 = J[ijklNM(2,1,q,e,3,NUM_QUAD_3D)]; - const double J31 = J[ijklNM(0,2,q,e,3,NUM_QUAD_3D)]; - const double J32 = J[ijklNM(1,2,q,e,3,NUM_QUAD_3D)]; - const double J33 = J[ijklNM(2,2,q,e,3,NUM_QUAD_3D)]; - const double detJ = ((J11*J22*J33)+(J12*J23*J31)+ - (J13*J21*J32)-(J13*J22*J31)- - (J12*J21*J33)-(J11*J23*J32)); - oper[ijN(q,e,NUM_QUAD_3D)] = quadWeights[q]*COEFF*detJ; - } - } -} -static void rMassAssemble3D(const int NUM_QUAD_3D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - hipKer(rMassAssemble3D,numElements,NUM_QUAD_3D,COEFF,quadWeights,J,oper); -} - -// ***************************************************************************** -void rMassAssemble(const int dim, - const int NUM_QUAD, - const int numElements, - const double* quadWeights, - const double* J, - const double COEFF, - double* __restrict oper) -{ - assert(false); - if (dim==1) { assert(false); } - if (dim==2) { rMassAssemble2D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } - if (dim==3) { rMassAssemble3D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } -} diff --git a/hip/hip/kernels/mass/multAdd.cpp b/hip/hip/kernels/mass/multAdd.cpp deleted file mode 
100644 index e2224465..00000000 --- a/hip/hip/kernels/mass/multAdd.cpp +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rMassMultAdd2D(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0.0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - sol_x[qy] = 0.0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijkN(dx,dy,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]* s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += d2q * sol_x[qx]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] *= oper[ijkN(qx,qy,e,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xy[qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double q2d = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijkN(dx,dy,e,NUM_DOFS_1D)] += q2d * sol_x[dx]; - } - } - } - } -} - -// 
***************************************************************************** -template kernel -void rMassMultAdd3D(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double sol_xyz[NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] = 0; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] = 0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += wy * sol_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] *= oper[ijklN(qx,qy,qz,e,NUM_QUAD_1D)]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double sol_xy[NUM_DOFS_1D][NUM_DOFS_1D]; - for (int dy = 0; dy < 
NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xyz[qz][qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] += wy * sol_x[dx]; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double wz = quadToDof[ijN(dz,qz,NUM_DOFS_1D)]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijklN(dx,dy,dz,e,NUM_DOFS_1D)] += wz * sol_xy[dy][dx]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fMassMultAdd)(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut); - -// ***************************************************************************** -void rMassMultAdd(const int DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* op, - const double* x, - double* __restrict y) -{ - const int blck = 256; - const int grid = (numElements+blck-1)/blck; - assert(LOG2(DIM)<=4); - assert((NUM_QUAD_1D&1)==0); - assert(LOG2(NUM_DOFS_1D-1)<=8); - assert(LOG2(NUM_QUAD_1D>>1)<=8); - const unsigned int id = (DIM<<16)|((NUM_DOFS_1D-1)<<8)|(NUM_QUAD_1D>>1); - static std::unordered_map call = - { - // 2D - {0x20001,&rMassMultAdd2D<1,2>}, {0x20101,&rMassMultAdd2D<2,2>}, - 
{0x20102,&rMassMultAdd2D<2,4>}, {0x20202,&rMassMultAdd2D<3,4>}, - {0x20203,&rMassMultAdd2D<3,6>}, {0x20303,&rMassMultAdd2D<4,6>}, - {0x20304,&rMassMultAdd2D<4,8>}, {0x20404,&rMassMultAdd2D<5,8>}, - {0x20405,&rMassMultAdd2D<5,10>}, {0x20505,&rMassMultAdd2D<6,10>}, - {0x20506,&rMassMultAdd2D<6,12>}, {0x20606,&rMassMultAdd2D<7,12>}, - {0x20607,&rMassMultAdd2D<7,14>}, {0x20707,&rMassMultAdd2D<8,14>}, - {0x20708,&rMassMultAdd2D<8,16>}, {0x20808,&rMassMultAdd2D<9,16>}, - {0x20809,&rMassMultAdd2D<9,18>}, {0x20909,&rMassMultAdd2D<10,18>}, - {0x2090A,&rMassMultAdd2D<10,20>}, {0x20A0A,&rMassMultAdd2D<11,20>}, - {0x20A0B,&rMassMultAdd2D<11,22>}, {0x20B0B,&rMassMultAdd2D<12,22>}, - {0x20B0C,&rMassMultAdd2D<12,24>}, {0x20C0C,&rMassMultAdd2D<13,24>}, - {0x20C0D,&rMassMultAdd2D<13,26>}, {0x20D0D,&rMassMultAdd2D<14,26>}, - {0x20D0E,&rMassMultAdd2D<14,28>}, {0x20E0E,&rMassMultAdd2D<15,28>}, - {0x20E0F,&rMassMultAdd2D<15,30>}, {0x20F0F,&rMassMultAdd2D<16,30>}, - {0x20F10,&rMassMultAdd2D<16,32>}, {0x21010,&rMassMultAdd2D<17,32>}, - // 3D - {0x30001,&rMassMultAdd3D<1,2>}, {0x30101,&rMassMultAdd3D<2,2>}, - {0x30102,&rMassMultAdd3D<2,4>}, {0x30202,&rMassMultAdd3D<3,4>}, - {0x30203,&rMassMultAdd3D<3,6>}, {0x30303,&rMassMultAdd3D<4,6>}, - {0x30304,&rMassMultAdd3D<4,8>}, {0x30404,&rMassMultAdd3D<5,8>}, - {0x30405,&rMassMultAdd3D<5,10>}, {0x30505,&rMassMultAdd3D<6,10>}, - {0x30506,&rMassMultAdd3D<6,12>}, {0x30606,&rMassMultAdd3D<7,12>}, - {0x30607,&rMassMultAdd3D<7,14>}, {0x30707,&rMassMultAdd3D<8,14>}, - {0x30708,&rMassMultAdd3D<8,16>}, {0x30808,&rMassMultAdd3D<9,16>}, - {0x30809,&rMassMultAdd3D<9,18>}, {0x30909,&rMassMultAdd3D<10,18>}, - {0x3090A,&rMassMultAdd3D<10,20>}, {0x30A0A,&rMassMultAdd3D<11,20>}, - {0x30A0B,&rMassMultAdd3D<11,22>}, {0x30B0B,&rMassMultAdd3D<12,22>}, - {0x30B0C,&rMassMultAdd3D<12,24>}, {0x30C0C,&rMassMultAdd3D<13,24>}, - {0x30C0D,&rMassMultAdd3D<13,26>}, {0x30D0D,&rMassMultAdd3D<14,26>}, - {0x30D0E,&rMassMultAdd3D<14,28>}, {0x30E0E,&rMassMultAdd3D<15,28>}, - 
{0x30E0F,&rMassMultAdd3D<15,30>}, {0x30F0F,&rMassMultAdd3D<16,30>}, - {0x30F10,&rMassMultAdd3D<16,32>}, {0x31010,&rMassMultAdd3D<17,32>}, - }; - if (!call[id]) - { - printf("\n[rMassMultAdd] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,dofToQuadD,quadToDof,quadToDofD,op,x,y); -} diff --git a/hip/hip/kernels/quad/gridFuncToQuad.cpp b/hip/hip/kernels/quad/gridFuncToQuad.cpp deleted file mode 100644 index aac60b7d..00000000 --- a/hip/hip/kernels/quad/gridFuncToQuad.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rGridFuncToQuad1D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double r_out[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[(dx) + (NUM_DOFS_1D) * (e)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid * NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] += r_gf * dofToQuad[(qx) + (NUM_QUAD_1D) * (dx)]; - } - } - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[(qx) + (NUM_QUAD_1D) * ((e) + (numElements) * (v))] = r_out[v][qx]; - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad2D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijkN(dx, dy, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qy 
= 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] += r_gf * dofToQuad[ijN(qy, dx,NUM_QUAD_1D)]; - } - } - } - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += d2q * out_x[v][qx]; - } - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklNM(v, qx, qy, e,NUM_QUAD_1D,numElements)] = out_xy[v][qy][qx]; - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad3D(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int e = blockDim.x * blockIdx.x + threadIdx.x; - if (e < numElements) - { - double out_xyz[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] = 0; - } - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijklN(dx, dy, dz, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] += r_gf * dofToQuad[ijN(qx, dx, 
NUM_QUAD_1D)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy, dy, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += wy * out_x[v][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz, dz, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] += wz * out_xy[v][qy][qx]; - } - } - } - } - } - - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklmNM(v, qx, qy, qz, e,NUM_QUAD_1D, - numElements)] = out_xyz[v][qz][qy][qx]; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fGridFuncToQuad)(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* gf, - double* restrict out); - -// ***************************************************************************** -void rGridFuncToQuad(const int DIM, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const int* l2gMap, - const double* gf, - double* __restrict out) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int id = (DIM<<8)|(NUM_VDIM<<4)|(NUM_DOFS_1D-1); - assert(LOG2(DIM)<=4); - assert(LOG2(NUM_VDIM)<=4); - assert(LOG2(NUM_DOFS_1D-1)<=4); - assert(NUM_QUAD_1D==2*NUM_DOFS_1D); - if (NUM_QUAD_1D!=2*NUM_DOFS_1D) - { - printf("\033[31;1m[rGridFuncToQuad] order ERROR: -ok=p -ot=p-1, p in [1,16]\033[m\n"); - return exit(1); - } - static std::unordered_map call = - { - // 2D - {0x210,&rGridFuncToQuad2D<1,1,2>}, - {0x211,&rGridFuncToQuad2D<1,2,4>}, - 
{0x212,&rGridFuncToQuad2D<1,3,6>}, - {0x213,&rGridFuncToQuad2D<1,4,8>}, - {0x214,&rGridFuncToQuad2D<1,5,10>}, - {0x215,&rGridFuncToQuad2D<1,6,12>}, - {0x216,&rGridFuncToQuad2D<1,7,14>}, - {0x217,&rGridFuncToQuad2D<1,8,16>}, - {0x218,&rGridFuncToQuad2D<1,9,18>}, - {0x219,&rGridFuncToQuad2D<1,10,20>}, - {0x21A,&rGridFuncToQuad2D<1,11,22>}, - {0x21B,&rGridFuncToQuad2D<1,12,24>}, - {0x21C,&rGridFuncToQuad2D<1,13,26>}, - {0x21D,&rGridFuncToQuad2D<1,14,28>}, - {0x21E,&rGridFuncToQuad2D<1,15,30>}, - {0x21F,&rGridFuncToQuad2D<1,16,32>}, - - // 3D - {0x310,&rGridFuncToQuad3D<1,1,2>}, - {0x311,&rGridFuncToQuad3D<1,2,4>}, - {0x312,&rGridFuncToQuad3D<1,3,6>}, - {0x313,&rGridFuncToQuad3D<1,4,8>}, - {0x314,&rGridFuncToQuad3D<1,5,10>}, - {0x315,&rGridFuncToQuad3D<1,6,12>}, - {0x316,&rGridFuncToQuad3D<1,7,14>}, - {0x317,&rGridFuncToQuad3D<1,8,16>}, - {0x318,&rGridFuncToQuad3D<1,9,18>}, - {0x319,&rGridFuncToQuad3D<1,10,20>}, - {0x31A,&rGridFuncToQuad3D<1,11,22>}, - {0x31B,&rGridFuncToQuad3D<1,12,24>}, - {0x31C,&rGridFuncToQuad3D<1,13,26>}, - {0x31D,&rGridFuncToQuad3D<1,14,28>}, - {0x31E,&rGridFuncToQuad3D<1,15,30>}, - {0x31F,&rGridFuncToQuad3D<1,16,32>}, - }; - if (!call[id]) - { - printf("\n[rGridFuncToQuad] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,l2gMap,gf,out); -} diff --git a/hip/hip/kernels/quad/qDataInit.cpp b/hip/hip/kernels/quad/qDataInit.cpp deleted file mode 100644 index 0432dc4e..00000000 --- a/hip/hip/kernels/quad/qDataInit.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - - -// ***************************************************************************** -template kernel -void rInitQuadData(const int nzones, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < nzones) - { - for (int q = 0; q < NUM_QUAD; ++q) - { - rho0DetJ0w[ijN(q,el,NUM_QUAD)] = - rho0[ijN(q,el,NUM_QUAD)]*detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - } - } -} - -// ***************************************************************************** -typedef void (*fInitQuadratureData)(const int,const double*,const double*, - const double*,double*); -void rInitQuadratureData(const int NUM_QUAD, - const int numElements, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (numElements+blck-1)/blck; - const unsigned int id = NUM_QUAD; - static std::unordered_map call = - { - {2,&rInitQuadData<2>}, - {4,&rInitQuadData<4>}, - {8,&rInitQuadData<8>}, - {16,&rInitQuadData<16>}, - {25,&rInitQuadData<25>}, - {36,&rInitQuadData<36>}, - {49,&rInitQuadData<49>}, - {64,&rInitQuadData<64>}, - {81,&rInitQuadData<81>}, - {100,&rInitQuadData<100>}, - {121,&rInitQuadData<121>}, - {125,&rInitQuadData<125>}, - {144,&rInitQuadData<144>}, - 
{196,&rInitQuadData<196>}, - {216,&rInitQuadData<216>}, - {256,&rInitQuadData<256>}, - {324,&rInitQuadData<324>}, - {400,&rInitQuadData<400>}, - {484,&rInitQuadData<484>}, - {512,&rInitQuadData<512>}, - {576,&rInitQuadData<576>}, - {676,&rInitQuadData<676>}, - {900,&rInitQuadData<900>}, - {1000,&rInitQuadData<1000>}, - {1024,&rInitQuadData<1024>}, - {1728,&rInitQuadData<1728>}, - {2744,&rInitQuadData<2744>}, - {4096,&rInitQuadData<4096>}, - {5832,&rInitQuadData<5832>}, - {8000,&rInitQuadData<8000>}, - {10648,&rInitQuadData<10648>}, - {13824,&rInitQuadData<13824>}, - {17576,&rInitQuadData<17576>}, - {21952,&rInitQuadData<21952>}, - {27000,&rInitQuadData<27000>}, - {32768,&rInitQuadData<32768>}, - }; - if (!call[id]) - { - printf("\n[rInitQuadratureData] id \033[33m0x%X (%d)\033[m ",id,id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,rho0,detJ,quadWeights,rho0DetJ0w); -} diff --git a/hip/hip/kernels/quad/qDataUpdate.cpp b/hip/hip/kernels/quad/qDataUpdate.cpp deleted file mode 100644 index 26b9181a..00000000 --- a/hip/hip/kernels/quad/qDataUpdate.cpp +++ /dev/null @@ -1,658 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData2D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - const int DIM = 2; - const int VDIMQ = DIM*DIM * NUM_QUAD_2D; - double s_gradv[VDIMQ]; - - for (int i = 0; i < VDIMQ; ++i) { s_gradv[i] = 0.0; } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[DIM*NUM_QUAD_1D]; - double vx[DIM*NUM_QUAD_1D]; - - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < DIM; ++c) - { - vDx[ijN(c,qx,DIM)] = 0.0; - vx[ijN(c,qx,DIM)] = 0.0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - const double wDx = dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - for (int c = 0; c < DIM; ++c) - { - vDx[ijN(c,qx,DIM)] += wDx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - vx[ijN(c,qx,DIM)] += wx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; 
++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < DIM; ++c) - { - s_gradv[ijkN(c,0,qx+qy*NUM_QUAD_1D,DIM)] += wy *vDx[ijN(c,qx,DIM)]; - s_gradv[ijkN(c,1,qx+qy*NUM_QUAD_1D,DIM)] += wDy*vx[ijN(c,qx,DIM)]; - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) - { - double q_gradv[NUM_DIM*NUM_DIM]; - double q_stress[NUM_DIM*NUM_DIM]; - - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_00)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_01)); - q_gradv[ijN(1,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_10)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_11)); - q_gradv[ijN(0,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_00)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_01)); - q_gradv[ijN(1,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_10)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_11)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - - // TODO: Input OccaVector eos(q,e) -> (stress,soundSpeed) - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,2)] = s; q_stress[ijN(1,0,2)] = 0; - q_stress[ijN(0,1,2)] = 0; q_stress[ijN(1,1,2)] = s; - - const double gradv00 = q_gradv[ijN(0,0,2)]; - const double gradv11 = q_gradv[ijN(1,1,2)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,2)]+q_gradv[ijN(0,1,2)]); - q_gradv[ijN(1,0,2)] = gradv10; - q_gradv[ijN(0,1,2)] = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) - { - minEig = (gradv00 < gradv11) ? 
gradv00 : gradv11; - } - else - { - const double zeta = (gradv11-gradv00) / (2.0*gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta+sqrt(1.0+zeta*zeta)); - if ((t < 0) != (zeta < 0)) - { - t = -t; - } - const double c = sqrt(1.0 / (1.0+t*t)); - const double s = c*t; - t *= gradv10; - if ((gradv00-t) <= (gradv11+t)) - { - minEig = gradv00-t; - comprDirX = c; - comprDirY = -s; - } - else - { - minEig = gradv11+t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)); - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)); - const double physDirX = (Jpi_00*comprDirX)+(Jpi_10*comprDirY); - const double physDirY = (Jpi_01*comprDirX)+(Jpi_11*comprDirY); - const double q_h = H0*sqrt((physDirX*physDirX)+(physDirY*physDirY)); - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) - { - for (int x = 0; x < NUM_DIM; ++x) - { - q_stress[ijN(x,y,2)] += 
coeff*q_gradv[ijN(x,y,2)]; - } - } - } - const double S00 = q_stress[ijN(0,0,2)]; - const double S10 = q_stress[ijN(1,0,2)]; - const double S01 = q_stress[ijN(0,1,2)]; - const double S11 = q_stress[ijN(1,1,2)]; - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)); - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)); - } - } -} - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData3D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - const int el = blockDim.x * blockIdx.x + threadIdx.x; - if (el < numElements) - { - double s_gradv[9*NUM_QUAD_3D]; - - for (int i = 0; i < (9*NUM_QUAD_3D); ++i) - { - s_gradv[i] = 0; - } - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double vDxy[3*NUM_QUAD_2D] ; - double vxDy[3*NUM_QUAD_2D] ; - double vxy[3*NUM_QUAD_2D] ; - for (int i = 0; i < (3*NUM_QUAD_2D); ++i) - { - vDxy[i] = 0; - vxDy[i] = 0; - vxy[i] = 0; - } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[3*NUM_QUAD_1D] ; - double vx[3*NUM_QUAD_1D] ; - for (int i = 0; i < (3*NUM_QUAD_1D); ++i) - { - vDx[i] = 0; - vx[i] = 0; - } - - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - for (int 
qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - vx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vDx[ijN(vi,qx,3)]; - vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wDy*vx[ijN(vi,qx,3)]; - vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vx[ijN(vi,qx,3)]; - } - } - } - } - for (int qz = 0; qz < NUM_DOFS_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = dofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const int q = qx+qy*NUM_QUAD_1D+qz*NUM_QUAD_2D; - for (int vi = 0; vi < 3; ++vi) - { - s_gradv[ijkN(vi,0,q,3)] += wz *vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,1,q,3)] += wz *vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,2,q,3)] += wDz*vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - } - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) - { - double q_gradv[9] ; - double q_stress[9] ; - - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_20 = invJ[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_21 = invJ[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_02 = invJ[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_12 = invJ[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_22 = 
invJ[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_00) + - (s_gradv[ijkN(1,0,q,3)]*invJ_01) + - (s_gradv[ijkN(2,0,q,3)]*invJ_02)); - q_gradv[ijN(1,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_10) + - (s_gradv[ijkN(1,0,q,3)]*invJ_11) + - (s_gradv[ijkN(2,0,q,3)]*invJ_12)); - q_gradv[ijN(2,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_20) + - (s_gradv[ijkN(1,0,q,3)]*invJ_21) + - (s_gradv[ijkN(2,0,q,3)]*invJ_22)); - - q_gradv[ijN(0,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_00) + - (s_gradv[ijkN(1,1,q,3)]*invJ_01) + - (s_gradv[ijkN(2,1,q,3)]*invJ_02)); - q_gradv[ijN(1,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_10) + - (s_gradv[ijkN(1,1,q,3)]*invJ_11) + - (s_gradv[ijkN(2,1,q,3)]*invJ_12)); - q_gradv[ijN(2,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_20) + - (s_gradv[ijkN(1,1,q,3)]*invJ_21) + - (s_gradv[ijkN(2,1,q,3)]*invJ_22)); - - q_gradv[ijN(0,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_00) + - (s_gradv[ijkN(1,2,q,3)]*invJ_01) + - (s_gradv[ijkN(2,2,q,3)]*invJ_02)); - q_gradv[ijN(1,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_10) + - (s_gradv[ijkN(1,2,q,3)]*invJ_11) + - (s_gradv[ijkN(2,2,q,3)]*invJ_12)); - q_gradv[ijN(2,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_20) + - (s_gradv[ijkN(1,2,q,3)]*invJ_21) + - (s_gradv[ijkN(2,2,q,3)]*invJ_22)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,3)] = s; q_stress[ijN(1,0,3)] = 0; q_stress[ijN(2,0,3)] = 0; - q_stress[ijN(0,1,3)] = 0; q_stress[ijN(1,1,3)] = s; q_stress[ijN(2,1,3)] = 0; - q_stress[ijN(0,2,3)] = 0; q_stress[ijN(1,2,3)] = 0; q_stress[ijN(2,2,3)] = s; - - const double gradv00 = q_gradv[ijN(0,0,3)]; - const double gradv11 = q_gradv[ijN(1,1,3)]; - const double gradv22 = q_gradv[ijN(2,2,3)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,3)]+q_gradv[ijN(0,1,3)]); - const double gradv20 = 
0.5*(q_gradv[ijN(2,0,3)]+q_gradv[ijN(0,2,3)]); - const double gradv21 = 0.5*(q_gradv[ijN(2,1,3)]+q_gradv[ijN(1,2,3)]); - q_gradv[ijN(1,0,3)] = gradv10; q_gradv[ijN(2,0,3)] = gradv20; - q_gradv[ijN(0,1,3)] = gradv10; q_gradv[ijN(2,1,3)] = gradv21; - q_gradv[ijN(0,2,3)] = gradv20; q_gradv[ijN(1,2,3)] = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00+gradv11+gradv22) / 3.0; - const double gradv_q00 = (gradv00-q_); - const double gradv_q11 = (gradv11-q_); - const double gradv_q22 = (gradv22-q_); - - const double p1 = ((gradv10*gradv10) + - (gradv20*gradv20) + - (gradv21*gradv21)); - const double p2 = ((gradv_q00*gradv_q00) + - (gradv_q11*gradv_q11) + - (gradv_q22*gradv_q22) + - (2.0*p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv*(gradv-q*I)) - const double r = (0.5*pinv*pinv*pinv * - ((gradv_q00*gradv_q11*gradv_q22) + - (2.0*gradv10*gradv21*gradv20) - - (gradv_q11*gradv20*gradv20) - - (gradv_q22*gradv10*gradv10) - - (gradv_q00*gradv21*gradv21))); - - double phi = 0; - if (r <= -1.0) - { - phi = M_PI / 3.0; - } - else if (r < 1.0) - { - phi = acos(r) / 3.0; - } - - minEig = q_+(2.0*p*cos(phi+(2.0*M_PI / 3.0))); - const double eig3 = q_+(2.0*p*cos(phi)); - const double eig2 = 3.0*q_-minEig-eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) - { - const double x = q_gradv[i+3*0]-(i == 0)*eig3; - const double y = q_gradv[i+3*1]-(i == 1)*eig3; - const double z = q_gradv[i+3*2]-(i == 2)*eig3; - const double cx = ((x*(gradv00-eig2)) + - (y*gradv10) + - (z*gradv20)); - const double cy = ((x*gradv10) + - (y*(gradv11-eig2)) + - (z*gradv21)); - const double cz = ((x*gradv20) + - (y*gradv21) + - (z*(gradv22-eig2))); - const double cNorm = (cx*cx+cy*cy+cz*cz); - //#warning 1e-16 to 1 - if ((cNorm > 1.e-16) && (maxNorm < cNorm)) - { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = 
cNorm; - } - } - //#warning 1e-16 to 1 - if (maxNorm > 1.e-16) - { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_20 = J[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_21 = J[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_02 = J[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_12 = J[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_22 = J[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_20 = invJ0[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_21 = invJ0[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_02 = invJ0[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_12 = invJ0[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_22 = invJ0[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)+(J_20*invJ0_02)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)+(J_20*invJ0_12)); - const double Jpi_20 = ((J_00*invJ0_20)+(J_10*invJ0_21)+(J_20*invJ0_22)); - - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)+(J_21*invJ0_02)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)+(J_21*invJ0_12)); - const double Jpi_21 = ((J_01*invJ0_20)+(J_11*invJ0_21)+(J_21*invJ0_22)); - - const double Jpi_02 = ((J_02*invJ0_00)+(J_12*invJ0_01)+(J_22*invJ0_02)); - const double Jpi_12 = 
((J_02*invJ0_10)+(J_12*invJ0_11)+(J_22*invJ0_12)); - const double Jpi_22 = ((J_02*invJ0_20)+(J_12*invJ0_21)+(J_22*invJ0_22)); - - const double physDirX = ((Jpi_00*comprDirX)+(Jpi_10*comprDirY)+ - (Jpi_20*comprDirZ)); - const double physDirY = ((Jpi_01*comprDirX)+(Jpi_11*comprDirY)+ - (Jpi_21*comprDirZ)); - const double physDirZ = ((Jpi_02*comprDirX)+(Jpi_12*comprDirY)+ - (Jpi_22*comprDirZ)); - - const double q_h = H0*sqrt((physDirX*physDirX)+ - (physDirY*physDirY)+ - (physDirZ*physDirZ)); - - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - q_stress[ijN(x,y,3)] += coeff*q_gradv[ijN(x,y,3)]; - } - } - } - - const double S00 = q_stress[ijN(0,0,3)]; - const double S10 = q_stress[ijN(1,0,3)]; - const double S20 = q_stress[ijN(2,0,3)]; - const double S01 = q_stress[ijN(0,1,3)]; - const double S11 = q_stress[ijN(1,1,3)]; - const double S21 = q_stress[ijN(2,1,3)]; - const double S02 = q_stress[ijN(0,2,3)]; - const double S12 = q_stress[ijN(1,2,3)]; - const double S22 = q_stress[ijN(2,2,3)]; - - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)+(S20*invJ_02)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)+(S20*invJ_12)); - stressJinvT[ijklNM(2,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_20)+(S10*invJ_21)+(S20*invJ_22)); - - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)+(S21*invJ_02)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)+(S21*invJ_12)); - stressJinvT[ijklNM(2,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_20)+(S11*invJ_21)+(S21*invJ_22)); - - 
stressJinvT[ijklNM(0,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_00)+(S12*invJ_01)+(S22*invJ_02)); - stressJinvT[ijklNM(1,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_10)+(S12*invJ_11)+(S22*invJ_12)); - stressJinvT[ijklNM(2,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_20)+(S12*invJ_21)+(S22*invJ_22)); - } - } -} - -// ***************************************************************************** -typedef void (*fUpdateQuadratureData)(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// ***************************************************************************** -void rUpdateQuadratureData(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int nzones, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int blck = HIP_BLOCK_SIZE; - const int grid = (nzones+blck-1)/blck; - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(IROOT(NUM_DIM,NUM_QUAD)==NUM_QUAD_1D); - const unsigned int id = (NUM_DIM<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - 
{0x20,&rUpdateQuadratureData2D<2,2*2,2,2>}, - {0x21,&rUpdateQuadratureData2D<2,4*4,4,3>}, - {0x22,&rUpdateQuadratureData2D<2,6*6,6,4>}, - {0x23,&rUpdateQuadratureData2D<2,8*8,8,5>}, - {0x24,&rUpdateQuadratureData2D<2,10*10,10,6>}, - {0x25,&rUpdateQuadratureData2D<2,12*12,12,7>}, - {0x26,&rUpdateQuadratureData2D<2,14*14,14,8>}, - {0x27,&rUpdateQuadratureData2D<2,16*16,16,9>}, - {0x28,&rUpdateQuadratureData2D<2,18*18,18,10>}, - {0x29,&rUpdateQuadratureData2D<2,20*20,20,11>}, - {0x2A,&rUpdateQuadratureData2D<2,22*22,22,12>}, - {0x2B,&rUpdateQuadratureData2D<2,24*24,24,13>}, - {0x2C,&rUpdateQuadratureData2D<2,26*26,26,14>}, - {0x2D,&rUpdateQuadratureData2D<2,28*28,28,15>}, - {0x2E,&rUpdateQuadratureData2D<2,30*30,30,16>}, - {0x2F,&rUpdateQuadratureData2D<2,32*32,32,17>}, - // 3D - {0x30,&rUpdateQuadratureData3D<3,2*2*2,2,2>}, - {0x31,&rUpdateQuadratureData3D<3,4*4*4,4,3>}, - {0x32,&rUpdateQuadratureData3D<3,6*6*6,6,4>}, - {0x33,&rUpdateQuadratureData3D<3,8*8*8,8,5>}, - {0x34,&rUpdateQuadratureData3D<3,10*10*10,10,6>}, - {0x35,&rUpdateQuadratureData3D<3,12*12*12,12,7>}, - {0x36,&rUpdateQuadratureData3D<3,14*14*14,14,8>}, - {0x37,&rUpdateQuadratureData3D<3,16*16*16,16,9>}, - {0x38,&rUpdateQuadratureData3D<3,18*18*18,18,10>}, - {0x39,&rUpdateQuadratureData3D<3,20*20*20,20,11>}, - {0x3A,&rUpdateQuadratureData3D<3,22*22*22,22,12>}, - {0x3B,&rUpdateQuadratureData3D<3,24*24*24,24,13>}, - {0x3C,&rUpdateQuadratureData3D<3,26*26*26,26,14>}, - {0x3D,&rUpdateQuadratureData3D<3,28*28*28,28,15>}, - {0x3E,&rUpdateQuadratureData3D<3,30*30*30,30,16>}, - {0x3F,&rUpdateQuadratureData3D<3,32*32*32,32,17>}, - }; - if (!call[id]) - { - printf("\n[rUpdateQuadratureData] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - GAMMA,H0,CFL,USE_VISCOSITY, - nzones,dofToQuad,dofToQuadD,quadWeights, - v,e,rho0DetJ0w,invJ0,J,invJ,detJ, - stressJinvT,dtEst); -} diff --git a/hip/hip/kernels/share/forceS.cpp b/hip/hip/kernels/share/forceS.cpp deleted file mode 
100644 index e7769d9d..00000000 --- a/hip/hip/kernels/share/forceS.cpp +++ /dev/null @@ -1,839 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rForceMult2S(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int MAX_DOFS_1D = (L2_DOFS_1D > H1_DOFS_1D)?L2_DOFS_1D:H1_DOFS_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xy[MAX_DOFS_1D * NUM_QUAD_1D]; - share double s_xDy[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_e[NUM_QUAD_2D]; - - const int idBlock = threadIdx.x; - { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_L2DofToQuad[id] = L2DofToQuad[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_H1QuadToDof[id] = H1QuadToDof[id]; - s_H1QuadToDofD[id] = H1QuadToDofD[id]; - } - } - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int dx = threadIdx.x; - { - if (dx < L2_DOFS_1D) - { - double r_x[L2_DOFS_1D]; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - r_x[dy] = e[ijkN(dx,dy,el,L2_DOFS_1D)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - xy += r_x[dy]*s_L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx,qy,MAX_DOFS_1D)] = xy; - } - } - } - sync; - const int qy = threadIdx.x; - { 
- if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double r_e = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - r_e += s_xy[ijN(dx,qy,MAX_DOFS_1D)]*s_L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - s_e[ijN(qx,qy,NUM_QUAD_1D)] = r_e; - } - } - } - - for (int c = 0; c < NUM_DIM; ++c) - { - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - double r_x[NUM_QUAD_1D]; - double r_y[NUM_QUAD_1D]; - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double r_e = s_e[(qx) + (NUM_QUAD_1D) * (qy)]; - r_x[qy] = r_e * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - r_y[qy] = r_e * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double xy = 0; - double xDy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - xy += r_x[qy] * s_H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - xDy += r_y[qy] * s_H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - } - s_xy[ijN(dy,qx,MAX_DOFS_1D)] = xy; - s_xDy[ijN(dy,qx,H1_DOFS_1D)] = xDy; - } - } - } - sync; - const int dx = threadIdx.x; - { - if (dx < H1_DOFS_1D) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double r_v = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_v += ((s_xy[ijN(dy,qx,MAX_DOFS_1D)] * s_H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]) + - (s_xDy[ijN(dy,qx,H1_DOFS_1D)] * s_H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)])); - } - v[ijklNM(dx,dy,el,c,NUM_DOFS_1D,numElements)] = r_v; - } - } - } - } - } - } - } -} - - -// ***************************************************************************** -template kernel -void rForceMultTranspose2S(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD = NUM_QUAD_2D; - const int MAX_DOFS_1D = (L2_DOFS_1D > H1_DOFS_1D)?L2_DOFS_1D:H1_DOFS_1D; - const int H1_MAX_1D = 
(H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2QuadToDof[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xy[MAX_DOFS_1D * NUM_QUAD_1D]; - share double s_xDy[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_v[NUM_QUAD_1D * NUM_QUAD_1D]; - - const int idBlock = 0 + threadIdx.x; - { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_L2QuadToDof[id] = L2QuadToDof[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) - { - s_H1DofToQuad[id] = H1DofToQuad[id]; - s_H1DofToQuadD[id] = H1DofToQuadD[id]; - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int qBlock = threadIdx.x; - { - for (int q = qBlock; q < NUM_QUAD; ++q) - { - s_v[q] = 0; - } - } - for (int c = 0; c < NUM_DIM; ++c) - { - sync; - const int dx = threadIdx.x; - { - if (dx < H1_DOFS_1D) - { - double r_v[H1_DOFS_1D]; - - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - r_v[dy] = v[ijklNM(dx,dy,el,c,H1_DOFS_1D,numElements)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - double xDy = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - xy += r_v[dy] * s_H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - xDy += r_v[dy] * s_H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - } - s_xy[ijN(qy,dx,NUM_QUAD_1D)] = xy; - s_xDy[ijN(qy,dx,NUM_QUAD_1D)] = xDy; - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dxy = 0; - double xDy = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy += (s_xy[ijN(qy,dx,NUM_QUAD_1D)] * 
s_H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]); - xDy += (s_xDy[ijN(qy,dx,NUM_QUAD_1D)] * s_H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]); - } - s_v[ijN(qx,qy,NUM_QUAD_1D)] += ((Dxy * stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM, - NUM_QUAD_1D)]) + - (xDy * stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)])); - } - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - double r_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - r_x[qy] = s_v[ijN(qx,qy,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double xy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - xy += r_x[qy] * s_L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - } - s_xy[ijN(qx,dy,NUM_QUAD_1D)] = xy; - } - } - } - sync; - const int dy = threadIdx.x; - { - if (dy < L2_DOFS_1D) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - double r_e = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_e += s_xy[ijN(qx,dy,NUM_QUAD_1D)] * s_L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - e[ijkN(dx,dy,el,L2_DOFS_1D)] = r_e; - } - } - } - } - } - } -} - - -// ***************************************************************************** -typedef void (*fForceMult2S)(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - -// ***************************************************************************** -template kernel -void rForceMult3S(const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > 
L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int INNER_SIZE_2D = (INNER_SIZE * INNER_SIZE); - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D]; - share double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_Dxyz[INNER_SIZE_2D]; - share double s_xDyz[NUM_QUAD_2D]; - share double s_xyDz[NUM_QUAD_2D]; - - double r_z[NUM_QUAD_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_L2DofToQuad[i] = L2DofToQuad[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_H1QuadToDof[i] = H1QuadToDof[i]; - s_H1QuadToDofD[i] = H1QuadToDofD[i]; - } - } - } - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - // Calculate D -> Q in the Z axis - const double r_e0 = e[ijklN(dx,dy,0,el,L2_DOFS_1D)]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] = r_e0 * s_L2DofToQuad[ijN(qz, 0,NUM_QUAD_1D)]; - } - - for (int dz = 1; dz < L2_DOFS_1D; ++dz) - { - const double r_e = e[ijklN(dx,dy,dz,el,L2_DOFS_1D)]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] += r_e * s_L2DofToQuad[ijN(qz, dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - sync; - // Fill xy plane at given z position - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - s_Dxyz[ijN(dx, dy,INNER_SIZE)] = r_z[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < 
NUM_QUAD_1D)) - { - double q_e = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double q_ex = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - q_ex += s_Dxyz[ijN(dx, dy,INNER_SIZE)] * s_L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - q_e += q_ex * s_L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - } - r_z[qz] = q_e; - } - } - } - } - for (int c = 0; c < NUM_DIM; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - // Fill xy plane at given z position - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double r_Dxyz = 0; - double r_xDyz = 0; - double r_xyDz = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double r_e = r_z[qz]; - const double wz = s_H1QuadToDof[ijN(dz, qz,H1_DOFS_1D)]; - const double wDz = s_H1QuadToDofD[ijN(dz, qz,H1_DOFS_1D)]; - r_Dxyz += r_e * wz * stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - r_xDyz += r_e * wz * stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - r_xyDz += r_e * wDz * stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM, - NUM_QUAD_1D)]; - } - s_Dxyz[ijN(qx,qy,INNER_SIZE)] = r_Dxyz; - s_xDyz[ijN(qx,qy,NUM_QUAD_1D)] = r_xDyz; - s_xyDz[ijN(qx,qy,NUM_QUAD_1D)] = r_xyDz; - } - } - } - // Finalize solution in xy plane - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - double r_v = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = s_H1QuadToDof[ijN(dy, qy,H1_DOFS_1D)]; - const double wDy = s_H1QuadToDofD[ijN(dy, qy,H1_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = s_H1QuadToDof[ijN(dx, qx,H1_DOFS_1D)]; - const double wDx = s_H1QuadToDofD[ijN(dx, qx,H1_DOFS_1D)]; - r_v += ((wDx * wy * s_Dxyz[ijN(qx, qy,INNER_SIZE)]) + - (wx * wDy * s_xDyz[ijN(qx, qy,NUM_QUAD_1D)]) + - (wx * wy * s_xyDz[ijN(qx, qy,NUM_QUAD_1D)])); - } - } - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] = r_v; - } - } - } - } - } - } 
- } - } -} - -// ***************************************************************************** -template kernel -void rForceMultTranspose3S(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int idx = blockIdx.x; - const int elBlock = idx * ELEMENT_BATCH; - if (elBlock < numElements) - { - share double s_L2QuadToDof[L2_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D]; - share double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D]; - - share double s_xyz[NUM_QUAD_2D * NUM_DIM]; - share double s_xyDz[NUM_QUAD_2D * NUM_DIM]; - share double s_v[NUM_QUAD_2D]; - - double r_xyz[NUM_QUAD_1D*NUM_DIM]; - double r_xyDz[NUM_QUAD_1D*NUM_DIM]; - const int y = threadIdx.y; - { - const int x = threadIdx.x; - { - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_L2QuadToDof[i] = L2QuadToDof[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) - { - s_H1DofToQuad[i] = H1DofToQuad[i]; - s_H1DofToQuadD[i] = H1DofToQuadD[i]; - } - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) - { - if (el < numElements) - { - sync; - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - double r_v[NUM_DIM][H1_DOFS_1D]; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - for (int c = 0; c < NUM_DIM; ++c) - { - r_v[c][dz] = v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int 
c = 0; c < NUM_DIM; ++c) - { - double xyz = 0; - double xyDz = 0; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - xyz += r_v[c][dz] * s_H1DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - xyDz += r_v[c][dz] * s_H1DofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - } - r_xyz[ijN(c,qz,NUM_DIM)] = xyz; - r_xyDz[ijN(c,qz,NUM_DIM)] = xyDz; - } - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - sync; - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) - { - for (int c = 0; c < NUM_DIM; ++c) - { - s_xyz[ijkNM(c,dx,dy,NUM_DIM,NUM_QUAD_1D)] = r_xyz[ijN(c,qz,NUM_DIM)]; - s_xyDz[ijkNM(c,dx,dy,NUM_DIM,NUM_QUAD_1D)] = r_xyDz[ijN(c,qz,NUM_DIM)]; - } - } - } - } - // Finalize solution in xy plane - sync; - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - { - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double r_qv = 0; - for (int c = 0; c < NUM_DIM; ++c) - { - double Dxyz = 0; - double xDyz = 0; - double xyDz = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = s_H1DofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - const double wDy = s_H1DofToQuadD[ijN(qy, dy,NUM_QUAD_1D)]; - double Dxz = 0; - double xz = 0; - double xDz = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double wx = s_H1DofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - const double wDx = s_H1DofToQuadD[ijN(qx, dx,NUM_QUAD_1D)]; - Dxz += wDx * s_xyz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - xz += wx * s_xyz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - xDz += wx * s_xyDz[ijkNM(c, dx, dy,NUM_DIM,NUM_QUAD_1D)]; - } - Dxyz += wy * Dxz; - xDyz += wDy * xz; - xyDz += wy * xDz; - } - r_qv += ((Dxyz * stressJinvT[ijklmnNM(0, c, qx, qy, qz, el,NUM_DIM, - NUM_QUAD_1D)]) + - (xDyz * stressJinvT[ijklmnNM(1, c, qx, qy, qz, el,NUM_DIM,NUM_QUAD_1D)]) + - (xyDz * stressJinvT[ijklmnNM(2, c, qx, qy, qz, el,NUM_DIM,NUM_QUAD_1D)])); - } - s_v[ijN(qx, qy,NUM_QUAD_1D)] = r_qv; - } - } - } - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if 
((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - double r_e = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double r_ex = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_ex += s_v[ijN(qx, qy,NUM_QUAD_1D)] * s_L2QuadToDof[ijN(dx, qx,L2_DOFS_1D)]; - } - r_e += r_ex * s_L2QuadToDof[ijN(dy, qy,L2_DOFS_1D)]; - } - r_xyz[qz] = r_e; - } - } - } - } - sync; - { - const int dy = threadIdx.y; - const int dx = threadIdx.x; - { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) - { - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - double r_e = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_e += r_xyz[qz] * s_L2QuadToDof[ijN(dz,qz,L2_DOFS_1D)]; - } - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] = r_e; - } - } - } - } - } - } - } -} - - -// ***************************************************************************** -void rForceMultS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - if (NUM_DIM==1) { assert(false); } - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int grid = ((numElements+ELEMENT_BATCH-1)/ELEMENT_BATCH); - const dim3 blck(INNER_SIZE,INNER_SIZE,1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - const unsigned int id =(NUM_DIM<<4)|(NUM_DOFS_1D-2); - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - static std::unordered_map call = - { - {0x20,&rForceMult2S<2,2,2,1,2>}, - {0x21,&rForceMult2S<2,3,4,2,3>}, - {0x22,&rForceMult2S<2,4,6,3,4>}, - {0x23,&rForceMult2S<2,5,8,4,5>}, - {0x24,&rForceMult2S<2,6,10,5,6>}, - 
{0x25,&rForceMult2S<2,7,12,6,7>}, - {0x26,&rForceMult2S<2,8,14,7,8>}, - {0x27,&rForceMult2S<2,9,16,8,9>}, - {0x28,&rForceMult2S<2,10,18,9,10>}, - {0x29,&rForceMult2S<2,11,20,10,11>}, - {0x2A,&rForceMult2S<2,12,22,11,12>}, - {0x2B,&rForceMult2S<2,13,24,12,13>}, - {0x2C,&rForceMult2S<2,14,26,13,14>}, - {0x2D,&rForceMult2S<2,15,28,14,15>}, - {0x2E,&rForceMult2S<2,16,30,15,16>}, - {0x2F,&rForceMult2S<2,17,32,16,17>}, - // 3D - {0x30,&rForceMult3S<3,2,2,1,2>}, - {0x31,&rForceMult3S<3,3,4,2,3>}, - {0x32,&rForceMult3S<3,4,6,3,4>}, - {0x33,&rForceMult3S<3,5,8,4,5>}, - {0x34,&rForceMult3S<3,6,10,5,6>}, - {0x35,&rForceMult3S<3,7,12,6,7>}, - {0x36,&rForceMult3S<3,8,14,7,8>}, - {0x37,&rForceMult3S<3,9,16,8,9>}, - {0x38,&rForceMult3S<3,10,18,9,10>}, - {0x39,&rForceMult3S<3,11,20,10,11>}, - {0x3A,&rForceMult3S<3,12,22,11,12>}, - {0x3B,&rForceMult3S<3,13,24,12,13>}, - {0x3C,&rForceMult3S<3,14,26,13,14>}, - {0x3D,&rForceMult3S<3,15,28,14,15>}, - {0x3E,&rForceMult3S<3,16,30,15,16>}, - {0x3F,&rForceMult3S<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMult] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,e,v); -} - - -// ***************************************************************************** -typedef void (*fForceMultTransposeS)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rForceMultTransposeS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - 
const double* restrict v, - double* restrict e) -{ - const int H1_MAX_1D = (H1_DOFS_1D > NUM_QUAD_1D)?H1_DOFS_1D:NUM_QUAD_1D; - const int L2_MAX_1D = (L2_DOFS_1D > NUM_QUAD_1D)?L2_DOFS_1D:NUM_QUAD_1D; - const int INNER_SIZE = (H1_MAX_1D > L2_MAX_1D)?H1_MAX_1D:L2_MAX_1D; - const int grid = ((numElements+ELEMENT_BATCH-1)/ELEMENT_BATCH); - const dim3 blck(INNER_SIZE,INNER_SIZE,1); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rForceMultTranspose2S<2,2,2,1,2>}, - {0x21,&rForceMultTranspose2S<2,3,4,2,3>}, - {0x22,&rForceMultTranspose2S<2,4,6,3,4>}, - {0x23,&rForceMultTranspose2S<2,5,8,4,5>}, - {0x24,&rForceMultTranspose2S<2,6,10,5,6>}, - {0x25,&rForceMultTranspose2S<2,7,12,6,7>}, - {0x26,&rForceMultTranspose2S<2,8,14,7,8>}, - {0x27,&rForceMultTranspose2S<2,9,16,8,9>}, - {0x28,&rForceMultTranspose2S<2,10,18,9,10>}, - {0x29,&rForceMultTranspose2S<2,11,20,10,11>}, - {0x2A,&rForceMultTranspose2S<2,12,22,11,12>}, - {0x2B,&rForceMultTranspose2S<2,13,24,12,13>}, - {0x2C,&rForceMultTranspose2S<2,14,26,13,14>}, - {0x2D,&rForceMultTranspose2S<2,15,28,14,15>}, - {0x2E,&rForceMultTranspose2S<2,16,30,15,16>}, - {0x2F,&rForceMultTranspose2S<2,17,32,16,17>}, - // 3D - {0x30,&rForceMultTranspose3S<3,2,2,1,2>}, - {0x31,&rForceMultTranspose3S<3,3,4,2,3>}, - {0x32,&rForceMultTranspose3S<3,4,6,3,4>}, - {0x33,&rForceMultTranspose3S<3,5,8,4,5>}, - {0x34,&rForceMultTranspose3S<3,6,10,5,6>}, - {0x35,&rForceMultTranspose3S<3,7,12,6,7>}, - {0x36,&rForceMultTranspose3S<3,8,14,7,8>}, - {0x37,&rForceMultTranspose3S<3,9,16,8,9>}, - {0x38,&rForceMultTranspose3S<3,10,18,9,10>}, - {0x39,&rForceMultTranspose3S<3,11,20,10,11>}, - {0x3A,&rForceMultTranspose3S<3,12,22,11,12>}, - {0x3B,&rForceMultTranspose3S<3,13,24,12,13>}, - {0x3C,&rForceMultTranspose3S<3,14,26,13,14>}, - 
//{0x3D,&rForceMultTranspose3S<3,15,28,14,15>}, // uses too much shared data - //{0x3E,&rForceMultTranspose3S<3,16,30,15,16>}, - //{0x3F,&rForceMultTranspose3S<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMultTranspose] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - numElements,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,v,e); -} diff --git a/hip/hip/kernels/share/gridFuncToQuadS.cpp b/hip/hip/kernels/share/gridFuncToQuadS.cpp deleted file mode 100644 index fcb3aa18..00000000 --- a/hip/hip/kernels/share/gridFuncToQuadS.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rGridFuncToQuad2S(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double * restrict gf, - double* restrict out) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double s_dofToQuad[NUM_QUAD_DOFS_1D];//@dim(NUM_QUAD_1D, NUM_DOFS_1D); - - // Store xy planes in shared memory - share double s_xy[NUM_QUAD_DOFS_1D];//@dim(NUM_DOFS_1D, NUM_QUAD_1D); - - for (int x = 0; x < NUM_MAX_1D; ++x) - { - for (int id = x; id < NUM_QUAD_DOFS_1D; id += NUM_MAX_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - } - } - - for (int e = eOff; e < (eOff + M2_ELEMENT_BATCH); ++e) - { - if (e < numElements) - { - sync; - { - const int dx = threadIdx.x; - if (dx < NUM_DOFS_1D) - { - double r_x[NUM_DOFS_1D]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - r_x[dy] = gf[l2gMap[ijkN(dx, dy, e,NUM_DOFS_1D)]]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - xy += r_x[dy] * s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = xy; - } - } - } - sync; - { - const int qy = threadIdx.x; - if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double val = 0; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - val += s_xy[ijN(dx, qy,NUM_DOFS_1D)] * s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - } - out[ijkN(qx, qy, e,NUM_QUAD_1D)] = val; - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rGridFuncToQuad3S(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double 
s_dofToQuad[NUM_QUAD_DOFS_1D]; - // Store xy planes in @shared memory - share double s_z[NUM_MAX_2D]; - // Store z axis as registers - double r_qz[NUM_QUAD_1D]; - sync; - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_MAX_1D) + x; - // Fetch Q <--> D maps - if (id < NUM_QUAD_DOFS_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - } - // Initialize our Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_qz[qz] = 0; - } - } - } - - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double val = gf[l2gMap[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]]; - // Calculate D -> Q in the Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_qz[qz] += val * s_dofToQuad[ijN(qz, dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - // Fill xy plane at given z position - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - s_z[ijN(dx, dy,NUM_DOFS_1D)] = r_qz[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double val = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - val += wx * wy * s_z[ijN(dx,dy,NUM_DOFS_1D)]; - } - } - out[ijklN(qx, qy, qz, e,NUM_QUAD_1D)] = val; - } - } - } - } - } -} - - -// ***************************************************************************** -typedef void (*fGridFuncToQuad)(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* gf, - double* restrict out); -// 
***************************************************************************** -void rGridFuncToQuadS(const int DIM, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const int* l2gMap, - const double* gf, - double* __restrict out) -{ - if (DIM==1) { assert(false); } - const int MX_ELEMENT_BATCH = DIM==2?M2_ELEMENT_BATCH:1; - const int grid = ((numElements+MX_ELEMENT_BATCH-1)/MX_ELEMENT_BATCH); - const int b1d = (NUM_QUAD_1D call = - { - // 2D - {0x210,&rGridFuncToQuad2S<1,1,2>}, - {0x211,&rGridFuncToQuad2S<1,2,4>}, - {0x212,&rGridFuncToQuad2S<1,3,6>}, - {0x213,&rGridFuncToQuad2S<1,4,8>}, - {0x214,&rGridFuncToQuad2S<1,5,10>}, - {0x215,&rGridFuncToQuad2S<1,6,12>}, - {0x216,&rGridFuncToQuad2S<1,7,14>}, - {0x217,&rGridFuncToQuad2S<1,8,16>}, - {0x218,&rGridFuncToQuad2S<1,9,18>}, - {0x219,&rGridFuncToQuad2S<1,10,20>}, - {0x21A,&rGridFuncToQuad2S<1,11,22>}, - {0x21B,&rGridFuncToQuad2S<1,12,24>}, - {0x21C,&rGridFuncToQuad2S<1,13,26>}, - {0x21D,&rGridFuncToQuad2S<1,14,28>}, - {0x21E,&rGridFuncToQuad2S<1,15,30>}, - {0x21F,&rGridFuncToQuad2S<1,16,32>}, - // 3D - {0x310,&rGridFuncToQuad3S<1,1,2>}, - {0x311,&rGridFuncToQuad3S<1,2,4>}, - {0x312,&rGridFuncToQuad3S<1,3,6>}, - {0x313,&rGridFuncToQuad3S<1,4,8>}, - {0x314,&rGridFuncToQuad3S<1,5,10>}, - {0x315,&rGridFuncToQuad3S<1,6,12>}, - {0x316,&rGridFuncToQuad3S<1,7,14>}, - {0x317,&rGridFuncToQuad3S<1,8,16>}, - {0x318,&rGridFuncToQuad3S<1,9,18>}, - {0x319,&rGridFuncToQuad3S<1,10,20>}, - {0x31A,&rGridFuncToQuad3S<1,11,22>}, - {0x31B,&rGridFuncToQuad3S<1,12,24>}, - {0x31C,&rGridFuncToQuad3S<1,13,26>}, - {0x31D,&rGridFuncToQuad3S<1,14,28>}, - {0x31E,&rGridFuncToQuad3S<1,15,30>}, - {0x31F,&rGridFuncToQuad3S<1,16,32>}, - }; - if (!call[id]) - { - printf("\n[rGridFuncToQuad] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, numElements,dofToQuad,l2gMap,gf,out); -} diff --git a/hip/hip/kernels/share/massAssembleS.cpp 
b/hip/hip/kernels/share/massAssembleS.cpp deleted file mode 100644 index bfa0957e..00000000 --- a/hip/hip/kernels/share/massAssembleS.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble2S0(const int numElements, - const int NUM_QUAD, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - const int idx = blockIdx.x; - const int eOff = idx; - if (eOff < numElements) - { - { - const int e = threadIdx.x; - { - const int qOff = threadIdx.y; - for (int q = qOff; q < NUM_QUAD; q += 1) - { - const double J11 = J[ijklNM(0, 0, q, e,2,NUM_QUAD)]; - const double J12 = J[ijklNM(1, 0, q, e,2,NUM_QUAD)]; - const double J21 = J[ijklNM(0, 1, q, e,2,NUM_QUAD)]; - const double J22 = J[ijklNM(1, 1, q, e,2,NUM_QUAD)]; - - oper[ijN(q,e,NUM_QUAD)] = quadWeights[q] * COEFF * ((J11 * J22) - (J21 * J12)); - } - } - } - } -} - -// ***************************************************************************** -extern "C" kernel -void rMassAssemble3S0(const int numElements, - const int NUM_QUAD, - const double COEFF, - const double* restrict quadWeights, - const double* restrict J, - double* __restrict oper) -{ - const int idx = blockIdx.x; - const int eOff = idx; - if (eOff < numElements) - { - const int e = threadIdx.x; - { - if (e < numElements) - { - const int qOff = threadIdx.y; - { - for (int q = qOff; q < NUM_QUAD; q += 1) - { - const double J11 = J[ijklNM(0, 0, q, e,3,NUM_QUAD)]; - const double J12 = J[ijklNM(1, 0, q, e,3,NUM_QUAD)]; - const double J13 = J[ijklNM(2, 0, q, e,3,NUM_QUAD)]; - const double J21 = J[ijklNM(0, 1, q, e,3,NUM_QUAD)]; - const double J22 = J[ijklNM(1, 1, q, e,3,NUM_QUAD)]; - const double J23 = J[ijklNM(2, 1, q, e,3,NUM_QUAD)]; - const double J31 = J[ijklNM(0, 2, q, e,3,NUM_QUAD)]; - const double J32 = J[ijklNM(1, 2, q, e,3,NUM_QUAD)]; - const double J33 = J[ijklNM(2, 2, q, e,3,NUM_QUAD)]; - - const double detJ = ((J11 * J22 * J33) + (J12 * J23 * J31) + (J13 * J21 * J32) - - (J13 * J22 * J31) - (J12 * J21 * J33) - (J11 * J23 * J32)); - - oper[ijN(q, 
e,NUM_QUAD)] = quadWeights[q] * COEFF * detJ; - } - } - } - } - } -} - -// ***************************************************************************** -static void rMassAssemble2S(const int NUM_QUAD_2D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - dim3 threads(1, 1, 1); - dim3 blocks(numElements, 1, 1); - hipKerGBS(rMassAssemble2S,blocks,threads,numElements,NUM_QUAD_2D,COEFF, - quadWeights,J,oper); -} - -// ***************************************************************************** -static void rMassAssemble3S(const int NUM_QUAD_3D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - dim3 threads(1, 1, 1); - dim3 blocks(numElements, 1, 1); - hipKerGBS(rMassAssemble3S,blocks,threads,numElements,NUM_QUAD_3D,COEFF, - quadWeights,J,oper); -} - -// ***************************************************************************** -void rMassAssembleS(const int dim, - const int NUM_QUAD, - const int numElements, - const double* quadWeights, - const double* J, - const double COEFF, - double* __restrict oper) -{ - assert(false); - if (dim==1) {assert(false);} - if (dim==2) { rMassAssemble2S(NUM_QUAD,numElements,COEFF,quadWeights,J,oper); } - if (dim==3) { rMassAssemble3S(NUM_QUAD,numElements,COEFF,quadWeights,J,oper); } -} diff --git a/hip/hip/kernels/share/massMultAddS.cpp b/hip/hip/kernels/share/massMultAddS.cpp deleted file mode 100644 index 416b0133..00000000 --- a/hip/hip/kernels/share/massMultAddS.cpp +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rMassMultAdd2S(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_quadToDof[NUM_QUAD_DOFS_1D]; - - // Store xy planes in shared memory - share double s_xy[NUM_QUAD_DOFS_1D]; - share double s_xy2[NUM_QUAD_2D]; - - double r_x[NUM_MAX_1D]; - - const int x = threadIdx.x; - { - for (int id = x; id < NUM_QUAD_DOFS_1D; id += NUM_MAX_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_quadToDof[id] = quadToDof[id]; - } - } - - for (int e = eOff; e < (eOff + M2_ELEMENT_BATCH); ++e) - { - if (e < numElements) - { - { - const int dx = threadIdx.x; - if (dx < NUM_DOFS_1D) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = 0; - } - 
for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - r_x[dy] = solIn[ijkN(dx, dy, e,NUM_DOFS_1D)]; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double xy = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - xy += r_x[dy] * s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - } - s_xy[ijN(dx, qy,NUM_DOFS_1D)] = xy; - } - } - } - sync; - const int qy = threadIdx.x; - { - if (qy < NUM_QUAD_1D) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - double s = 0; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - s += s_xy[ijN(dx, qy,NUM_DOFS_1D)] * s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - } - s_xy2[ijN(qx, qy,NUM_QUAD_1D)] = s * oper[ijkN(qx, qy, e,NUM_QUAD_1D)]; - } - } - } - sync; - const int qx = threadIdx.x; - { - if (qx < NUM_QUAD_1D) - { - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - s_xy[ijN(dy, qx,NUM_DOFS_1D)] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - r_x[qy] = s_xy2[ijN(qx, qy,NUM_QUAD_1D)]; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double s = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - s += r_x[qy] * s_quadToDof[ijN(dy, qy,NUM_DOFS_1D)]; - } - s_xy[ijN(dy, qx,NUM_DOFS_1D)] = s; - } - } - } - sync; - const int dx = threadIdx.x; - { - if (dx < NUM_DOFS_1D) - { - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double s = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - s += (s_xy[ijN(dy, qx,NUM_DOFS_1D)] * s_quadToDof[ijN(dx, qx,NUM_DOFS_1D)]); - } - solOut[ijkN(dx, dy, e,NUM_DOFS_1D)] += s; - } - } - } - } - } - } -} - -// ***************************************************************************** -template kernel -void rMassMultAdd3S(const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D quad mappings - share double 
s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_quadToDof[NUM_QUAD_DOFS_1D]; - // Store xy planes in @shared memory - share double s_xy[NUM_MAX_2D]; - // Store z axis as registers - double r_z[NUM_QUAD_1D]; - double r_z2[NUM_DOFS_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_MAX_1D) + x; - // Fetch Q <--> D maps - if (id < NUM_QUAD_DOFS_1D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_quadToDof[id] = quadToDof[id]; - } - // Initialize our Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] = 0; - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - r_z2[dz] = 0; - } - } - } - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double s = solIn[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]; - // Calculate D -> Q in the Z axis - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - r_z[qz] += s * s_dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - // Fill xy plane at given z position - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - s_xy[ijN(dx, dy,NUM_DOFS_1D)] = r_z[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - double s = 0; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = s_dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx, dx,NUM_QUAD_1D)]; - s += wx * wy * s_xy[ijN(dx, dy,NUM_DOFS_1D)]; - } - } - - s *= oper[ijklN(qx, qy, qz,e,NUM_QUAD_1D)]; - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double wz = s_quadToDof[ijN(dz, qz,NUM_DOFS_1D)]; - r_z2[dz] += wz * s; - } - } - } - } - } - // Iterate over xy planes 
to compute solution - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - // Place xy plane in @shared memory - sync; - { - const int qy = threadIdx.y; - { - const int qx = threadIdx.x; - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) - { - s_xy[ijN(qx, qy,NUM_QUAD_1D)] = r_z2[dz]; - } - } - } - // Finalize solution in xy plane - sync; - { - const int dy = threadIdx.y; - { - const int dx = threadIdx.x; - if ((dx < NUM_DOFS_1D) && (dy < NUM_DOFS_1D)) - { - double solZ = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = s_quadToDof[ijN(dy, qy,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = s_quadToDof[ijN(dx, qx,NUM_DOFS_1D)]; - solZ += wx * wy * s_xy[ijN(qx, qy,NUM_QUAD_1D)]; - } - } - solOut[ijklN(dx,dy,dz,e,NUM_DOFS_1D)] += solZ; - } - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fMassMultAdd)(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut); - -// ***************************************************************************** -void rMassMultAddS(const int DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* op, - const double* x, - double* __restrict y) -{ - if (DIM==1) { assert(false); } - const int b1d = (NUM_QUAD_1D>1)<=8); - const unsigned int id = (DIM<<16)|((NUM_DOFS_1D-1)<<8)|(NUM_QUAD_1D>>1); - static std::unordered_map call = - { - // 2D - {0x20001,&rMassMultAdd2S<1,2>}, {0x20101,&rMassMultAdd2S<2,2>}, - {0x20102,&rMassMultAdd2S<2,4>}, {0x20202,&rMassMultAdd2S<3,4>}, - {0x20203,&rMassMultAdd2S<3,6>}, {0x20303,&rMassMultAdd2S<4,6>}, - {0x20304,&rMassMultAdd2S<4,8>}, {0x20404,&rMassMultAdd2S<5,8>}, - {0x20405,&rMassMultAdd2S<5,10>}, 
{0x20505,&rMassMultAdd2S<6,10>}, - {0x20506,&rMassMultAdd2S<6,12>}, {0x20606,&rMassMultAdd2S<7,12>}, - {0x20607,&rMassMultAdd2S<7,14>}, {0x20707,&rMassMultAdd2S<8,14>}, - {0x20708,&rMassMultAdd2S<8,16>}, {0x20808,&rMassMultAdd2S<9,16>}, - {0x20809,&rMassMultAdd2S<9,18>}, {0x20909,&rMassMultAdd2S<10,18>}, - {0x2090A,&rMassMultAdd2S<10,20>}, {0x20A0A,&rMassMultAdd2S<11,20>}, - {0x20A0B,&rMassMultAdd2S<11,22>}, {0x20B0B,&rMassMultAdd2S<12,22>}, - {0x20B0C,&rMassMultAdd2S<12,24>}, {0x20C0C,&rMassMultAdd2S<13,24>}, - {0x20C0D,&rMassMultAdd2S<13,26>}, {0x20D0D,&rMassMultAdd2S<14,26>}, - {0x20D0E,&rMassMultAdd2S<14,28>}, {0x20E0E,&rMassMultAdd2S<15,28>}, - {0x20E0F,&rMassMultAdd2S<15,30>}, {0x20F0F,&rMassMultAdd2S<16,30>}, - {0x20F10,&rMassMultAdd2S<16,32>}, {0x21010,&rMassMultAdd2S<17,32>}, - // 3D - {0x30001,&rMassMultAdd3S<1,2>}, {0x30101,&rMassMultAdd3S<2,2>}, - {0x30102,&rMassMultAdd3S<2,4>}, {0x30202,&rMassMultAdd3S<3,4>}, - {0x30203,&rMassMultAdd3S<3,6>}, {0x30303,&rMassMultAdd3S<4,6>}, - {0x30304,&rMassMultAdd3S<4,8>}, {0x30404,&rMassMultAdd3S<5,8>}, - {0x30405,&rMassMultAdd3S<5,10>}, {0x30505,&rMassMultAdd3S<6,10>}, - {0x30506,&rMassMultAdd3S<6,12>}, {0x30606,&rMassMultAdd3S<7,12>}, - {0x30607,&rMassMultAdd3S<7,14>}, {0x30707,&rMassMultAdd3S<8,14>}, - {0x30708,&rMassMultAdd3S<8,16>}, {0x30808,&rMassMultAdd3S<9,16>}, - {0x30809,&rMassMultAdd3S<9,18>}, {0x30909,&rMassMultAdd3S<10,18>}, - {0x3090A,&rMassMultAdd3S<10,20>}, {0x30A0A,&rMassMultAdd3S<11,20>}, - {0x30A0B,&rMassMultAdd3S<11,22>}, {0x30B0B,&rMassMultAdd3S<12,22>}, - {0x30B0C,&rMassMultAdd3S<12,24>}, {0x30C0C,&rMassMultAdd3S<13,24>}, - {0x30C0D,&rMassMultAdd3S<13,26>}, {0x30D0D,&rMassMultAdd3S<14,26>}, - {0x30D0E,&rMassMultAdd3S<14,28>}, {0x30E0E,&rMassMultAdd3S<15,28>}, - {0x30E0F,&rMassMultAdd3S<15,30>}, {0x30F0F,&rMassMultAdd3S<16,30>}, - {0x30F10,&rMassMultAdd3S<16,32>}, {0x31010,&rMassMultAdd3S<17,32>}, - }; - if (!call[id]) - { - printf("\n[rMassMultAddS] id \033[33m0x%X\033[m ",id); - fflush(stdout); 
- } - assert(call[id]); - call0(id,grid,blck, - numElements,dofToQuad,dofToQuadD,quadToDof,quadToDofD,op,x,y); -} diff --git a/hip/hip/kernels/share/qDataUpdateS.cpp b/hip/hip/kernels/share/qDataUpdateS.cpp deleted file mode 100644 index 2cda2623..00000000 --- a/hip/hip/kernels/share/qDataUpdateS.cpp +++ /dev/null @@ -1,725 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../hip.hpp" - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData2S(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int NUM_MAX_1D = (NUM_QUAD_1D (stress, soundSpeed) - const double s = -(GAMMA - 1.0) * q_rho * q_e; - q_stress[ijN(0,0,2)] = s; q_stress[ijN(1,0,2)] = 0; - q_stress[ijN(0,1,2)] = 0; q_stress[ijN(1,1,2)] = s; - - const double gradv00 = q_gradv[ijN(0,0,2)]; - const double gradv11 = q_gradv[ijN(1,1,2)]; - const double gradv10 = 0.5 * (q_gradv[ijN(1,0,2)] + q_gradv[ijN(0,1,2)]); - q_gradv[ijN(1,0,2)] = gradv10; - q_gradv[ijN(0,1,2)] = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) - { - minEig = (gradv00 < gradv11) ? gradv00 : gradv11; - } - else - { - const double zeta = (gradv11 - gradv00) / (2.0 * gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta + sqrt(1.0 + zeta*zeta)); - if ((t < 0) != (zeta < 0)) - { - t = -t; - } - - const double c = sqrt(1.0 / (1.0 + t*t)); - const double s = c * t; - t *= gradv10; - - if ((gradv00 - t) <= (gradv11 + t)) - { - minEig = gradv00 - t; - comprDirX = c; - comprDirY = -s; - } - else - { - minEig = gradv11 + t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. 
- const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11)); - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11)); - - const double physDirX = (Jpi_00 * comprDirX) + (Jpi_10 * comprDirY); - const double physDirY = (Jpi_01 * comprDirX) + (Jpi_11 * comprDirY); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY)); - - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(GAMMA * (GAMMA - 1.0) * q_e); - dtEst[ijN(q, el,NUM_QUAD)] = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) - { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) - { - for (int x = 0; x < NUM_DIM; ++x) - { - q_stress[ijN(x,y,2)] += coeff * q_gradv[ijN(x,y,2)]; - } - } - } - const double S00 = q_stress[ijN(0,0,2)]; - const double S10 = q_stress[ijN(1,0,2)]; - const double S01 = q_stress[ijN(0,1,2)]; - const double S11 = q_stress[ijN(1,1,2)]; - - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11)); - - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = 
q_Jw * ((S01 * invJ_00) + (S11 * invJ_01)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11)); - } - } - } -} - -// ***************************************************************************** -template kernel -void rUpdateQuadratureData3S(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_DOFS_1D = (NUM_QUAD_1D * NUM_DOFS_1D); - const int el = blockIdx.x; - if (el < numElements) - { - share double s_dofToQuad[NUM_QUAD_DOFS_1D]; - share double s_dofToQuadD[NUM_QUAD_DOFS_1D]; - - { - const int y = threadIdx.y; - { - const int x = threadIdx.x; - const int id = (y * NUM_QUAD_1D) + x; - for (int i = id; i < (NUM_DOFS_1D * NUM_QUAD_1D); i += NUM_QUAD_2D) - { - s_dofToQuad[id] = dofToQuad[id]; - s_dofToQuadD[id] = dofToQuadD[id]; - } - } - } - sync; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - { - const int qy = threadIdx.y; - { - const int qx = 0 + threadIdx.x; - const int q = qx + qy*NUM_QUAD_1D + qz*NUM_QUAD_2D; - double gradv[9]; - double q_gradv[9]; - double q_stress[9]; - - // Brute-force convertion of dof -> quad for now - for (int i = 0; i < 9; ++i) - { - gradv[i] = 0; - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double xy[3]; - double Dxy[3]; - double xDy[3]; - for (int vi = 0; vi < 3; ++vi) - { - xy[vi] = Dxy[vi] = xDy[vi] = 0; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double x[3]; - double Dx[3]; - for (int vi = 0; vi < 3; ++vi) - { - x[vi] = Dx[vi] = 0; - } - for (int dx = 0; dx 
< NUM_DOFS_1D; ++dx) - { - const double wx = s_dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - const double wDx = s_dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - const double r_v = v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - x[vi] += wx * r_v; - Dx[vi] += wDx * r_v; - } - } - const double wy = s_dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = s_dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - xy[vi] += wy * x[vi]; - Dxy[vi] += wy * Dx[vi]; - xDy[vi] += wDy * x[vi]; - } - } - const double wz = s_dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = s_dofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int vi = 0; vi < 3; ++vi) - { - gradv[ijN(vi,0,3)] += wz * Dxy[vi]; - gradv[ijN(vi,1,3)] += wz * xDy[vi]; - gradv[ijN(vi,2,3)] += wDz * xy[vi]; - } - } - - const double invJ_00 = invJ[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_20 = invJ[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_21 = invJ[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_02 = invJ[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_12 = invJ[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ_22 = invJ[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,3)] = ((gradv[ijN(0,0,3)] * invJ_00) + (gradv[ijN(1,0, - 3)] * invJ_01) + (gradv[ijN(2,0,3)] * invJ_02)); - q_gradv[ijN(1,0,3)] = ((gradv[ijN(0,0,3)] * invJ_10) + (gradv[ijN(1,0, - 3)] * invJ_11) + (gradv[ijN(2,0,3)] * invJ_12)); - q_gradv[ijN(2,0,3)] = ((gradv[ijN(0,0,3)] * invJ_20) + (gradv[ijN(1,0, - 3)] * invJ_21) + (gradv[ijN(2,0,3)] * invJ_22)); - - q_gradv[ijN(0,1,3)] = ((gradv[ijN(0,1,3)] * invJ_00) + (gradv[ijN(1,1, - 3)] * invJ_01) + (gradv[ijN(2,1,3)] * invJ_02)); - q_gradv[ijN(1,1,3)] = ((gradv[ijN(0,1,3)] * invJ_10) + 
(gradv[ijN(1,1, - 3)] * invJ_11) + (gradv[ijN(2,1,3)] * invJ_12)); - q_gradv[ijN(2,1,3)] = ((gradv[ijN(0,1,3)] * invJ_20) + (gradv[ijN(1,1, - 3)] * invJ_21) + (gradv[ijN(2,1,3)] * invJ_22)); - - q_gradv[ijN(0,2,3)] = ((gradv[ijN(0,2,3)] * invJ_00) + (gradv[ijN(1,2, - 3)] * invJ_01) + (gradv[ijN(2,2,3)] * invJ_02)); - q_gradv[ijN(1,2,3)] = ((gradv[ijN(0,2,3)] * invJ_10) + (gradv[ijN(1,2, - 3)] * invJ_11) + (gradv[ijN(2,2,3)] * invJ_12)); - q_gradv[ijN(2,2,3)] = ((gradv[ijN(0,2,3)] * invJ_20) + (gradv[ijN(1,2, - 3)] * invJ_21) + (gradv[ijN(2,2,3)] * invJ_22)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)] * quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0, e[ijN(q,el,NUM_QUAD)]); - - const double s = -(GAMMA - 1.0) * q_rho * q_e; - q_stress[ijN(0, 0,3)] = s; q_stress[ijN(1, 0,3)] = 0; q_stress[ijN(2, 0,3)] = 0; - q_stress[ijN(0, 1,3)] = 0; q_stress[ijN(1, 1,3)] = s; q_stress[ijN(2, 1,3)] = 0; - q_stress[ijN(0, 2,3)] = 0; q_stress[ijN(1, 2,3)] = 0; q_stress[ijN(2, 2,3)] = s; - - const double gradv00 = q_gradv[ijN(0, 0,3)]; - const double gradv11 = q_gradv[ijN(1, 1,3)]; - const double gradv22 = q_gradv[ijN(2, 2,3)]; - const double gradv10 = 0.5 * (q_gradv[ijN(1, 0,3)] + q_gradv[ijN(0, 1,3)]); - const double gradv20 = 0.5 * (q_gradv[ijN(2, 0,3)] + q_gradv[ijN(0, 2,3)]); - const double gradv21 = 0.5 * (q_gradv[ijN(2, 1,3)] + q_gradv[ijN(1, 2,3)]); - q_gradv[ijN(1, 0,3)] = gradv10; q_gradv[ijN(2, 0,3)] = gradv20; - q_gradv[ijN(0, 1,3)] = gradv10; q_gradv[ijN(2, 1,3)] = gradv21; - q_gradv[ijN(0, 2,3)] = gradv20; q_gradv[ijN(1, 2,3)] = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00 + gradv11 + gradv22) / 3.0; - const double gradv_q00 = (gradv00 - q_); - const double gradv_q11 = (gradv11 - q_); - const double gradv_q22 = (gradv22 - q_); - - const double p1 = ((gradv10 * 
gradv10) + - (gradv20 * gradv20) + - (gradv21 * gradv21)); - const double p2 = ((gradv_q00 * gradv_q00) + - (gradv_q11 * gradv_q11) + - (gradv_q22 * gradv_q22) + - (2.0 * p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv * (gradv - q*I)) - const double r = (0.5 * pinv * pinv * pinv * - ((gradv_q00 * gradv_q11 * gradv_q22) + - (2.0 * gradv10 * gradv21 * gradv20) - - (gradv_q11 * gradv20 * gradv20) - - (gradv_q22 * gradv10 * gradv10) - - (gradv_q00 * gradv21 * gradv21))); - - double phi = 0; - if (r <= -1.0) - { - phi = M_PI / 3.0; - } - else if (r < 1.0) - { - phi = acos(r) / 3.0; - } - - minEig = q_ + (2.0 * p * cos(phi + (2.0 * M_PI / 3.0))); - const double eig3 = q_ + (2.0 * p * cos(phi)); - const double eig2 = 3.0 * q_ - minEig - eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) - { - const double x = q_gradv[i + 3*0] - (i == 0)*eig3; - const double y = q_gradv[i + 3*1] - (i == 1)*eig3; - const double z = q_gradv[i + 3*2] - (i == 2)*eig3; - const double cx = ((x * (gradv00 - eig2)) + - (y * gradv10) + - (z * gradv20)); - const double cy = ((x * gradv10) + - (y * (gradv11 - eig2)) + - (z * gradv21)); - const double cz = ((x * gradv20) + - (y * gradv21) + - (z * (gradv22 - eig2))); - const double cNorm = (cx*cx + cy*cy + cz*cz); - if ((cNorm > 1e-16) && (maxNorm < cNorm)) - { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = cNorm; - } - } - if (maxNorm > 1e-16) - { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. 
- const double J_00 = J[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_20 = J[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_21 = J[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double J_02 = J[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double J_12 = J[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double J_22 = J[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_20 = invJ0[ijklNM(2, 0, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_21 = invJ0[ijklNM(2, 1, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_02 = invJ0[ijklNM(0, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_12 = invJ0[ijklNM(1, 2, q, el,NUM_DIM,NUM_QUAD)]; - const double invJ0_22 = invJ0[ijklNM(2, 2, q, el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01) + - (J_20 * invJ0_02)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11) + - (J_20 * invJ0_12)); - const double Jpi_20 = ((J_00 * invJ0_20) + (J_10 * invJ0_21) + - (J_20 * invJ0_22)); - - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01) + - (J_21 * invJ0_02)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11) + - (J_21 * invJ0_12)); - const double Jpi_21 = ((J_01 * invJ0_20) + (J_11 * invJ0_21) + - (J_21 * invJ0_22)); - - const double Jpi_02 = ((J_02 * invJ0_00) + (J_12 * invJ0_01) + - (J_22 * invJ0_02)); - const double Jpi_12 = ((J_02 * invJ0_10) + (J_12 * invJ0_11) + - (J_22 * invJ0_12)); - const double Jpi_22 = ((J_02 * invJ0_20) + (J_12 * invJ0_21) + - (J_22 * invJ0_22)); - 
- const double physDirX = ((Jpi_00 * comprDirX) + (Jpi_10 * comprDirY) + - (Jpi_20 * comprDirZ)); - const double physDirY = ((Jpi_01 * comprDirX) + (Jpi_11 * comprDirY) + - (Jpi_21 * comprDirZ)); - const double physDirZ = ((Jpi_02 * comprDirX) + (Jpi_12 * comprDirY) + - (Jpi_22 * comprDirZ)); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY) + - (physDirZ * physDirZ)); - - const double soundSpeed = sqrt(GAMMA * (GAMMA - 1.0) * q_e); - dtEst[ijN(q, el,NUM_QUAD)] = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) - { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - q_stress[ijN(x, y,3)] += coeff * q_gradv[ijN(x, y,3)]; - } - } - } - - const double S00 = q_stress[ijN(0, 0,3)]; - const double S10 = q_stress[ijN(1, 0,3)]; - const double S20 = q_stress[ijN(2, 0,3)]; - const double S01 = q_stress[ijN(0, 1,3)]; - const double S11 = q_stress[ijN(1, 1,3)]; - const double S21 = q_stress[ijN(2, 1,3)]; - const double S02 = q_stress[ijN(0, 2,3)]; - const double S12 = q_stress[ijN(1, 2,3)]; - const double S22 = q_stress[ijN(2, 2,3)]; - - stressJinvT[ijklNM(0, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01) + (S20 * invJ_02)); - stressJinvT[ijklNM(1, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11) + (S20 * invJ_12)); - stressJinvT[ijklNM(2, 0, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S00 * invJ_20) + (S10 * invJ_21) + (S20 * invJ_22)); - - stressJinvT[ijklNM(0, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01) + (S21 * invJ_02)); - stressJinvT[ijklNM(1, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11) + (S21 * invJ_12)); - stressJinvT[ijklNM(2, 1, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S01 * invJ_20) + (S11 * invJ_21) + (S21 * 
invJ_22)); - - stressJinvT[ijklNM(0, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_00) + (S12 * invJ_01) + (S22 * invJ_02)); - stressJinvT[ijklNM(1, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_10) + (S12 * invJ_11) + (S22 * invJ_12)); - stressJinvT[ijklNM(2, 2, q, el,NUM_DIM, - NUM_QUAD)] = q_Jw * ((S02 * invJ_20) + (S12 * invJ_21) + (S22 * invJ_22)); - } - } - } - } -} - -// ***************************************************************************** -typedef void (*fUpdateQuadratureDataS)(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// ***************************************************************************** -void rUpdateQuadratureDataS(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int nzones, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int grid = nzones; - const int b1d = (NUM_QUAD_1D call = - { - // 2D - {0x20,&rUpdateQuadratureData2S<2,2*2,2,2>}, - {0x21,&rUpdateQuadratureData2S<2,4*4,4,3>}, - {0x22,&rUpdateQuadratureData2S<2,6*6,6,4>}, - {0x23,&rUpdateQuadratureData2S<2,8*8,8,5>}, - 
{0x24,&rUpdateQuadratureData2S<2,10*10,10,6>}, - {0x25,&rUpdateQuadratureData2S<2,12*12,12,7>}, - {0x26,&rUpdateQuadratureData2S<2,14*14,14,8>}, - {0x27,&rUpdateQuadratureData2S<2,16*16,16,9>}, - {0x28,&rUpdateQuadratureData2S<2,18*18,18,10>}, - {0x29,&rUpdateQuadratureData2S<2,20*20,20,11>}, - {0x2A,&rUpdateQuadratureData2S<2,22*22,22,12>}, - {0x2B,&rUpdateQuadratureData2S<2,24*24,24,13>}, - {0x2C,&rUpdateQuadratureData2S<2,26*26,26,14>}, - {0x2D,&rUpdateQuadratureData2S<2,28*28,28,15>}, - //{0x2E,&rUpdateQuadratureData2S<2,30*30,30,16>}, uses too much shared data - //{0x2F,&rUpdateQuadratureData2S<2,32*32,32,17>}, uses too much shared data - // 3D - {0x30,&rUpdateQuadratureData3S<3,2*2*2,2,2>}, - {0x31,&rUpdateQuadratureData3S<3,4*4*4,4,3>}, - {0x32,&rUpdateQuadratureData3S<3,6*6*6,6,4>}, - {0x33,&rUpdateQuadratureData3S<3,8*8*8,8,5>}, - {0x34,&rUpdateQuadratureData3S<3,10*10*10,10,6>}, - {0x35,&rUpdateQuadratureData3S<3,12*12*12,12,7>}, - {0x36,&rUpdateQuadratureData3S<3,14*14*14,14,8>}, - {0x37,&rUpdateQuadratureData3S<3,16*16*16,16,9>}, - {0x38,&rUpdateQuadratureData3S<3,18*18*18,18,10>}, - {0x39,&rUpdateQuadratureData3S<3,20*20*20,20,11>}, - {0x3A,&rUpdateQuadratureData3S<3,22*22*22,22,12>}, - {0x3B,&rUpdateQuadratureData3S<3,24*24*24,24,13>}, - {0x3C,&rUpdateQuadratureData3S<3,26*26*26,26,14>}, - {0x3D,&rUpdateQuadratureData3S<3,28*28*28,28,15>}, - {0x3E,&rUpdateQuadratureData3S<3,30*30*30,30,16>}, - {0x3F,&rUpdateQuadratureData3S<3,32*32*32,32,17>}, - }; - if (!call[id]) - { - printf("\n[rUpdateQuadratureDataS] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call0(id,grid,blck, - GAMMA,H0,CFL,USE_VISCOSITY, - nzones,dofToQuad,dofToQuadD,quadWeights, - v,e,rho0DetJ0w,invJ0,J,invJ,detJ, - stressJinvT,dtEst); -} diff --git a/hip/hip/linalg/ode.hpp b/hip/hip/linalg/ode.hpp deleted file mode 100644 index 7ba6c573..00000000 --- a/hip/hip/linalg/ode.hpp +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National 
Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_ODE -#define LAGHOS_HIP_ODE - -namespace mfem -{ - -// *************************************************************************** -class HipODESolver -{ -protected: - HipTimeDependentOperator *f; -public: - HipODESolver() : f(NULL) {} - virtual ~HipODESolver() {} - virtual void Init(HipTimeDependentOperator &f) { this->f = &f; } - virtual void Step(HipVector &x, double &t, double &dt) =0; -}; - -// *************************************************************************** -class HipForwardEulerSolver : public HipODESolver -{ -private: - HipVector dxdt; -public: - void Init(HipTimeDependentOperator &_f) - { - f = &_f; - dxdt.SetSize(f->Width()); - } - void Step(HipVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, dxdt); - x.Add(dt, dxdt); - t += dt; - } -}; - -// *************************************************************************** -class HipRK2Solver : public HipODESolver -{ -private: - double a; - HipVector dxdt, x1; -public: - HipRK2Solver(const double _a = 2./3.) 
: a(_a) { } - void Init(HipTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - dxdt.SetSize(n); - x1.SetSize(n); - } - void Step(HipVector &x, double &t, double &dt) - { - const double b = 0.5/a; - f->SetTime(t); - f->Mult(x, dxdt); - add(x, (1. - b)*dt, dxdt, x1); - x.Add(a*dt, dxdt); - f->SetTime(t + a*dt); - f->Mult(x, dxdt); - add(x1, b*dt, dxdt, x); - t += dt; - } -}; - -// *************************************************************************** -class HipRK3SSPSolver : public HipODESolver -{ -private: - HipVector y, k; -public: - void Init(HipTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - k.SetSize(n); - } - void Step(HipVector &x, double &t, double &dt) - { - // x0 = x, t0 = t, k0 = dt*f(t0, x0) - f->SetTime(t); - f->Mult(x, k); - // x1 = x + k0, t1 = t + dt, k1 = dt*f(t1, x1) - add(x, dt, k, y); - f->SetTime(t + dt); - f->Mult(y, k); - // x2 = 3/4*x + 1/4*(x1 + k1), t2 = t + 1/2*dt, k2 = dt*f(t2, x2) - y.Add(dt, k); - add(3./4, x, 1./4, y, y); - f->SetTime(t + dt/2); - f->Mult(y, k); - // x3 = 1/3*x + 2/3*(x2 + k2), t3 = t + dt - y.Add(dt, k); - add(1./3, x, 2./3, y, x); - t += dt; - } -}; - -// *************************************************************************** -class HipRK4Solver : public HipODESolver -{ -private: - HipVector y, k, z; -public: - void Init(HipTimeDependentOperator &_f) - { - f = &_f; - int n = HipODESolver::f->Width(); - y.SetSize(n); - k.SetSize(n); - z.SetSize(n); - } - - void Step(HipVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k); // k1 - add(x, dt/2, k, y); - add(x, dt/6, k, z); - f->SetTime(t + dt/2); - f->Mult(y, k); // k2 - add(x, dt/2, k, y); - z.Add(dt/3, k); - f->Mult(y, k); // k3 - add(x, dt, k, y); - z.Add(dt/3, k); - f->SetTime(t + dt); - f->Mult(y, k); // k4 - add(z, dt/6, k, x); - t += dt; - } -}; - -// *************************************************************************** -class HipExplicitRKSolver : public HipODESolver -{ -private: - 
int s; - const double *a, *b, *c; - HipVector y, *k; -public: - HipExplicitRKSolver(int _s, const double *_a, - const double *_b, const double *_c) - { - s = _s; - a = _a; - b = _b; - c = _c; - k = new HipVector[s]; - } - void Init(HipTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - for (int i = 0; i < s; i++) - { - k[i].SetSize(n); - } - } - void Step(HipVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k[0]); - for (int l = 0, i = 1; i < s; i++) - { - add(x, a[l++]*dt, k[0], y); - for (int j = 1; j < i; j++) - { - y.Add(a[l++]*dt, k[j]); - } - f->SetTime(t + c[i-1]*dt); - f->Mult(y, k[i]); - } - for (int i = 0; i < s; i++) - { - x.Add(b[i]*dt, k[i]); - } - t += dt; - } - ~HipExplicitRKSolver() - { - delete [] k; - } -}; - -// *************************************************************************** -// *************************************************************************** -static const double RK6_a[28] = -{ - .6e-1, - .1923996296296296296296296296296296296296e-1, - .7669337037037037037037037037037037037037e-1, - .35975e-1, - 0., - .107925, - 1.318683415233148260919747276431735612861, - 0., - -5.042058063628562225427761634715637693344, - 4.220674648395413964508014358283902080483, - -41.87259166432751461803757780644346812905, - 0., - 159.4325621631374917700365669070346830453, - -122.1192135650100309202516203389242140663, - 5.531743066200053768252631238332999150076, - -54.43015693531650433250642051294142461271, - 0., - 207.0672513650184644273657173866509835987, - -158.6108137845899991828742424365058599469, - 6.991816585950242321992597280791793907096, - -.1859723106220323397765171799549294623692e-1, - -54.66374178728197680241215648050386959351, - 0., - 207.9528062553893734515824816699834244238, - -159.2889574744995071508959805871426654216, - 7.018743740796944434698170760964252490817, - -.1833878590504572306472782005141738268361e-1, - -.5119484997882099077875432497245168395840e-3 -}; - -static const double 
RK6_b[8] = -{ - .3438957868357036009278820124728322386520e-1, - 0., - 0., - .2582624555633503404659558098586120858767, - .4209371189673537150642551514069801967032, - 4.405396469669310170148836816197095664891, - -176.4831190242986576151740942499002125029, - 172.3641334014150730294022582711902413315 -}; - -static const double RK6_c[7] = -{ - .6e-1, - .9593333333333333333333333333333333333333e-1, - .1439, - .4973, - .9725, - .9995, - 1., -}; - -class HipRK6Solver : public HipExplicitRKSolver -{ -public: - HipRK6Solver() : HipExplicitRKSolver(8, RK6_a, RK6_b, RK6_c) { } -}; - -} // mfem - -#endif // LAGHOS_HIP_ODE diff --git a/hip/hip/linalg/operator.hpp b/hip/hip/linalg/operator.hpp deleted file mode 100644 index 0c7c8a0b..00000000 --- a/hip/hip/linalg/operator.hpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_OPERATOR -#define LAGHOS_HIP_OPERATOR - -namespace mfem -{ - -// *************************************************************************** -class HipOperator : public rmemcpy -{ -protected: - int height; - int width; -public: - explicit HipOperator(int s = 0) { height = width = s; } - HipOperator(int h, int w) { height = h; width = w; } - inline int Height() const { return height; } - inline int Width() const { return width; } - virtual void Mult(const HipVector &x, HipVector &y) const { assert(false); }; - virtual void MultTranspose(const HipVector &x, HipVector &y) const { assert(false); } - virtual const HipOperator *GetProlongation() const { assert(false); return NULL; } - virtual const HipOperator *GetRestriction() const { assert(false); return NULL; } - virtual void RecoverFEMSolution(const HipVector &X, - const HipVector &b, - HipVector &x) {assert(false);} -}; - - -// *************************************************************************** -class HipTimeDependentOperator : public HipOperator -{ -private: - double t; -public: - explicit HipTimeDependentOperator(int n = 0, - double t_ = 0.0) : HipOperator(n), t(t_) {} - void SetTime(const double _t) { t = _t; } -}; - -// *************************************************************************** -class HipSolverOperator : public HipOperator -{ -public: - bool iterative_mode; - explicit HipSolverOperator(int s = 0, - bool iter_mode = false) : - HipOperator(s), - iterative_mode(iter_mode) { } - virtual void SetOperator(const HipOperator &op) = 0; -}; - -// *************************************************************************** -class HipRAPOperator : public HipOperator -{ -private: - const HipOperator &Rt; - const HipOperator &A; - const HipOperator &P; - mutable HipVector Px; - mutable HipVector APx; -public: - /// Construct the RAP operator given R^T, A and P. 
- HipRAPOperator(const HipOperator &Rt_, const HipOperator &A_, - const HipOperator &P_) - : HipOperator(Rt_.Width(), P_.Width()), Rt(Rt_), A(A_), P(P_), - Px(P.Height()), APx(A.Height()) { } - /// Operator application. - void Mult(const HipVector & x, HipVector & y) const - { - P.Mult(x, Px); - A.Mult(Px, APx); - Rt.MultTranspose(APx, y); - } - /// Application of the transpose. - void MultTranspose(const HipVector & x, HipVector & y) const - { - Rt.Mult(x, APx); - A.MultTranspose(APx, Px); - P.MultTranspose(Px, y); - } -}; - -} // mfem - -#endif // LAGHOS_HIP_OPERATOR diff --git a/hip/hip/linalg/solvers.cpp b/hip/hip/linalg/solvers.cpp deleted file mode 100644 index 68bd69ae..00000000 --- a/hip/hip/linalg/solvers.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../hip.hpp" - -namespace mfem -{ - -// ************************************************************************* -void HipCGSolver::h_Mult(const HipVector &b, HipVector &x) const -{ - int i; - double r0, den, nom, nom0, betanom, alpha, beta; - if (iterative_mode) - { - oper->Mult(x, r); - subtract(b, r, r); // r = b - A x - } - else - { - r = b; - x = 0.0; - } - - if (prec) - { - prec->Mult(r, z); // z = B r - d = z; - } - else - { - d = r; - } - - nom0 = nom = Dot(d, r); - MFEM_ASSERT(IsFinite(nom), "nom = " << nom); - - if (print_level == 1 - || print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom << (print_level == 3 ? 
" ...\n" : "\n"); - } - - r0 = std::max(nom*rel_tol*rel_tol,abs_tol*abs_tol); - - if (nom <= r0) - { - converged = 1; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - oper->Mult(d, z); // z = A d - - den = Dot(z, d); - MFEM_ASSERT(IsFinite(den), "den = " << den); - - if (print_level >= 0 && den < 0.0) - { - mfem::out << "Negative denominator in step 0 of PCG: " << den << '\n'; - } - - if (den == 0.0) - { - converged = 0; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - // start iteration - converged = 0; - final_iter = max_iter; - for (i = 1; true; ) - { - alpha = nom/den; - add(x, alpha, d, x); // x = x + alpha d - add(r, -alpha, z, r); // r = r - alpha A d - - if (prec) - { - prec->Mult(r, z); // z = B r - betanom = Dot(r, z); - } - else - { - betanom = Dot(r, r); - } - MFEM_ASSERT(IsFinite(betanom), "betanom = " << betanom); - - if (print_level == 1) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - - if (betanom < r0) - { - if (print_level == 2) - { - mfem::out << "Number of PCG iterations: " << i << '\n'; - } - else if (print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - converged = 1; - final_iter = i; - break; - } - - if (++i > max_iter) - { - break; - } - - beta = betanom/nom; - if (prec) - { - add(z, beta, d, d); // d = z + beta d - } - else - { - add(r, beta, d, d); - } - - oper->Mult(d, z); // z = A d - den = Dot(d, z); - - MFEM_ASSERT(IsFinite(den), "den = " << den); - if (den <= 0.0) - { - if (print_level >= 0 && Dot(d, d) > 0.0) - mfem::out << "PCG: The operator is not positive definite. 
(Ad, d) = " - << den << '\n'; - } - nom = betanom; - } - - if (print_level >= 0 && !converged) - { - if (print_level != 1) - { - if (print_level != 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom0 << " ...\n"; - } - mfem::out << " Iteration : " << std::setw(3) << final_iter << " (B r, r) = " - << betanom << '\n'; - } - mfem::out << "PCG: No convergence!" << '\n'; - } - - if (print_level >= 1 || (print_level >= 0 && !converged)) - { - mfem::out << "Average reduction factor = " - << pow (betanom/nom0, 0.5/final_iter) << '\n'; - } - final_norm = sqrt(betanom); -} - -} // mfem diff --git a/hip/hip/linalg/solvers.hpp b/hip/hip/linalg/solvers.hpp deleted file mode 100644 index 4b807186..00000000 --- a/hip/hip/linalg/solvers.hpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_HIP_SOLVERS -#define LAGHOS_HIP_SOLVERS - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -class HipIterativeSolver : public HipSolverOperator -{ -#ifdef MFEM_USE_MPI -private: - int dot_prod_type; // 0 - local, 1 - global over 'comm' - MPI_Comm comm; -#endif -protected: - const HipOperator *oper; - HipSolverOperator *prec; - int max_iter, print_level; - double rel_tol, abs_tol; - // stats - mutable int final_iter, converged; - mutable double final_norm; - double Dot(const HipVector &x, - const HipVector &y) const - { -#ifndef MFEM_USE_MPI - return (x * y); -#else - if (dot_prod_type == 0) - { - return (x * y); - } - double local_dot = (x * y); - double global_dot; - MPI_Allreduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, comm); - return global_dot; -#endif - } - double Norm(const HipVector &x) const { return sqrt(Dot(x, x)); } -public: - HipIterativeSolver(): HipSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; -#ifdef MFEM_USE_MPI - dot_prod_type = 0; -#endif - } - -#ifdef MFEM_USE_MPI - HipIterativeSolver(MPI_Comm _comm) - : HipSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; - dot_prod_type = 1; - comm = _comm; - } -#endif - - void SetRelTol(double rtol) { rel_tol = rtol; } - void SetAbsTol(double atol) { abs_tol = atol; } - void SetMaxIter(int max_it) { max_iter = max_it; } - void SetPrintLevel(int print_lvl) - { -#ifndef MFEM_USE_MPI - print_level = print_lvl; -#else - if (dot_prod_type == 0) - { - print_level = print_lvl; - } - else - { - int rank; - MPI_Comm_rank(comm, &rank); - if (rank == 0) - { - print_level = print_lvl; - } - } -#endif - } - int GetNumIterations() const { return final_iter; } - int GetConverged() const { return converged; } - double GetFinalNorm() const { return final_norm; } - 
/// This should be called before SetOperator - virtual void SetPreconditioner(HipSolverOperator &pr) - { - prec = ≺ - prec->iterative_mode = false; - } - /// Also calls SetOperator for the preconditioner if there is one - virtual void SetOperator(const HipOperator &op) - { - oper = &op; - height = op.Height(); - width = op.Width(); - if (prec) - { - prec->SetOperator(*oper); - } - } -}; - -// *************************************************************************** -// Conjugate gradient method -// *************************************************************************** -class HipCGSolver : public HipIterativeSolver -{ -protected: - mutable HipVector r, d, z; - void UpdateVectors() - { - r.SetSize(width); - d.SetSize(width); - z.SetSize(width); - } -public: - HipCGSolver() { } -#ifdef MFEM_USE_MPI - HipCGSolver(MPI_Comm _comm) : HipIterativeSolver(_comm) { } -#endif - virtual void SetOperator(const HipOperator &op) - { - HipIterativeSolver::SetOperator(op); - UpdateVectors(); - } - void h_Mult(const HipVector &b, HipVector &x) const ; - virtual void Mult(const HipVector &b, HipVector &x) const - { - h_Mult(b,x); - } -}; - -} // mfem - -#endif // LAGHOS_HIP_SOLVERS diff --git a/hip/hip/linalg/vector.cpp b/hip/hip/linalg/vector.cpp deleted file mode 100644 index 67714853..00000000 --- a/hip/hip/linalg/vector.cpp +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../hip.hpp" - -namespace mfem -{ - -HipVector::~HipVector() -{ - if (!own) { return; } - rmalloc::operator delete (data); -} - -// *************************************************************************** -double* HipVector::alloc(const size_t sz) -{ - return (double*) rmalloc::operator new (sz); -} - -// *************************************************************************** -void HipVector::SetSize(const size_t sz, const void* ptr) -{ - own=true; - size = sz; - if (!data) { data = alloc(sz); } - if (ptr) { rDtoD(data,ptr,bytes()); } -} - -// *************************************************************************** -HipVector::HipVector(const size_t sz):size(sz),data(alloc(sz)),own(true) {} -HipVector::HipVector(const size_t sz,double value): - size(sz),data(alloc(sz)),own(true) -{ - *this=value; -} - -HipVector::HipVector(const HipVector& v): - size(0),data(NULL),own(true) { SetSize(v.Size(), v); } - -HipVector::HipVector(const HipVector *v):size(v->size),data(v->data), - own(false) {} - -HipVector::HipVector(HipArray& v):size(v.size()),data(v.ptr()), - own(false) {} - -// Host 2 Device *************************************************************** -HipVector::HipVector(const Vector& v):size(v.Size()),data(alloc(size)), - own(true) -{ - assert(v.GetData()); - rmemcpy::rHtoD(data,v.GetData(),size*sizeof(double)); -} - -// Device 2 Host *************************************************************** -HipVector::operator Vector() -{ - if (!rconfig::Get().Hip()) { return Vector(data,size); } - double *h_data= (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -HipVector::operator Vector() const -{ - if (!rconfig::Get().Hip()) { return Vector(data,size); } - double *h_data= (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -// 
*************************************************************************** -void HipVector::Print(std::ostream& out, int width) const -{ - double *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i &ess_tdofs, - const double value, - const int N) -{ - vector_set_subvector_const(N, value, data, ess_tdofs.ptr()); -} - - -// *************************************************************************** -double HipVector::Min() const -{ - return vector_min(Size(),(double*)data); -} - -// *************************************************************************** -void add(const HipVector& v1, const double alpha, - const HipVector& v2, HipVector& out) -{ - vector_xpay(out.Size(),alpha,out.ptr(),v1.ptr(),v2.ptr()); -} - -// ***************************************************************************** -void add(const double alpha, - const HipVector& v1, - const double beta, - const HipVector& v2, - HipVector& out) { assert(false); } - -// *************************************************************************** -void subtract(const HipVector& v1, - const HipVector& v2, - HipVector& out) -{ - vector_xsy(out.Size(),out.ptr(),v1.ptr(),v2.ptr()); -} - -} // mfem diff --git a/hip/hip/linalg/vector.hpp b/hip/hip/linalg/vector.hpp deleted file mode 100644 index 51448bb0..00000000 --- a/hip/hip/linalg/vector.hpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_HIP_VECTOR -#define LAGHOS_HIP_VECTOR - -namespace mfem -{ - -class HipVector : public rmalloc -{ -private: - size_t size = 0; - double* data = NULL; - bool own = true; -public: - HipVector(): size(0),data(NULL),own(true) {} - HipVector(const HipVector&); - HipVector(const HipVector*); - HipVector(const size_t); - HipVector(const size_t,double); - HipVector(const Vector& v); - HipVector(HipArray& v); - operator Vector(); - operator Vector() const; - double* alloc(const size_t); - inline double* ptr() const { return data;} - inline double* GetData() const { return data;} - inline operator double* () { return data; } - inline operator const double* () const { return data; } - void Print(std::ostream& = std::cout, int = 8) const; - void SetSize(const size_t,const void* =NULL); - inline size_t Size() const { return size; } - inline size_t bytes() const { return size*sizeof(double); } - double operator* (const HipVector& v) const; - HipVector& operator = (const HipVector& v); - HipVector& operator = (const Vector& v); - HipVector& operator = (double value); - HipVector& operator -= (const HipVector& v); - HipVector& operator += (const HipVector& v); - HipVector& operator += (const Vector& v); - HipVector& operator *=(const double d); - HipVector& Add(const double a, const HipVector& Va); - void Neg(); - HipVector* GetRange(const size_t, const size_t) const; - void SetSubVector(const HipArray &, const double, const int); - double Min() const; - ~HipVector(); -}; - -// 
*************************************************************************** -void add(const HipVector&,const double,const HipVector&,HipVector&); -void add(const HipVector&,const HipVector&,HipVector&); -void add(const double,const HipVector&,const double,const HipVector&, - HipVector&); -void subtract(const HipVector&,const HipVector&,HipVector&); - -} - -#endif // LAGHOS_HIP_VECTOR diff --git a/hip/laghos.cpp b/hip/laghos.cpp deleted file mode 100644 index e1a14dc3..00000000 --- a/hip/laghos.cpp +++ /dev/null @@ -1,664 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-// -// __ __ -// / / ____ ____ / /_ ____ _____ -// / / / __ `/ __ `/ __ \/ __ \/ ___/ -// / /___/ /_/ / /_/ / / / / /_/ (__ ) -// /_____/\__,_/\__, /_/ /_/\____/____/ -// /____/ -// -// High-order Lagrangian Hydrodynamics Miniapp -// -// HIP version -// -// Laghos(LAGrangian High-Order Solver) is a miniapp that solves the -// time-dependent Euler equation of compressible gas dynamics in a moving -// Lagrangian frame using unstructured high-order finite element spatial -// discretization and explicit high-order time-stepping. Laghos is based on the -// numerical algorithm described in the following article: -// -// V. Dobrev, Tz. Kolev and R. Rieben, "High-order curvilinear finite element -// methods for Lagrangian hydrodynamics", SIAM Journal on Scientific -// Computing, (34) 2012, pp.B606–B641, https://doi.org/10.1137/120864672. -// -// Sample runs: -// mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/square01_tri.mesh -rs 1 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 2.0 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 0 -tf 0.8 -ok 7 -ot 6 -// mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -// mpirun -np 8 laghos -p 2 -m data/segment01.mesh -rs 5 -tf 0.2 -// mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 2.5 -// mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 2.5 -// -// Test problems: -// p = 0 --> Taylor-Green vortex (smooth problem). -// p = 1 --> Sedov blast. -// p = 2 --> 1D Sod shock tube. -// p = 3 --> Triple point. - - -#include "laghos_solver.hpp" -#include -#include -#include -#include - -using namespace std; -using namespace mfem; -using namespace mfem::hydrodynamics; - -// Choice for the problem setup. -int problem = 0; - -void display_banner(ostream & os); - -int main(int argc, char *argv[]) -{ - // Initialize MPI. 
- MPI_Session mpi(argc, argv); - int myid = mpi.WorldRank(); - - // Print the banner. - if (mpi.Root()) { display_banner(cout); } - - // Parse command-line options. - const char *mesh_file = "../data/square01_quad.mesh"; - int rs_levels = 0; - int rp_levels = 0; - int order_v = 2; - int order_e = 1; - int ode_solver_type = 4; - double t_final = 0.5; - double cfl = 0.5; - double cg_tol = 1e-8; - int cg_max_iter = 300; - int max_tsteps = -1; - bool p_assembly = true; - bool visualization = false; - int vis_steps = 5; - bool visit = false; - bool gfprint = false; - const bool hip = true; - bool aware = false; - bool share = false; - bool hcpo = false; // do Host Conforming Prolongation Operation - bool sync = false; - - const char *basename = "results/Laghos"; - OptionsParser args(argc, argv); - // Standard Options ********************************************************* - args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); - args.AddOption(&rs_levels, "-rs", "--refine-serial", - "Number of times to refine the mesh uniformly in serial."); - args.AddOption(&rp_levels, "-rp", "--refine-parallel", - "Number of times to refine the mesh uniformly in parallel."); - args.AddOption(&problem, "-p", "--problem", "Problem setup to use."); - args.AddOption(&order_v, "-ok", "--order-kinematic", - "Order (degree) of the kinematic finite element space."); - args.AddOption(&order_e, "-ot", "--order-thermo", - "Order (degree) of the thermodynamic finite element space."); - args.AddOption(&ode_solver_type, "-s", "--ode-solver", - "ODE solver: 1 - Forward Euler,\n\t" - " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6."); - args.AddOption(&t_final, "-tf", "--t-final", - "Final time; start time is 0."); - args.AddOption(&cfl, "-cfl", "--cfl", "CFL-condition number."); - args.AddOption(&cg_tol, "-cgt", "--cg-tol", - "Relative CG tolerance (velocity linear solve)."); - args.AddOption(&cg_max_iter, "-cgm", "--cg-max-steps", - "Maximum number of CG iterations (velocity linear 
solve)."); - args.AddOption(&max_tsteps, "-ms", "--max-steps", - "Maximum number of steps (negative means no restriction)."); - args.AddOption(&p_assembly, "-pa", "--partial-assembly", "-fa", - "--full-assembly", - "Activate 1D tensor-based assembly (partial assembly)."); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or disable GLVis visualization."); - args.AddOption(&vis_steps, "-vs", "--visualization-steps", - "Visualize every n-th timestep."); - args.AddOption(&visit, "-visit", "--visit", "-no-visit", "--no-visit", - "Enable or disable VisIt visualization."); - args.AddOption(&gfprint, "-print", "--print", "-no-print", "--no-print", - "Enable or disable result output (files in mfem format)."); - args.AddOption(&basename, "-k", "--outputfilename", - "Name of the visit dump files"); - // HIP Options ************************************************************* - args.AddOption(&aware, "-aware", "--aware", "-no-aware", "--no-aware", - "Enable or disable MPI HIP Aware (GPUDirect)."); - args.AddOption(&hcpo, "-hcpo", "--hcpo", "-not-hcpo", "--no-hcpo", - "Enable or disable Host Conforming Prolongation Operations,\n" - "\twhich transfers ALL the data to the host before communications."); - args.AddOption(&sync, "-sync", "--sync", "-no-sync", "--no-sync", - "Enable or disable Enforced Kernel Synchronization."); - // Not usable Options ******************************************************* - args.AddOption(&share, "-share", "--share", "-no-share", "--no-share", - "Enable or disable SHARE kernels (WIP, not usable)."); - args.Parse(); - if (!args.Good()) - { - if (mpi.Root()) { args.PrintUsage(cout); } - return 1; - } - if (mpi.Root()) { args.PrintOptions(cout); } - - // HIP set device & options - // ************************************************************************** - rconfig::Get().Setup(mpi.WorldRank(),mpi.WorldSize(), - hip,aware,share,hcpo,sync,rs_levels); - - // Read the serial mesh from the given 
mesh file on all processors. - // Refine the mesh in serial to increase the resolution. - Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } - - if (p_assembly && dim == 1) - { - p_assembly = false; - if (mpi.Root()) - { - cout << "Laghos does not support PA in 1D. Switching to FA." << endl; - } - } - - // Parallel partitioning of the mesh. - // ************************************************************************** - ParMesh *pmesh = NULL; - const int num_tasks = mpi.WorldSize(); - const int partitions = floor(pow(num_tasks, 1.0 / dim) + 1e-2); - int *nxyz = new int[dim]; - int product = 1; - for (int d = 0; d < dim; d++) - { - nxyz[d] = partitions; - product *= partitions; - } - if (product == num_tasks) - { - if (myid == 0) - { - printf("\033[32m[laghos] \033[32;1mCartesian\033[m\033[32m partitioning will be used\033[m\n"); - } - int *partitioning = mesh->CartesianPartitioning(nxyz); - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); - delete[] partitioning; - } - else - { - if (myid == 0) - { - printf("\033[32m[laghos] Non-Cartesian partitioning through METIS will be used\033[m\n"); -#ifndef MFEM_USE_METIS - cout << "MFEM was built without METIS. " - << "Adjust the number of tasks to use a Cartesian split." 
<< endl; -#endif - } -#ifndef MFEM_USE_METIS - return 1; -#endif - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh); - } - delete [] nxyz; - delete mesh; - - // ************************************************************************** - // We need at least some elements in each partition for now -#ifdef MFEM_USE_MPI - int global_pmesh_NE; - const int pmesh_NE=pmesh->GetNE(); - MPI_Allreduce(&pmesh_NE,&global_pmesh_NE,1,MPI_INT,MPI_MIN,pmesh->GetComm()); - if (global_pmesh_NE==0) { printf("[Laghos] ERROR: pmesh->GetNE()==0!"); return 1;} - else { printf("\033[32m[laghos] pmesh->GetNE()=%d\033[m\n",global_pmesh_NE); } - assert(pmesh->GetNE()>0); -#endif - - // Refine the mesh further in parallel to increase the resolution. - for (int lev = 0; lev < rp_levels; lev++) { pmesh->UniformRefinement(); } - - // Define the parallel finite element spaces. We use: - // - H1 (Gauss-Lobatto, continuous) for position and velocity. - // - L2 (Bernstein, discontinuous) for specific internal energy. - L2_FECollection L2FEC(order_e, dim, BasisType::Positive); - H1_FECollection H1FEC(order_v, dim); - HipFiniteElementSpace L2FESpace(pmesh, &L2FEC); - HipFiniteElementSpace H1FESpace(pmesh, &H1FEC, pmesh->Dimension()); - - // Boundary conditions: all tests use v.n = 0 on the boundary, - // and we assume that the boundaries are straight. - Array essential_tdofs; - { - Array ess_bdr(pmesh->bdr_attributes.Max()), tdofs1d; - for (int d = 0; d < pmesh->Dimension(); d++) - { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[d] = 1; - H1FESpace.GetEssentialTrueDofs(ess_bdr, tdofs1d, d); - essential_tdofs.Append(tdofs1d); - } - } - - // Define the explicit ODE solver used for time integration. 
- HipODESolver *ode_solver = NULL; - switch (ode_solver_type) - { - case 1: ode_solver = new HipForwardEulerSolver; break; - case 2: ode_solver = new HipRK2Solver(0.5); break; - case 3: ode_solver = new HipRK3SSPSolver; break; - case 4: ode_solver = new HipRK4Solver; break; - case 6: ode_solver = new HipRK6Solver; break; - default: - if (myid == 0) - { - cout << "Unknown ODE solver type: " << ode_solver_type << '\n'; - } - delete pmesh; - MPI_Finalize(); - return 3; - } - - HYPRE_Int glob_size_l2 = L2FESpace.GlobalTrueVSize(); - HYPRE_Int glob_size_h1 = H1FESpace.GlobalTrueVSize(); - - if (mpi.Root()) - { - cout << "Number of kinematic (position, velocity) dofs: " - << glob_size_h1 << endl; - cout << "Number of specific internal energy dofs: " - << glob_size_l2 << endl<< endl; - } - - int Vsize_l2 = L2FESpace.GetVSize(); - int Vsize_h1 = H1FESpace.GetVSize(); - - // The monolithic BlockVector stores unknown fields as: - // - 0 -> position - // - 1 -> velocity - // - 2 -> specific internal energy - Array true_offset(4); - true_offset[0] = 0; - true_offset[1] = true_offset[0] + Vsize_h1; - true_offset[2] = true_offset[1] + Vsize_h1; - true_offset[3] = true_offset[2] + Vsize_l2; - HipVector S(true_offset[3]); - - // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always - // compute the density values given the current mesh position, using the - // property of pointwise mass conservation. - ParGridFunction x_gf(&H1FESpace); - ParGridFunction v_gf(&H1FESpace); - ParGridFunction e_gf(&L2FESpace); - - HipGridFunction d_x_gf(H1FESpace, S.GetRange(true_offset[0], true_offset[1])); - HipGridFunction d_v_gf(H1FESpace, S.GetRange(true_offset[1], true_offset[2])); - HipGridFunction d_e_gf(L2FESpace, S.GetRange(true_offset[2], true_offset[3])); - - // Initialize x_gf using the starting mesh coordinates. This also links the - // mesh positions to the values in x_gf. 
- pmesh->SetNodalGridFunction(&x_gf); - d_x_gf = x_gf; - - // Initialize the velocity. - VectorFunctionCoefficient v_coeff(pmesh->Dimension(), v0); - v_gf.ProjectCoefficient(v_coeff); - d_v_gf = v_gf; - - // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. Then we do an - // L2 projection to the positive basis in which we actually compute. The goal - // is to get a high-order representation of the initial condition. Note that - // this density is a temporary function and it will not be updated during the - // time evolution. - ParGridFunction rho(&L2FESpace); - FunctionCoefficient rho_coeff(hydrodynamics::rho0); - L2_FECollection l2_fec(order_e, pmesh->Dimension()); - HipFiniteElementSpace l2_fes(pmesh, &l2_fec); - ParGridFunction l2_rho(&l2_fes), l2_e(&l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); - HipGridFunction d_rho(L2FESpace); - d_rho = rho; - if (problem == 1) - { - // For the Sedov test, we use a delta function at the origin. - DeltaCoefficient e_coeff(0, 0, 0.25); - l2_e.ProjectCoefficient(e_coeff); - } - else - { - FunctionCoefficient e_coeff(e0); - l2_e.ProjectCoefficient(e_coeff); - } - e_gf.ProjectGridFunction(l2_e); - d_e_gf = e_gf; - - Coefficient *material_pcf = new FunctionCoefficient(hydrodynamics::gamma); - - // Additional details, depending on the problem. 
- int source = 0; bool visc=false; - switch (problem) - { - case 0: if (pmesh->Dimension() == 2) { source = 1; } - visc = false; break; - case 1: visc = true; break; - case 2: visc = true; break; - case 3: visc = true; break; - default: MFEM_ABORT("Wrong problem specification!"); - } - - LagrangianHydroOperator oper(S.Size(), H1FESpace, L2FESpace, - essential_tdofs, d_rho, source, cfl, material_pcf, - visc, p_assembly, cg_tol, cg_max_iter); - - socketstream vis_rho, vis_v, vis_e; - char vishost[] = "localhost"; - int visport = 19916; - - ParGridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - if (visualization) - { - // Make sure all MPI ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - vis_rho.precision(8); - vis_v.precision(8); - vis_e.precision(8); - - int Wx = 0, Wy = 0; // window position - const int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets - - if (problem != 0 && problem != 4) - { - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - } - - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww, Wh); - } - - // Save data for VisIt visualization - VisItDataCollection visit_dc(basename, pmesh); - if (visit) - { - visit_dc.RegisterField("Density", &rho_gf); - visit_dc.RegisterField("Velocity", &v_gf); - visit_dc.RegisterField("Specific Internal Energy", &e_gf); - visit_dc.SetCycle(0); - visit_dc.SetTime(0.0); - visit_dc.Save(); - } - - // Perform time-integration (looping over the time iterations, ti, with a - // time-step dt). The object oper is of type LagrangianHydroOperator that - // defines the Mult() method that used by the time integrators. 
- ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; - bool last_step = false; - int steps = 0; - HipVector S_old(S); - - for (int ti = 1; !last_step; ti++) - { - if (t + dt >= t_final) - { - dt = t_final - t; - last_step = true; - } - if (steps == max_tsteps) { last_step = true; } - - S_old = S; - t_old = t; - oper.ResetTimeStepEstimate(); - - // S is the vector of dofs, t is the current time, - // and dt is the time step to advance. - ode_solver->Step(S, t, dt); - steps++; - - // Make sure that the mesh corresponds to the new solution state. - x_gf = d_x_gf; - pmesh->NewNodes(x_gf, false); - - // Adaptive time step control. - const double dt_est = oper.GetTimeStepEstimate(S); - if (dt_est < dt) - { - // Repeat (solve again) with a decreased time step - decrease of the - // time estimate suggests appearance of oscillations. - dt *= 0.85; - if (dt < numeric_limits::epsilon()) - { MFEM_ABORT("The time step crashed!"); } - t = t_old; - S = S_old; - oper.ResetQuadratureData(); - if (mpi.Root()) { cout << "Repeating step " << ti << endl; } - ti--; continue; - } - else if (dt_est > 1.25 * dt) { dt *= 1.02; } - - - if (last_step || (ti % vis_steps) == 0) - { - double loc_norm = d_e_gf * d_e_gf, tot_norm; - MPI_Allreduce(&loc_norm, &tot_norm, 1, MPI_DOUBLE, MPI_SUM, - pmesh->GetComm()); - if (mpi.Root()) - { - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(tot_norm) << endl; - } - - // Make sure all ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - if (visualization || visit || gfprint) { oper.ComputeDensity(rho_gf); } - if (visualization) - { - int Wx = 0, Wy = 0; // window position - int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets 
- - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww,Wh); - Wx += offx; - } - - if (visit) - { - visit_dc.SetCycle(ti); - visit_dc.SetTime(t); - visit_dc.Save(); - } - - if (gfprint) - { - ostringstream mesh_name, rho_name, v_name, e_name; - mesh_name << basename << "_" << ti - << "_mesh." << setfill('0') << setw(6) << myid; - rho_name << basename << "_" << ti - << "_rho." << setfill('0') << setw(6) << myid; - v_name << basename << "_" << ti - << "_v." << setfill('0') << setw(6) << myid; - e_name << basename << "_" << ti - << "_e." << setfill('0') << setw(6) << myid; - - ofstream mesh_ofs(mesh_name.str().c_str()); - mesh_ofs.precision(8); - pmesh->Print(mesh_ofs); - mesh_ofs.close(); - - ofstream rho_ofs(rho_name.str().c_str()); - rho_ofs.precision(8); - rho_gf.Save(rho_ofs); - rho_ofs.close(); - - ofstream v_ofs(v_name.str().c_str()); - v_ofs.precision(8); - v_gf.Save(v_ofs); - v_ofs.close(); - - ofstream e_ofs(e_name.str().c_str()); - e_ofs.precision(8); - e_gf.Save(e_ofs); - e_ofs.close(); - } - } - } - - switch (ode_solver_type) - { - case 2: steps *= 2; break; - case 3: steps *= 3; break; - case 4: steps *= 4; break; - case 6: steps *= 6; - } - oper.PrintTimingData(mpi.Root(), steps); - - if (visualization) - { - vis_v.close(); - vis_e.close(); - } - - // Free the used memory. 
- delete ode_solver; - delete pmesh; - delete material_pcf; - return 0; -} - -namespace mfem -{ - -namespace hydrodynamics -{ - -double rho0(const Vector &x) -{ - switch (problem) - { - case 0: return 1.0; - case 1: return 1.0; - case 2: if (x(0) < 0.5) { return 1.0; } - else { return 0.1; } - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.0; } - else { return 0.125; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -double gamma(const Vector &x) -{ - switch (problem) - { - case 0: return 5./3.; - case 1: return 1.4; - case 2: return 1.4; - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.4; } - else { return 1.5; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -void v0(const Vector &x, Vector &v) -{ - switch (problem) - { - case 0: - v(0) = sin(M_PI*x(0)) * cos(M_PI*x(1)); - v(1) = -cos(M_PI*x(0)) * sin(M_PI*x(1)); - if (x.Size() == 3) - { - v(0) *= cos(M_PI*x(2)); - v(1) *= cos(M_PI*x(2)); - v(2) = 0.0; - } - break; - case 1: v = 0.0; break; - case 2: v = 0.0; break; - case 3: v = 0.0; break; - default: MFEM_ABORT("Bad number given for problem id!"); - } -} - -double e0(const Vector &x) -{ - switch (problem) - { - case 0: - { - const double denom = 2.0 / 3.0; // (5/3 - 1) * density. - double val; - if (x.Size() == 2) - { - val = 1.0 + (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) / 4.0; - } - else - { - val = 100.0 + ((cos(2*M_PI*x(2)) + 2) * - (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) - 2) / 16.0; - } - return val/denom; - } - case 1: return 0.0; // This case in initialized in main(). 
- case 2: if (x(0) < 0.5) { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - else { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - case 3: if (x(0) > 1.0) { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - else { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -} // namespace hydrodynamics - -} // namespace mfem - -void display_banner(ostream & os) -{ - os << endl - << " __ __ " << endl - << " / / ____ ____ / /_ ____ _____ " << endl - << " / / / __ `/ __ `/ __ \\/ __ \\/ ___/ " << endl - << " / /___/ /_/ / /_/ / / / / /_/ (__ ) " << endl - << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl - << " /____/ " << endl << endl; -} diff --git a/hip/laghos_assembly.cpp b/hip/laghos_assembly.cpp deleted file mode 100644 index 0d171d9c..00000000 --- a/hip/laghos_assembly.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project (17-SC-20-SC) -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -QuadratureData::QuadratureData(int dim, - int nzones, - int nqp) -{ Setup(dim, nzones, nqp); } - - -void QuadratureData::Setup(int dim, - int nzones, - int nqp) -{ - rho0DetJ0w.SetSize(nqp * nzones); - stressJinvT.SetSize(dim * dim * nqp * nzones); - dtEst.SetSize(nqp * nzones); -} - -void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect) -{ - const int ip_cnt = integ_rule.GetNPoints(); - Vector shape(fe.GetDof()); - Vector rho0DetJ0w = quad_data.rho0DetJ0w; - elvect.SetSize(fe.GetDof()); - elvect = 0.0; - for (int q = 0; q < ip_cnt; q++) - { - fe.CalcShape(integ_rule.IntPoint(q), shape); - shape *= rho0DetJ0w(Tr.ElementNo*ip_cnt + q); - elvect += shape; - } -} - -// ***************************************************************************** -HipMassOperator::HipMassOperator(HipFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : HipOperator(fes_.GetTrueVSize()), - fes(fes_), - integ_rule(integ_rule_), - ess_tdofs_count(0), - bilinearForm(&fes), - quad_data(quad_data_), - x_gf(fes), - y_gf(fes) {} - -// ***************************************************************************** -HipMassOperator::~HipMassOperator() -{ -} - -// ***************************************************************************** -void HipMassOperator::Setup() -{ - dim=fes.GetMesh()->Dimension(); - nzones=fes.GetMesh()->GetNE(); - HipMassIntegrator &massInteg = *(new HipMassIntegrator()); - massInteg.SetIntegrationRule(integ_rule); - massInteg.SetOperator(quad_data->rho0DetJ0w); - bilinearForm.AddDomainIntegrator(&massInteg); - bilinearForm.Assemble(); - bilinearForm.FormOperator(Array(), massOperator); -} - -// ************************************************************************* -void HipMassOperator::SetEssentialTrueDofs(Array &dofs) -{ - 
ess_tdofs_count = dofs.Size(); - if (ess_tdofs.Size()==0) - { -#ifdef MFEM_USE_MPI - int global_ess_tdofs_count; - const MPI_Comm comm = fes.GetParMesh()->GetComm(); - MPI_Allreduce(&ess_tdofs_count,&global_ess_tdofs_count, - 1, MPI_INT, MPI_SUM, comm); - assert(global_ess_tdofs_count>0); - ess_tdofs.allocate(global_ess_tdofs_count); -#else - assert(ess_tdofs_count>0); - ess_tdofs.allocate(ess_tdofs_count); -#endif - } - else { assert(ess_tdofs_count<=ess_tdofs.Size()); } - assert(ess_tdofs.ptr()); - if (ess_tdofs_count == 0) { return; } - assert(ess_tdofs_count>0); - assert(dofs.GetData()); - rHtoD(ess_tdofs.ptr(),dofs.GetData(),ess_tdofs_count*sizeof(int)); -} - -// ***************************************************************************** -void HipMassOperator::EliminateRHS(HipVector &b) -{ - if (ess_tdofs_count > 0) - { - b.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - -// ************************************************************************* -void HipMassOperator::Mult(const HipVector &x, HipVector &y) const -{ - distX = x; - if (ess_tdofs_count) - { - distX.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } - massOperator->Mult(distX, y); - if (ess_tdofs_count) - { - y.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - - -// ***************************************************************************** -// * HipForceOperator -// ***************************************************************************** -HipForceOperator::HipForceOperator(HipFiniteElementSpace &h1fes_, - HipFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule_, - const QuadratureData *quad_data_) - : HipOperator(l2fes_.GetTrueVSize(), h1fes_.GetTrueVSize()), - dim(h1fes_.GetMesh()->Dimension()), - nzones(h1fes_.GetMesh()->GetNE()), - h1fes(h1fes_), - l2fes(l2fes_), - integ_rule(integ_rule_), - quad_data(quad_data_), - gVecL2(l2fes.GetLocalDofs() * nzones), - gVecH1(h1fes.GetVDim() * h1fes.GetLocalDofs() * nzones) { } - -// 
***************************************************************************** -HipForceOperator::~HipForceOperator() {} - -// ************************************************************************* -void HipForceOperator::Setup() -{ - h1D2Q = HipDofQuadMaps::Get(h1fes, integ_rule); - l2D2Q = HipDofQuadMaps::Get(l2fes, integ_rule); -} - -// ************************************************************************* -void HipForceOperator::Mult(const HipVector &vecL2, - HipVector &vecH1) const -{ - l2fes.GlobalToLocal(vecL2, gVecL2); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - if (rconfig::Get().Share()) - rForceMultS(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->dofToQuad, - h1D2Q->quadToDof, - h1D2Q->quadToDofD, - quad_data->stressJinvT, - gVecL2, - gVecH1); - else - rForceMult(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->dofToQuad, - h1D2Q->quadToDof, - h1D2Q->quadToDofD, - quad_data->stressJinvT, - gVecL2, - gVecH1); - h1fes.LocalToGlobal(gVecH1, vecH1); -} - -// ************************************************************************* -void HipForceOperator::MultTranspose(const HipVector &vecH1, - HipVector &vecL2) const -{ - h1fes.GlobalToLocal(vecH1, gVecH1); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - if (rconfig::Get().Share()) - rForceMultTransposeS(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->quadToDof, - h1D2Q->dofToQuad, - 
h1D2Q->dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - else - rForceMultTranspose(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->quadToDof, - h1D2Q->dofToQuad, - h1D2Q->dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - l2fes.LocalToGlobal(gVecL2, vecL2); -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/hip/laghos_assembly.hpp b/hip/laghos_assembly.hpp deleted file mode 100644 index c723afee..00000000 --- a/hip/laghos_assembly.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_ASSEMBLY -#define MFEM_LAGHOS_ASSEMBLY - -#include "mfem.hpp" -#include "hip/hip.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -// Container for all data needed at quadrature points. -struct QuadratureData -{ - // TODO: use QuadratureFunctions? - - // Reference to physical Jacobian for the initial mesh. 
These are computed - // only at time zero and stored here. - HipVector Jac0inv; - - // Quadrature data used for full/partial assembly of the force operator. At - // each quadrature point, it combines the stress, inverse Jacobian, - // determinant of the Jacobian and the integration weight. It must be - // recomputed in every time step. - HipVector stressJinvT; - HipDofQuadMaps *dqMaps; - HipGeometry *geom; - - // Quadrature data used for full/partial assembly of the mass matrices. At - // time zero, we compute and store (rho0 * det(J0) * qp_weight) at each - // quadrature point. Note the at any other time, we can compute - // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass - // conservation. - HipVector rho0DetJ0w; - - - // Initial length scale. This represents a notion of local mesh size. We - // assume that all initial zones have similar size. - double h0; - - // Estimate of the minimum time step over all quadrature points. This is - // recomputed at every time step to achieve adaptive time stepping. - double dt_est; - HipVector dtEst; - - QuadratureData(int dim, int nzones, int quads_per_zone); - - void Setup(int dim, int nzones, int quads_per_zone); -}; - -// This class is used only for visualization. It assembles (rho, phi) in each -// zone, which is used by LagrangianHydroOperator::ComputeDensity to do an L2 -// projection of the density. 
-class DensityIntegrator : public LinearFormIntegrator -{ -private: - const QuadratureData &quad_data; - const IntegrationRule &integ_rule; -public: - DensityIntegrator(const QuadratureData &qd, - const IntegrationRule &ir) : quad_data(qd), - integ_rule(ir) {} - - void AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect); - - void AssembleRHSElementVect(const FiniteElement &el, - FaceElementTransformations &Tr, - Vector &elvect) {assert(false);} - -}; - -// ***************************************************************************** -// * HipMassOperator -// ***************************************************************************** -class HipMassOperator : public HipOperator -{ -private: - int dim; - int nzones; - HipFiniteElementSpace &fes; - const IntegrationRule &integ_rule; - unsigned int ess_tdofs_count; - HipArray ess_tdofs; - HipBilinearForm bilinearForm; - HipOperator *massOperator; - QuadratureData *quad_data; - // For distributing X - mutable HipVector distX; - mutable HipGridFunction x_gf, y_gf; -public: - HipMassOperator(HipFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_); - ~HipMassOperator(); - void Setup(); - void SetEssentialTrueDofs(Array &dofs); - // Can be used for both velocity and specific internal energy. For the case - // of velocity, we only work with one component at a time. - void Mult(const HipVector &x, HipVector &y) const; - void EliminateRHS(HipVector &b); - void ComputeDiagonal2D(Vector &diag) const; - void ComputeDiagonal3D(Vector &diag) const; -}; - -// Performs partial assembly, which corresponds to (and replaces) the use of the -// LagrangianHydroOperator::Force global matrix. 
-class HipForceOperator : public HipOperator -{ -private: - const int dim; - const int nzones; - const HipFiniteElementSpace &h1fes, &l2fes; - const IntegrationRule &integ_rule; - const QuadratureData *quad_data; - const HipDofQuadMaps *l2D2Q, *h1D2Q; - mutable HipVector gVecL2, gVecH1; -public: - HipForceOperator(HipFiniteElementSpace &h1fes_, - HipFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule, - const QuadratureData *quad_data_); - void Setup(); - void Mult(const HipVector &vecL2, HipVector &vecH1) const; - void MultTranspose(const HipVector &vecH1, HipVector &vecL2) const; - ~HipForceOperator(); -}; - -// Scales by the inverse diagonal of the MassPAOperator. -class DiagonalSolver : public Solver -{ -private: - Vector diag; - FiniteElementSpace &FESpace; -public: - DiagonalSolver(FiniteElementSpace &fes): Solver(fes.GetVSize()), - diag(), - FESpace(fes) { } - - void SetDiagonal(Vector &d) - { - const Operator *P = FESpace.GetProlongationMatrix(); - diag.SetSize(P->Width()); - P->MultTranspose(d, diag); - } - - virtual void Mult(const Vector &x, Vector &y) const - { - for (int i = 0; i < x.Size(); i++) { y(i) = x(i) / diag(i); } - } - virtual void SetOperator(const Operator &op) { } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS_ASSEMBLY diff --git a/hip/laghos_solver.cpp b/hip/laghos_solver.cpp deleted file mode 100644 index 68c6d3f2..00000000 --- a/hip/laghos_solver.cpp +++ /dev/null @@ -1,453 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_solver.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x, int y, int w, int h, bool vec) -{ - ParMesh &pmesh = *gf.ParFESpace()->GetParMesh(); - MPI_Comm comm = pmesh.GetComm(); - - int num_procs, myid; - MPI_Comm_size(comm, &num_procs); - MPI_Comm_rank(comm, &myid); - - bool newly_opened = false; - int connection_failed; - - do - { - if (myid == 0) - { - if (!sock.is_open() || !sock) - { - sock.open(vishost, visport); - sock.precision(8); - newly_opened = true; - } - sock << "solution\n"; - } - - pmesh.PrintAsOne(sock); - gf.SaveAsOne(sock); - - if (myid == 0 && newly_opened) - { - sock << "window_title '" << title << "'\n" - << "window_geometry " - << x << " " << y << " " << w << " " << h << "\n" - << "keys maaAcl"; - if ( vec ) { sock << "vvv"; } - sock << endl; - } - - if (myid == 0) - { - connection_failed = !sock && !newly_opened; - } - MPI_Bcast(&connection_failed, 1, MPI_INT, 0, comm); - } - while (connection_failed); -} - -// *************************************************************************** -// * LagrangianHydroOperator -// *************************************************************************** -LagrangianHydroOperator::LagrangianHydroOperator(int size, - HipFiniteElementSpace &h1_fes, - 
HipFiniteElementSpace &l2_fes, - Array &essential_tdofs, - HipGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, - bool visc, bool pa, - double cgt, int cgiter) - : HipTimeDependentOperator(size), - H1FESpace(h1_fes), L2FESpace(l2_fes), - H1compFESpace(h1_fes.GetParMesh(), h1_fes.FEColl(),1), - ess_tdofs(essential_tdofs), - dim(h1_fes.GetMesh()->Dimension()), - nzones(h1_fes.GetMesh()->GetNE()), - l2dofs_cnt(l2_fes.GetFE(0)->GetDof()), - h1dofs_cnt(h1_fes.GetFE(0)->GetDof()), - source_type(source_type_), cfl(cfl_), - use_viscosity(visc), p_assembly(pa), cg_rel_tol(cgt), cg_max_iter(cgiter), - material_pcf(material_), - integ_rule(IntRules.Get(h1_fes.GetMesh()->GetElementBaseGeometry(0), - 3*h1_fes.GetOrder(0) + l2_fes.GetOrder(0) - 1)), - quad_data(dim, nzones, integ_rule.GetNPoints()), - quad_data_is_current(false), - VMassPA(H1compFESpace, integ_rule, &quad_data), - EMassPA(L2FESpace, integ_rule, &quad_data), - VMassPA_prec(H1FESpace), - ForcePA(H1FESpace, L2FESpace, integ_rule, &quad_data), - CG_VMass(H1FESpace.GetParMesh()->GetComm()), - CG_EMass(L2FESpace.GetParMesh()->GetComm()), - timer(), - v(),e(), - rhs(H1FESpace.GetVSize()), - B(H1compFESpace.GetTrueVSize()),X(H1compFESpace.GetTrueVSize()), - one(L2FESpace.GetVSize(),1.0), - e_rhs(L2FESpace.GetVSize()), - rhs_c(H1compFESpace.GetVSize()), - v_local(H1FESpace.GetVDim() * H1FESpace.GetLocalDofs()*nzones), - e_quad() -{ - // Initial local mesh size (assumes similar cells). 
- double loc_area = 0.0, glob_area; - int loc_z_cnt = nzones, glob_z_cnt; - ParMesh *pm = H1FESpace.GetParMesh(); - for (int i = 0; i < nzones; i++) { loc_area += pm->GetElementVolume(i); } - MPI_Allreduce(&loc_area, &glob_area, 1, MPI_DOUBLE, MPI_SUM, pm->GetComm()); - MPI_Allreduce(&loc_z_cnt, &glob_z_cnt, 1, MPI_INT, MPI_SUM, pm->GetComm()); - switch (pm->GetElementBaseGeometry(0)) - { - case Geometry::SEGMENT: - quad_data.h0 = glob_area / glob_z_cnt; break; - case Geometry::SQUARE: - quad_data.h0 = sqrt(glob_area / glob_z_cnt); break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * glob_area / glob_z_cnt); break; - case Geometry::CUBE: - quad_data.h0 = pow(glob_area / glob_z_cnt, 1.0/3.0); break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * glob_area / glob_z_cnt, 1.0/3.0); break; - default: MFEM_ABORT("Unknown zone type!"); - } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); - - quad_data.dqMaps = HipDofQuadMaps::Get(H1FESpace,integ_rule); - quad_data.geom = HipGeometry::Get(H1FESpace,integ_rule); - quad_data.Jac0inv = quad_data.geom->invJ; - - HipVector rhoValues; // used in rInitQuadratureData - rho0.ToQuad(integ_rule, rhoValues); - - if (dim==1) { assert(false); } - const int NUM_QUAD = integ_rule.GetNPoints(); - - rInitQuadratureData(NUM_QUAD, - nzones, - rhoValues, - quad_data.geom->detJ, - quad_data.dqMaps->quadWeights, - quad_data.rho0DetJ0w); - - // Needs quad_data.rho0DetJ0w - ForcePA.Setup(); - VMassPA.Setup(); - EMassPA.Setup(); - - { - // Setup the preconditioner of the velocity mass operator. - //Vector d; - //#warning ComputeDiagonal - //(dim == 2) ? 
VMassPA.ComputeDiagonal2D(d) : VMassPA.ComputeDiagonal3D(d); - //VMassPA_prec.SetDiagonal(d); - } - - CG_VMass.SetOperator(VMassPA); - CG_VMass.SetRelTol(cg_rel_tol); - CG_VMass.SetAbsTol(0.0); - CG_VMass.SetMaxIter(cg_max_iter); - CG_VMass.SetPrintLevel(-1); - - CG_EMass.SetOperator(EMassPA); - CG_EMass.iterative_mode = false; - CG_EMass.SetRelTol(1e-8); - CG_EMass.SetAbsTol(1e-8 * numeric_limits::epsilon()); - CG_EMass.SetMaxIter(200); - CG_EMass.SetPrintLevel(-1); -} - -// ***************************************************************************** -LagrangianHydroOperator::~LagrangianHydroOperator() {} - -// ***************************************************************************** -void LagrangianHydroOperator::Mult(const HipVector &S, HipVector &dS_dt) const -{ - dS_dt = 0.0; - - // Make sure that the mesh positions correspond to the ones in S. This is - // needed only because some mfem time integrators don't update the solution - // vector at every intermediate stage (hence they don't change the mesh). - Vector h_x = HipVector(S.GetRange(0, H1FESpace.GetVSize())); - ParGridFunction x(&H1FESpace, h_x.GetData()); - H1FESpace.GetParMesh()->NewNodes(x, false); - - UpdateQuadratureData(S); - - // The monolithic BlockVector stores the unknown fields as follows: - // - Position - // - Velocity - // - Specific Internal Energy - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - - v = S.GetRange(VsizeH1, VsizeH1); - e = S.GetRange(2*VsizeH1, VsizeL2); - - HipVector dx = dS_dt.GetRange(0, VsizeH1); - HipVector dv = dS_dt.GetRange(VsizeH1, VsizeH1); - HipVector de = dS_dt.GetRange(2*VsizeH1, VsizeL2); - - // Set dx_dt = v (explicit) - dx = v; - - // Solve for velocity. - timer.sw_force.Start(); - ForcePA.Mult(one, rhs); - timer.sw_force.Stop(); - rhs.Neg(); - - // Partial assembly solve for each velocity component. 
- const int size = H1compFESpace.GetVSize(); - - for (int c = 0; c < dim; c++) - { - rhs_c = rhs.GetRange(c*size, size); - HipVector dv_c = dv.GetRange(c*size, size); - Array c_tdofs; - Array ess_bdr(H1FESpace.GetMesh()->bdr_attributes.Max()); - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., - // we must enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[c] = 1; - // Essential true dofs as if there's only one component. - H1compFESpace.GetEssentialTrueDofs(ess_bdr, c_tdofs); - - dv_c = 0.0; - - H1compFESpace.GetProlongationOperator()->MultTranspose(rhs_c, B); - H1compFESpace.GetRestrictionOperator()->Mult(dv_c, X); - - VMassPA.SetEssentialTrueDofs(c_tdofs); - VMassPA.EliminateRHS(B); - - timer.sw_cgH1.Start(); - CG_VMass.Mult(B, X); - timer.sw_cgH1.Stop(); - timer.H1cg_iter += CG_VMass.GetNumIterations(); - //printf("\n[H1cg_iter] %d",timer.H1cg_iter); - H1compFESpace.GetProlongationOperator()->Mult(X, dv_c); - } - - - // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; - if (source_type == 1) // 2D Taylor-Green. 
- { - e_source = new LinearForm(&L2FESpace); - assert(L2FESpace.FEColl()); - TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); - e_source->AddDomainIntegrator(d); - e_source->Assemble(); - } - Array l2dofs; - { - timer.sw_force.Start(); - ForcePA.MultTranspose(v, e_rhs); - timer.sw_force.Stop(); - } - - if (e_source) { e_rhs += *e_source; } - - { - timer.sw_cgL2.Start(); - CG_EMass.Mult(e_rhs, de); - timer.sw_cgL2.Stop(); - timer.L2cg_iter += CG_EMass.GetNumIterations(); - } - delete e_source; - quad_data_is_current = false; -} - -double LagrangianHydroOperator::GetTimeStepEstimate(const HipVector &S) const -{ - UpdateQuadratureData(S); - double glob_dt_est; - MPI_Allreduce(&quad_data.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, - H1FESpace.GetParMesh()->GetComm()); - return glob_dt_est; -} - -void LagrangianHydroOperator::ResetTimeStepEstimate() const -{ - quad_data.dt_est = numeric_limits::infinity(); -} - -void LagrangianHydroOperator::ComputeDensity(ParGridFunction &rho) -{ - rho.SetSpace(&L2FESpace); - DenseMatrix Mrho(l2dofs_cnt); - Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); - Array dofs(l2dofs_cnt); - DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data,integ_rule); - for (int i = 0; i < nzones; i++) - { - di.AssembleRHSElementVect(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), Mrho); - inv.Factor(); - inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(i, dofs); - rho.SetSubVector(dofs, rho_z); - } -} - -void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps) -{ - double my_rt[5], rt_max[5]; - my_rt[0] = timer.sw_cgH1.RealTime(); - my_rt[1] = timer.sw_cgL2.RealTime(); - my_rt[2] = timer.sw_force.RealTime(); - my_rt[3] = timer.sw_qdata.RealTime(); - my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3]; - MPI_Reduce(my_rt, rt_max, 5, MPI_DOUBLE, MPI_MAX, 0, 
H1FESpace.GetComm()); - - HYPRE_Int mydata[2], alldata[2]; - mydata[0] = timer.L2cg_iter; - mydata[1] = timer.quad_tstep; - MPI_Reduce(mydata, alldata, 2, HYPRE_MPI_INT, MPI_SUM, 0, H1FESpace.GetComm()); - - if (IamRoot) - { - const HYPRE_Int H1gsize = H1FESpace.GlobalTrueVSize(), - L2gsize = L2FESpace.GlobalTrueVSize(); - using namespace std; - cout << endl; - cout << "CG (H1) total time: " << rt_max[0] << endl; - cout << "CG (H1) rate (megadofs="<GetOrder()+1; - - ElementTransformation *T = H1FESpace.GetElementTransformation(0); - const IntegrationPoint &ip = integ_rule.IntPoint(0); - const double gamma = material_pcf->Eval(*T, ip); - if (rconfig::Get().Share()) - rUpdateQuadratureDataS(gamma, - quad_data.h0, - cfl, - use_viscosity, - dim, - NUM_QUAD, - NUM_QUAD_1D, - NUM_DOFS_1D, - nzones, - quad_data.dqMaps->dofToQuad, - quad_data.dqMaps->dofToQuadD, - quad_data.dqMaps->quadWeights, - v_local, - e_quad, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom->J, - quad_data.geom->invJ, - quad_data.geom->detJ, - quad_data.stressJinvT, - quad_data.dtEst); - else - rUpdateQuadratureData(gamma, - quad_data.h0, - cfl, - use_viscosity, - dim, - NUM_QUAD, - NUM_QUAD_1D, - NUM_DOFS_1D, - nzones, - quad_data.dqMaps->dofToQuad, - quad_data.dqMaps->dofToQuadD, - quad_data.dqMaps->quadWeights, - v_local, - e_quad, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom->J, - quad_data.geom->invJ, - quad_data.geom->detJ, - quad_data.stressJinvT, - quad_data.dtEst); - - quad_data.dt_est = quad_data.dtEst.Min(); - quad_data_is_current = true; - timer.sw_qdata.Stop(); - timer.quad_tstep += nzones; -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/hip/laghos_solver.hpp b/hip/laghos_solver.hpp deleted file mode 100644 index 6054c0c8..00000000 --- a/hip/laghos_solver.hpp +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_SOLVER -#define MFEM_LAGHOS_SOLVER - -#include "mfem.hpp" -#include "hip/hip.hpp" - -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -/// Visualize the given parallel grid function, using a GLVis server on the -/// specified host and port. Set the visualization window title, and optionally, -/// its geometry. -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x = 0, int y = 0, int w = 400, int h = 400, - bool vec = false); - - -// These are defined in laghos.cpp -double rho0(const Vector &); -void v0(const Vector &, Vector &); -double e0(const Vector &); -double gamma(const Vector &); - -struct TimingData -{ - // Total times for all major computations: - // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. 
- StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; - - // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solves. - // #dofs * #(CG iterations) for the L2 CG solve. - // #quads * #(RK sub steps) for the quadrature data computations. - int H1cg_iter, L2cg_iter, quad_tstep; - - TimingData() - : H1cg_iter(0), L2cg_iter(0), quad_tstep(0) {} -}; - -// Given a solutions state (x, v, e), this class performs all necessary -// computations to evaluate the new slopes (dx_dt, dv_dt, de_dt). -class LagrangianHydroOperator : public HipTimeDependentOperator -{ -protected: - HipFiniteElementSpace &H1FESpace; - HipFiniteElementSpace &L2FESpace; - mutable HipFiniteElementSpace H1compFESpace; - - Array &ess_tdofs; - - const int dim, nzones, l2dofs_cnt, h1dofs_cnt, source_type; - const double cfl; - const bool use_viscosity, p_assembly; - const double cg_rel_tol; - const int cg_max_iter; - Coefficient *material_pcf; - - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current; - - // Force matrix that combines the kinematic and thermodynamic spaces. It is - // assembled in each time step and then it's used to compute the final - // right-hand sides for momentum and specific internal energy. - mutable HipMassOperator VMassPA, EMassPA; - mutable DiagonalSolver VMassPA_prec; - mutable HipForceOperator ForcePA; - - // Linear solver for energy. 
- //HipCGSolver locCG; - HipCGSolver CG_VMass,CG_EMass; - - mutable TimingData timer; - - // Device vectors we want to keep - mutable HipVector v,e,rhs,B,X; - const HipVector one; - mutable HipVector e_rhs; - mutable HipVector rhs_c; - mutable HipVector v_local,e_quad; - - virtual void ComputeMaterialProperties(int nvalues, const double gamma[], - const double rho[], const double e[], - double p[], double cs[]) const - { - for (int v = 0; v < nvalues; v++) - { - p[v] = (gamma[v] - 1.0) * rho[v] * e[v]; - cs[v] = sqrt(gamma[v] * (gamma[v]-1.0) * e[v]); - } - } - - void UpdateQuadratureData(const HipVector &S) const; - -public: - LagrangianHydroOperator(int size, HipFiniteElementSpace &h1_fes, - HipFiniteElementSpace &l2_fes, - Array &essential_tdofs, HipGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, bool visc, bool pa, - double cgt, int cgiter); - - // Solve for dx_dt, dv_dt and de_dt. - virtual void Mult(const HipVector &S, HipVector &dS_dt) const; - - // Calls UpdateQuadratureData to compute the new quad_data.dt_est. - double GetTimeStepEstimate(const HipVector &S) const; - void ResetTimeStepEstimate() const; - void ResetQuadratureData() const { quad_data_is_current = false; } - - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. 
- void ComputeDensity(ParGridFunction &rho); - - void PrintTimingData(bool IamRoot, int steps); - - ~LagrangianHydroOperator(); -}; - -class TaylorCoefficient : public Coefficient -{ - virtual double Eval(ElementTransformation &T, - const IntegrationPoint &ip) - { - Vector x(2); - T.Transform(ip, x); - return 3.0 / 8.0 * M_PI * ( cos(3.0*M_PI*x(0)) * cos(M_PI*x(1)) - - cos(M_PI*x(0)) * cos(3.0*M_PI*x(1)) ); - } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS diff --git a/hip/makefile b/hip/makefile deleted file mode 100644 index 6bf600ff..00000000 --- a/hip/makefile +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -# reserved. See files LICENSE and NOTICE for details. -# -# This file is part of CEED, a collection of benchmarks, miniapps, software -# libraries and APIs for efficient high-order finite element and spectral -# element discretizations for exascale applications. For more information and -# source code availability see http://github.com/ceed. -# -# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -# a collaborative effort of two U.S. Department of Energy organizations (Office -# of Science and the National Nuclear Security Administration) responsible for -# the planning and preparation of a capable exascale ecosystem, including -# software, applications, hardware, advanced system engineering and early -# testbed platforms, in support of the nation's exascale computing imperative. 
- -AMD_ARCH ?= --amdgpu-target=gfx906 -ROCM_DIR ?= /opt/rocm - -MPI_HOME ?= /usr/lib/openmpi -MPI_INCFLAGS = -I$(MPI_HOME)/include - -MPI_LIBS := $(shell mpicxx --showme:link) - -HIPCC_CXXFLAGS = -std=c++11 -Wall $(AMD_ARCH) -I$(ROCM_DIR)/include $(shell $(ROCM_DIR)/bin/hipconfig --cpp_config) -HIPCC_LIBS = -L$(ROCM_DIR)/lib/ -lhip_hcc $(MPI_INCFLAGS) $(MPI_LIBS) - - -define LAGHOS_HELP_MSG - -Laghos makefile targets: - - make - make status/info - make install - make clean - make distclean - make style - -Examples: - -make -j 4 - Build Laghos using the current configuration options from MFEM. - (Laghos requires the MFEM finite element library, and uses its compiler and - linker options in its build process.) -make status - Display information about the current configuration. -make install PREFIX= - Install the Laghos executable in . -make clean - Clean the Laghos executable, library and object files. -make distclean - In addition to "make clean", remove the local installation directory and some - run-time generated files. -make style - Format the Laghos C++ source files using the Artistic Style (astyle) settings - from MFEM. - -endef - -# Default installation location -PREFIX = ./bin -INSTALL = /usr/bin/install - -# Use the MFEM build directory -MFEM_DIR = ../../mfem -CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) - -# Use the compiler used by MFEM. Get the compiler and the options for compiling -# and linking from MFEM's config.mk. (Skip this if the target does not require -# building.) 
-MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) - -include $(CONFIG_MK) -endif - -CXX = $(ROCM_DIR)/bin/hipcc -CPPFLAGS = $(MFEM_CPPFLAGS) -CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - -LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) \ - $(MPI_INCFLAGS) $(HIPCC_CXXFLAGS) $(HIPCC_INCFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) $(HIPCC_LIBS) - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif - -LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) - -SOURCE_FILES = laghos.cpp laghos_assembly.cpp laghos_solver.cpp \ - hip/linalg/solvers.cpp \ - hip/linalg/vector.cpp \ - hip/kernels/share/qDataUpdateS.cpp \ - hip/kernels/share/gridFuncToQuadS.cpp \ - hip/kernels/share/forceS.cpp \ - hip/kernels/share/massMultAddS.cpp \ - hip/kernels/share/massAssembleS.cpp \ - hip/kernels/force/force.cpp \ - hip/kernels/geom/initGeom.cpp \ - hip/kernels/quad/gridFuncToQuad.cpp \ - hip/kernels/quad/qDataUpdate.cpp \ - hip/kernels/quad/qDataInit.cpp \ - hip/kernels/maps/globalToLocal.cpp \ - hip/kernels/maps/mapping.cpp \ - hip/kernels/maps/localToGlobal.cpp \ - hip/kernels/mass/multAdd.cpp \ - hip/kernels/mass/assemble.cpp \ - hip/kernels/blas/vector_map_dofs.cpp \ - hip/kernels/blas/vector_vec_sub.cpp \ - hip/kernels/blas/vector_dot.cpp \ - hip/kernels/blas/vector_clear_dofs.cpp \ - hip/kernels/blas/vector_xsy.cpp \ - hip/kernels/blas/vector_xpay.cpp \ - hip/kernels/blas/vector_axpy.cpp \ - hip/kernels/blas/vector_op_eq.cpp \ - hip/kernels/blas/vector_get_subvector.cpp \ - hip/kernels/blas/vector_vec_add.cpp \ - hip/kernels/blas/vector_min.cpp \ - 
hip/kernels/blas/vector_set_subvector.cpp \ - hip/kernels/blas/vector_set_subvector_const.cpp \ - hip/kernels/blas/vector_vec_mul.cpp \ - hip/kernels/blas/vector_neg.cpp \ - hip/fem/bilinearform.cpp \ - hip/fem/hipGridfunc.cpp \ - hip/fem/restrict.cpp \ - hip/fem/fespace.cpp \ - hip/fem/bilininteg.cpp \ - hip/fem/conform.cpp \ - hip/fem/prolong.cpp \ - hip/general/memcpy.cpp \ - hip/general/commd.cpp \ - hip/general/table.cpp \ - hip/config/config.cpp - -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = laghos_solver.hpp laghos_assembly.hpp \ - hip/linalg/solvers.hpp \ - hip/linalg/operator.hpp \ - hip/linalg/ode.hpp \ - hip/linalg/vector.hpp \ - hip/kernels/hip.hpp \ - hip/kernels/include/forall.hpp \ - hip/kernels/include/kernels.hpp \ - hip/kernels/include/offsets.hpp \ - hip/fem/prolong.hpp \ - hip/fem/hipGridfunc.hpp \ - hip/fem/conform.hpp \ - hip/fem/restrict.hpp \ - hip/fem/bilininteg.hpp \ - hip/fem/fespace.hpp \ - hip/fem/bilinearform.hpp \ - hip/general/commd.hpp \ - hip/general/table.hpp \ - hip/general/malloc.hpp \ - hip/general/memcpy.hpp \ - hip/general/array.hpp \ - hip/config/config.hpp \ - hip/hip.hpp \ - -# Targets - -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec - -.SUFFIXES: .c .cpp .o -.cpp.o: - cd $( 1D Sod shock tube. // p = 3 --> Triple point. // p = 4 --> Gresho vortex (smooth problem). +// p = 5 --> 2D Riemann problem, config. 12 of doi.org/10.1002/num.10025 +// p = 6 --> 2D Riemann problem, config. 6 of doi.org/10.1002/num.10025 // // Sample runs: see README.md, section 'Verification of Results'. // @@ -56,39 +58,47 @@ // -m data/cube_522_hex.mesh -pt 521 for 10 / 80 / 640 / 5120 ... tasks. // -m data/cube_12_hex.mesh -pt 322 for 12 / 96 / 768 / 6144 ... tasks. 
-#include "laghos_solver.hpp" -#include "laghos_timeinteg.hpp" #include +#include +#include +#include "laghos_solver.hpp" -using namespace std; +using std::cout; +using std::endl; using namespace mfem; -using namespace mfem::hydrodynamics; // Choice for the problem setup. -int problem; +static int problem; -double rho0(const Vector &); -void v0(const Vector &, Vector &); +// Forward declarations. double e0(const Vector &); +double rho0(const Vector &); double gamma(const Vector &); -void display_banner(ostream & os); +void v0(const Vector &, Vector &); + +static long GetMaxRssMB(); +static void display_banner(std::ostream&); +static void Checks(const int dim, const int ti, const double norm, int &checks); int main(int argc, char *argv[]) { // Initialize MPI. MPI_Session mpi(argc, argv); - int myid = mpi.WorldRank(); + const int myid = mpi.WorldRank(); // Print the banner. if (mpi.Root()) { display_banner(cout); } // Parse command-line options. problem = 1; - const char *mesh_file = "data/cube01_hex.mesh"; + int dim = 3; + const char *mesh_file = "default"; int rs_levels = 2; int rp_levels = 0; + Array cxyz; int order_v = 2; int order_e = 1; + int order_q = -1; int ode_solver_type = 4; double t_final = 0.6; double cfl = 0.5; @@ -104,21 +114,31 @@ int main(int argc, char *argv[]) bool gfprint = false; const char *basename = "results/Laghos"; int partition_type = 0; + const char *device = "cpu"; + bool check = false; + bool mem_usage = false; + bool fom = false; + bool gpu_aware_mpi = false; + int dev = 0; double blast_energy = 0.25; double blast_position[] = {0.0, 0.0, 0.0}; OptionsParser args(argc, argv); - args.AddOption(&mesh_file, "-m", "--mesh", - "Mesh file to use."); + args.AddOption(&dim, "-dim", "--dimension", "Dimension of the problem."); + args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); args.AddOption(&rs_levels, "-rs", "--refine-serial", "Number of times to refine the mesh uniformly in serial."); args.AddOption(&rp_levels, "-rp", 
"--refine-parallel", "Number of times to refine the mesh uniformly in parallel."); + args.AddOption(&cxyz, "-c", "--cartesian-partitioning", + "Use Cartesian partitioning."); args.AddOption(&problem, "-p", "--problem", "Problem setup to use."); args.AddOption(&order_v, "-ok", "--order-kinematic", "Order (degree) of the kinematic finite element space."); args.AddOption(&order_e, "-ot", "--order-thermo", "Order (degree) of the thermodynamic finite element space."); + args.AddOption(&order_q, "-oq", "--order-intrule", + "Order of the integration rule."); args.AddOption(&ode_solver_type, "-s", "--ode-solver", "ODE solver: 1 - Forward Euler,\n\t" " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6,\n\t" @@ -160,6 +180,17 @@ int main(int argc, char *argv[]) "of zones in each direction, e.g., the number of zones in direction x\n\t" "must be divisible by the number of MPI tasks in direction x.\n\t" "Available options: 11, 21, 111, 211, 221, 311, 321, 322, 432."); + args.AddOption(&device, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.AddOption(&check, "-chk", "--checks", "-no-chk", "--no-checks", + "Enable 2D checks."); + args.AddOption(&mem_usage, "-mb", "--mem", "-no-mem", "--no-mem", + "Enable memory usage."); + args.AddOption(&fom, "-f", "--fom", "-no-fom", "--no-fom", + "Enable figure of merit output."); + args.AddOption(&gpu_aware_mpi, "-gam", "--gpu-aware-mpi", "-no-gam", + "--no-gpu-aware-mpi", "Enable GPU aware MPI communications."); + args.AddOption(&dev, "-dev", "--dev", "GPU device to use."); args.Parse(); if (!args.Good()) { @@ -168,12 +199,53 @@ int main(int argc, char *argv[]) } if (mpi.Root()) { args.PrintOptions(cout); } - // Read the serial mesh from the given mesh file on all processors. - // Refine the mesh in serial to increase the resolution. 
- Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } + // Configure the device from the command line options + Device backend; + backend.Configure(device, dev); + if (mpi.Root()) { backend.Print(); } + backend.SetGPUAwareMPI(gpu_aware_mpi); + // On all processors, use the default builtin 1D/2D/3D mesh or read the + // serial one given on the command line. + Mesh *mesh; + if (strncmp(mesh_file, "default", 7) != 0) + { + mesh = new Mesh(mesh_file, true, true); + } + else + { + if (dim == 1) + { + mesh = new Mesh(2); + mesh->GetBdrElement(0)->SetAttribute(1); + mesh->GetBdrElement(1)->SetAttribute(1); + } + if (dim == 2) + { + mesh = new Mesh(2, 2, Element::QUADRILATERAL, true); + const int NBE = mesh->GetNBE(); + for (int b = 0; b < NBE; b++) + { + Element *bel = mesh->GetBdrElement(b); + const int attr = (b < NBE/2) ? 2 : 1; + bel->SetAttribute(attr); + } + } + if (dim == 3) + { + mesh = new Mesh(2, 2, 2, Element::HEXAHEDRON, true); + const int NBE = mesh->GetNBE(); + for (int b = 0; b < NBE; b++) + { + Element *bel = mesh->GetBdrElement(b); + const int attr = (b < NBE/3) ? 3 : (b < 2*NBE/3) ? 1 : 2; + bel->SetAttribute(attr); + } + } + } + dim = mesh->Dimension(); + + // 1D vs partial assembly sanity check. if (p_assembly && dim == 1) { p_assembly = false; @@ -183,9 +255,17 @@ int main(int argc, char *argv[]) } } + // Refine the mesh in serial to increase the resolution. + for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } + const int mesh_NE = mesh->GetNE(); + if (mpi.Root()) + { + cout << "Number of zones in the serial mesh: " << mesh_NE << endl; + } + // Parallel partitioning of the mesh. 
- ParMesh *pmesh = NULL; - const int num_tasks = mpi.WorldSize(); int unit; + ParMesh *pmesh = nullptr; + const int num_tasks = mpi.WorldSize(); int unit = 1; int *nxyz = new int[dim]; switch (partition_type) { @@ -194,59 +274,75 @@ int main(int argc, char *argv[]) break; case 11: case 111: - unit = floor(pow(num_tasks, 1.0 / dim) + 1e-2); + unit = static_cast(floor(pow(num_tasks, 1.0 / dim) + 1e-2)); for (int d = 0; d < dim; d++) { nxyz[d] = unit; } break; case 21: // 2D - unit = floor(pow(num_tasks / 2, 1.0 / 2) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 2, 1.0 / 2) + 1e-2)); nxyz[0] = 2 * unit; nxyz[1] = unit; break; + case 31: // 2D + unit = static_cast(floor(pow(num_tasks / 3, 1.0 / 2) + 1e-2)); + nxyz[0] = 3 * unit; nxyz[1] = unit; + break; + case 32: // 2D + unit = static_cast(floor(pow(2 * num_tasks / 3, 1.0 / 2) + 1e-2)); + nxyz[0] = 3 * unit / 2; nxyz[1] = unit; + break; + case 49: // 2D + unit = static_cast(floor(pow(9 * num_tasks / 4, 1.0 / 2) + 1e-2)); + nxyz[0] = 4 * unit / 9; nxyz[1] = unit; + break; + case 51: // 2D + unit = static_cast(floor(pow(num_tasks / 5, 1.0 / 2) + 1e-2)); + nxyz[0] = 5 * unit; nxyz[1] = unit; + break; case 211: // 3D. - unit = floor(pow(num_tasks / 2, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 2, 1.0 / 3) + 1e-2)); nxyz[0] = 2 * unit; nxyz[1] = unit; nxyz[2] = unit; break; case 221: // 3D. - unit = floor(pow(num_tasks / 4, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 4, 1.0 / 3) + 1e-2)); nxyz[0] = 2 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; break; case 311: // 3D. - unit = floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2)); nxyz[0] = 3 * unit; nxyz[1] = unit; nxyz[2] = unit; break; case 321: // 3D. - unit = floor(pow(num_tasks / 6, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 6, 1.0 / 3) + 1e-2)); nxyz[0] = 3 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; break; case 322: // 3D. 
- unit = floor(pow(2 * num_tasks / 3, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(2 * num_tasks / 3, 1.0 / 3) + 1e-2)); nxyz[0] = 3 * unit / 2; nxyz[1] = unit; nxyz[2] = unit; break; case 432: // 3D. - unit = floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2)); nxyz[0] = 2 * unit; nxyz[1] = 3 * unit / 2; nxyz[2] = unit; break; case 511: // 3D. - unit = floor(pow(num_tasks / 5, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 5, 1.0 / 3) + 1e-2)); nxyz[0] = 5 * unit; nxyz[1] = unit; nxyz[2] = unit; break; case 521: // 3D. - unit = floor(pow(num_tasks / 10, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 10, 1.0 / 3) + 1e-2)); nxyz[0] = 5 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; break; case 522: // 3D. - unit = floor(pow(num_tasks / 20, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 20, 1.0 / 3) + 1e-2)); nxyz[0] = 5 * unit; nxyz[1] = 2 * unit; nxyz[2] = 2 * unit; break; case 911: // 3D. - unit = floor(pow(num_tasks / 9, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 9, 1.0 / 3) + 1e-2)); nxyz[0] = 9 * unit; nxyz[1] = unit; nxyz[2] = unit; break; case 921: // 3D. - unit = floor(pow(num_tasks / 18, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 18, 1.0 / 3) + 1e-2)); nxyz[0] = 9 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; break; case 922: // 3D. 
- unit = floor(pow(num_tasks / 36, 1.0 / 3) + 1e-2); + unit = static_cast(floor(pow(num_tasks / 36, 1.0 / 3) + 1e-2)); nxyz[0] = 9 * unit; nxyz[1] = 2 * unit; nxyz[2] = 2 * unit; break; default: @@ -260,9 +356,22 @@ int main(int argc, char *argv[]) } int product = 1; for (int d = 0; d < dim; d++) { product *= nxyz[d]; } - if (product == num_tasks) + const bool cartesian_partitioning = (cxyz.Size()>0)?true:false; + if (product == num_tasks || cartesian_partitioning) { - int *partitioning = mesh->CartesianPartitioning(nxyz); + if (cartesian_partitioning) + { + int cproduct = 1; + for (int d = 0; d < dim; d++) { cproduct *= cxyz[d]; } + MFEM_VERIFY(!cartesian_partitioning || cxyz.Size() == dim, + "Expected " << mesh->SpaceDimension() << " integers with the " + "option --cartesian-partitioning."); + MFEM_VERIFY(!cartesian_partitioning || num_tasks == cproduct, + "Expected cartesian partitioning product to match number of ranks."); + } + int *partitioning = cartesian_partitioning ? + mesh->CartesianPartitioning(cxyz): + mesh->CartesianPartitioning(nxyz); pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); delete [] partitioning; } @@ -287,12 +396,11 @@ int main(int argc, char *argv[]) // Refine the mesh further in parallel to increase the resolution. for (int lev = 0; lev < rp_levels; lev++) { pmesh->UniformRefinement(); } - int nzones = pmesh->GetNE(), nzones_min, nzones_max; - MPI_Reduce(&nzones, &nzones_min, 1, MPI_INT, MPI_MIN, 0, pmesh->GetComm()); - MPI_Reduce(&nzones, &nzones_max, 1, MPI_INT, MPI_MAX, 0, pmesh->GetComm()); + int NE = pmesh->GetNE(), ne_min, ne_max; + MPI_Reduce(&NE, &ne_min, 1, MPI_INT, MPI_MIN, 0, pmesh->GetComm()); + MPI_Reduce(&NE, &ne_max, 1, MPI_INT, MPI_MAX, 0, pmesh->GetComm()); if (myid == 0) - { cout << "Zones min/max: " << nzones_min << " " << nzones_max << endl; } - + { cout << "Zones min/max: " << ne_min << " " << ne_max << endl; } // Define the parallel finite element spaces. 
We use: // - H1 (Gauss-Lobatto, continuous) for position and velocity. @@ -309,8 +417,8 @@ int main(int argc, char *argv[]) Array ess_bdr(pmesh->bdr_attributes.Max()), tdofs1d; for (int d = 0; d < pmesh->Dimension(); d++) { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. + // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, + // i.e., we must enforce v_x/y/z = 0 for the velocity components. ess_bdr = 0; ess_bdr[d] = 1; H1FESpace.GetEssentialTrueDofs(ess_bdr, tdofs1d, d); ess_tdofs.Append(tdofs1d); @@ -337,9 +445,8 @@ int main(int argc, char *argv[]) return 3; } - HYPRE_Int glob_size_l2 = L2FESpace.GlobalTrueVSize(); - HYPRE_Int glob_size_h1 = H1FESpace.GlobalTrueVSize(); - + const HYPRE_Int glob_size_l2 = L2FESpace.GlobalTrueVSize(); + const HYPRE_Int glob_size_h1 = H1FESpace.GlobalTrueVSize(); if (mpi.Root()) { cout << "Number of kinematic (position, velocity) dofs: " @@ -348,23 +455,21 @@ int main(int argc, char *argv[]) << glob_size_l2 << endl; } - int Vsize_l2 = L2FESpace.GetVSize(); - int Vsize_h1 = H1FESpace.GetVSize(); - // The monolithic BlockVector stores unknown fields as: // - 0 -> position // - 1 -> velocity // - 2 -> specific internal energy - + const int Vsize_l2 = L2FESpace.GetVSize(); + const int Vsize_h1 = H1FESpace.GetVSize(); Array true_offset(4); true_offset[0] = 0; true_offset[1] = true_offset[0] + Vsize_h1; true_offset[2] = true_offset[1] + Vsize_h1; true_offset[3] = true_offset[2] + Vsize_l2; - BlockVector S(true_offset); + BlockVector S(true_offset, Device::GetMemoryType()); // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always + // internal energy. There is no function for the density, as we can always // compute the density values given the current mesh position, using the // property of pointwise mass conservation. 
ParGridFunction x_gf, v_gf, e_gf; @@ -374,24 +479,28 @@ int main(int argc, char *argv[]) // Initialize x_gf using the starting mesh coordinates. pmesh->SetNodalGridFunction(&x_gf); + // Sync the data location of x_gf with its base, S + x_gf.SyncAliasMemory(S); // Initialize the velocity. VectorFunctionCoefficient v_coeff(pmesh->Dimension(), v0); v_gf.ProjectCoefficient(v_coeff); + // Sync the data location of v_gf with its base, S + v_gf.SyncAliasMemory(S); // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. Then we do an + // a non-positive basis to get the correct values at the dofs. Then we do an // L2 projection to the positive basis in which we actually compute. The goal // is to get a high-order representation of the initial condition. Note that // this density is a temporary function and it will not be updated during the // time evolution. - ParGridFunction rho(&L2FESpace); - FunctionCoefficient rho_coeff(rho0); + ParGridFunction rho0_gf(&L2FESpace); + FunctionCoefficient rho0_coeff(rho0); L2_FECollection l2_fec(order_e, pmesh->Dimension()); ParFiniteElementSpace l2_fes(pmesh, &l2_fec); - ParGridFunction l2_rho(&l2_fes), l2_e(&l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); + ParGridFunction l2_rho0_gf(&l2_fes), l2_e(&l2_fes); + l2_rho0_gf.ProjectCoefficient(rho0_coeff); + rho0_gf.ProjectGridFunction(l2_rho0_gf); if (problem == 1) { // For the Sedov test, we use a delta function at the origin. @@ -405,72 +514,72 @@ int main(int argc, char *argv[]) l2_e.ProjectCoefficient(e_coeff); } e_gf.ProjectGridFunction(l2_e); + // Sync the data location of e_gf with its base, S + e_gf.SyncAliasMemory(S); // Piecewise constant ideal gas coefficient over the Lagrangian mesh. The - // gamma values are projected on a function that stays constant on the moving - // mesh. + // gamma values are projected on function that's constant on the moving mesh. 
L2_FECollection mat_fec(0, pmesh->Dimension()); ParFiniteElementSpace mat_fes(pmesh, &mat_fec); ParGridFunction mat_gf(&mat_fes); FunctionCoefficient mat_coeff(gamma); mat_gf.ProjectCoefficient(mat_coeff); - GridFunctionCoefficient *mat_gf_coeff = new GridFunctionCoefficient(&mat_gf); // Additional details, depending on the problem. int source = 0; bool visc = true; switch (problem) { - case 0: if (pmesh->Dimension() == 2) { source = 1; } - visc = false; break; + case 0: if (pmesh->Dimension() == 2) { source = 1; } visc = false; break; case 1: visc = true; break; case 2: visc = true; break; - case 3: visc = true; break; + case 3: visc = true; S.HostRead(); break; case 4: visc = false; break; + case 5: visc = true; break; + case 6: visc = true; break; default: MFEM_ABORT("Wrong problem specification!"); } if (impose_visc) { visc = true; } - LagrangianHydroOperator oper(S.Size(), H1FESpace, L2FESpace, - ess_tdofs, rho, source, cfl, mat_gf_coeff, - visc, p_assembly, cg_tol, cg_max_iter, ftz_tol, - H1FEC.GetBasisType()); + hydrodynamics::LagrangianHydroOperator hydro(S.Size(), + H1FESpace, L2FESpace, ess_tdofs, + rho0_coeff, rho0_gf, + mat_coeff, mat_gf, + source, cfl, + visc, p_assembly, + cg_tol, cg_max_iter, ftz_tol, + order_q); socketstream vis_rho, vis_v, vis_e; char vishost[] = "localhost"; int visport = 19916; ParGridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - const double energy_init = oper.InternalEnergy(e_gf) + - oper.KineticEnergy(v_gf); + if (visualization || visit) { hydro.ComputeDensity(rho_gf); } + const double energy_init = hydro.InternalEnergy(e_gf) + + hydro.KineticEnergy(v_gf); if (visualization) { // Make sure all MPI ranks have sent their 'v' solution before initiating // another set of GLVis connections (one from each rank): MPI_Barrier(pmesh->GetComm()); - vis_rho.precision(8); vis_v.precision(8); vis_e.precision(8); - int Wx = 0, Wy = 0; // window position const int Ww = 350, Wh = 350; // window size int 
offx = Ww+10; // window offsets - if (problem != 0 && problem != 4) { - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_rho, vishost, visport, rho_gf, + "Density", Wx, Wy, Ww, Wh); } - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_v, vishost, visport, v_gf, + "Velocity", Wx, Wy, Ww, Wh); Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_e, vishost, visport, e_gf, + "Specific Internal Energy", Wx, Wy, Ww, Wh); } // Save data for VisIt visualization. @@ -488,12 +597,15 @@ int main(int argc, char *argv[]) // Perform time-integration (looping over the time iterations, ti, with a // time-step dt). The object oper is of type LagrangianHydroOperator that // defines the Mult() method that used by the time integrators. - ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; + ode_solver->Init(hydro); + hydro.ResetTimeStepEstimate(); + double t = 0.0, dt = hydro.GetTimeStepEstimate(S), t_old; bool last_step = false; int steps = 0; BlockVector S_old(S); + long mem=0, mmax=0, msum=0; + int checks = 0; + for (int ti = 1; !last_step; ti++) { if (t + dt >= t_final) @@ -502,10 +614,9 @@ int main(int argc, char *argv[]) last_step = true; } if (steps == max_tsteps) { last_step = true; } - S_old = S; t_old = t; - oper.ResetTimeStepEstimate(); + hydro.ResetTimeStepEstimate(); // S is the vector of dofs, t is the current time, and dt is the time step // to advance. @@ -513,23 +624,30 @@ int main(int argc, char *argv[]) steps++; // Adaptive time step control. 
- const double dt_est = oper.GetTimeStepEstimate(S); + const double dt_est = hydro.GetTimeStepEstimate(S); if (dt_est < dt) { // Repeat (solve again) with a decreased time step - decrease of the // time estimate suggests appearance of oscillations. dt *= 0.85; - if (dt < numeric_limits::epsilon()) + if (dt < std::numeric_limits::epsilon()) { MFEM_ABORT("The time step crashed!"); } t = t_old; S = S_old; - oper.ResetQuadratureData(); + hydro.ResetQuadratureData(); if (mpi.Root()) { cout << "Repeating step " << ti << endl; } if (steps < max_tsteps) { last_step = false; } ti--; continue; } else if (dt_est > 1.25 * dt) { dt *= 1.02; } + // Ensure the sub-vectors x_gf, v_gf, and e_gf know the location of the + // data in S. This operation simply updates the Memory validity flags of + // the sub-vectors to match those of S. + x_gf.SyncAliasMemory(S); + v_gf.SyncAliasMemory(S); + e_gf.SyncAliasMemory(S); + // Make sure that the mesh corresponds to the new solution state. This is // needed, because some time integrators use different S-type vectors // and the oper object might have redirected the mesh positions to those. 
@@ -537,42 +655,53 @@ int main(int argc, char *argv[]) if (last_step || (ti % vis_steps) == 0) { - double loc_norm = e_gf * e_gf, tot_norm; - MPI_Allreduce(&loc_norm, &tot_norm, 1, MPI_DOUBLE, MPI_SUM, - pmesh->GetComm()); + double lnorm = e_gf * e_gf, norm; + MPI_Allreduce(&lnorm, &norm, 1, MPI_DOUBLE, MPI_SUM, pmesh->GetComm()); + if (mem_usage) + { + mem = GetMaxRssMB(); + MPI_Reduce(&mem, &mmax, 1, MPI_LONG, MPI_MAX, 0, pmesh->GetComm()); + MPI_Reduce(&mem, &msum, 1, MPI_LONG, MPI_SUM, 0, pmesh->GetComm()); + } if (mpi.Root()) { - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(tot_norm) << endl; + const double sqrt_norm = sqrt(norm); + cout << std::fixed; + cout << "step " << std::setw(5) << ti + << ",\tt = " << std::setw(5) << std::setprecision(4) << t + << ",\tdt = " << std::setw(5) << std::setprecision(6) << dt + << ",\t|e| = " << std::setprecision(10) << std::scientific + << sqrt_norm; + cout << std::fixed; + if (mem_usage) + { + cout << ", mem: " << mmax << "/" << msum << " MB"; + } + cout << endl; } // Make sure all ranks have sent their 'v' solution before initiating // another set of GLVis connections (one from each rank): MPI_Barrier(pmesh->GetComm()); - if (visualization || visit || gfprint) { oper.ComputeDensity(rho_gf); } + if (visualization || visit || gfprint) { hydro.ComputeDensity(rho_gf); } if (visualization) { int Wx = 0, Wy = 0; // window position int Ww = 350, Wh = 350; // window size int offx = Ww+10; // window offsets - if (problem != 0 && problem != 4) { - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_rho, vishost, visport, rho_gf, + "Density", Wx, Wy, Ww, Wh); } - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_v, vishost, visport, + v_gf, 
"Velocity", Wx, Wy, Ww, Wh); Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww,Wh); + hydrodynamics::VisualizeField(vis_e, vishost, visport, e_gf, + "Specific Internal Energy", + Wx, Wy, Ww,Wh); Wx += offx; } @@ -585,38 +714,56 @@ int main(int argc, char *argv[]) if (gfprint) { - ostringstream mesh_name, rho_name, v_name, e_name; + std::ostringstream mesh_name, rho_name, v_name, e_name; mesh_name << basename << "_" << ti - << "_mesh." << setfill('0') << setw(6) << myid; + << "_mesh." << std::setfill('0') << std::setw(6) << myid; rho_name << basename << "_" << ti - << "_rho." << setfill('0') << setw(6) << myid; + << "_rho." << std::setfill('0') << std::setw(6) << myid; v_name << basename << "_" << ti - << "_v." << setfill('0') << setw(6) << myid; + << "_v." << std::setfill('0') << std::setw(6) << myid; e_name << basename << "_" << ti - << "_e." << setfill('0') << setw(6) << myid; + << "_e." << std::setfill('0') << std::setw(6) << myid; - ofstream mesh_ofs(mesh_name.str().c_str()); + std::ofstream mesh_ofs(mesh_name.str().c_str()); mesh_ofs.precision(8); pmesh->Print(mesh_ofs); mesh_ofs.close(); - ofstream rho_ofs(rho_name.str().c_str()); + std::ofstream rho_ofs(rho_name.str().c_str()); rho_ofs.precision(8); rho_gf.Save(rho_ofs); rho_ofs.close(); - ofstream v_ofs(v_name.str().c_str()); + std::ofstream v_ofs(v_name.str().c_str()); v_ofs.precision(8); v_gf.Save(v_ofs); v_ofs.close(); - ofstream e_ofs(e_name.str().c_str()); + std::ofstream e_ofs(e_name.str().c_str()); e_ofs.precision(8); e_gf.Save(e_ofs); e_ofs.close(); } } + + // Problems checks + if (check) + { + double lnorm = e_gf * e_gf, norm; + MPI_Allreduce(&lnorm, &norm, 1, MPI_DOUBLE, MPI_SUM, pmesh->GetComm()); + const double e_norm = sqrt(norm); + MFEM_VERIFY(rs_levels==0 && rp_levels==0, "check: rs, rp"); + MFEM_VERIFY(order_v==2, "check: order_v"); + MFEM_VERIFY(order_e==1, "check: order_e"); + MFEM_VERIFY(ode_solver_type==4, "check: ode_solver_type"); + 
MFEM_VERIFY(t_final == 0.6, "check: t_final"); + MFEM_VERIFY(cfl==0.5, "check: cfl"); + MFEM_VERIFY(strncmp(mesh_file, "default", 7) == 0, "check: mesh_file"); + MFEM_VERIFY(dim==2 || dim==3, "check: dimension"); + Checks(dim, ti, e_norm, checks); + } } + MFEM_VERIFY(!check || checks == 2, "Check error!"); switch (ode_solver_type) { @@ -626,15 +773,31 @@ int main(int argc, char *argv[]) case 6: steps *= 6; break; case 7: steps *= 2; } - oper.PrintTimingData(mpi.Root(), steps); - const double energy_final = oper.InternalEnergy(e_gf) + - oper.KineticEnergy(v_gf); + hydro.PrintTimingData(mpi.Root(), steps, fom); + + if (mem_usage) + { + mem = GetMaxRssMB(); + MPI_Reduce(&mem, &mmax, 1, MPI_LONG, MPI_MAX, 0, pmesh->GetComm()); + MPI_Reduce(&mem, &msum, 1, MPI_LONG, MPI_SUM, 0, pmesh->GetComm()); + } + + const double energy_final = hydro.InternalEnergy(e_gf) + + hydro.KineticEnergy(v_gf); if (mpi.Root()) { cout << endl; - cout << "Energy diff: " << scientific << setprecision(2) - << fabs(energy_init - energy_final) << endl; + if (!p_assembly) + { + cout << "Energy diff: " << std::scientific << std::setprecision(2) + << fabs(energy_init - energy_final) << endl; + } + if (mem_usage) + { + cout << "Maximum memory resident set size: " + << mmax << "/" << msum << " MB" << endl; + } } // Print the error. @@ -661,7 +824,6 @@ int main(int argc, char *argv[]) // Free the used memory. delete ode_solver; delete pmesh; - delete mat_gf_coeff; return 0; } @@ -675,6 +837,18 @@ double rho0(const Vector &x) case 2: return (x(0) < 0.5) ? 1.0 : 0.1; case 3: return (x(0) > 1.0 && x(1) > 1.5) ? 
0.125 : 1.0; case 4: return 1.0; + case 5: + { + if (x(0) >= 0.5 && x(1) >= 0.5) { return 0.5313; } + if (x(0) < 0.5 && x(1) < 0.5) { return 0.8; } + return 1.0; + } + case 6: + { + if (x(0) < 0.5 && x(1) >= 0.5) { return 2.0; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 3.0; } + return 1.0; + } default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } @@ -688,17 +862,17 @@ double gamma(const Vector &x) case 2: return 1.4; case 3: return (x(0) > 1.0 && x(1) <= 1.5) ? 1.4 : 1.5; case 4: return 5.0 / 3.0; + case 5: return 1.4; + case 6: return 1.4; default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } -double rad(double x, double y) -{ - return sqrt(x*x + y*y); -} +static double rad(double x, double y) { return sqrt(x*x + y*y); } void v0(const Vector &x, Vector &v) { + const double atn = pow((x(0)*(1.0-x(0))*4*x(1)*(1.0-x(1))*4.0),0.4); switch (problem) { case 0: @@ -716,6 +890,7 @@ void v0(const Vector &x, Vector &v) case 3: v = 0.0; break; case 4: { + v = 0.0; const double r = rad(x(0), x(1)); if (r < 0.2) { @@ -727,9 +902,29 @@ void v0(const Vector &x, Vector &v) v(0) = 2.0 * x(1) / r - 5.0 * x(1); v(1) = -2.0 * x(0) / r + 5.0 * x(0); } - else { v = 0.0; } + else { } break; } + case 5: + { + v = 0.0; + if (x(0) >= 0.5 && x(1) >= 0.5) { v(0)=0.0*atn, v(1)=0.0*atn; return;} + if (x(0) < 0.5 && x(1) >= 0.5) { v(0)=0.7276*atn, v(1)=0.0*atn; return;} + if (x(0) < 0.5 && x(1) < 0.5) { v(0)=0.0*atn, v(1)=0.0*atn; return;} + if (x(0) >= 0.5 && x(1) < 0.5) { v(0)=0.0*atn, v(1)=0.7276*atn; return; } + MFEM_ABORT("Error in problem 5!"); + return; + } + case 6: + { + v = 0.0; + if (x(0) >= 0.5 && x(1) >= 0.5) { v(0)=+0.75*atn, v(1)=-0.5*atn; return;} + if (x(0) < 0.5 && x(1) >= 0.5) { v(0)=+0.75*atn, v(1)=+0.5*atn; return;} + if (x(0) < 0.5 && x(1) < 0.5) { v(0)=-0.75*atn, v(1)=+0.5*atn; return;} + if (x(0) >= 0.5 && x(1) < 0.5) { v(0)=-0.75*atn, v(1)=-0.5*atn; return;} + MFEM_ABORT("Error in problem 6!"); + return; + } default: 
MFEM_ABORT("Bad number given for problem id!"); } } @@ -774,11 +969,31 @@ double e0(const Vector &x) } else { return (3.0 + 4.0 * log(2.0)) / (gamma - 1.0); } } + case 5: + { + const double irg = 1.0 / rho0(x) / (gamma(x) - 1.0); + if (x(0) >= 0.5 && x(1) >= 0.5) { return 0.4 * irg; } + if (x(0) < 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) < 0.5) { return 1.0 * irg; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 1.0 * irg; } + MFEM_ABORT("Error in problem 5!"); + return 0.0; + } + case 6: + { + const double irg = 1.0 / rho0(x) / (gamma(x) - 1.0); + if (x(0) >= 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) < 0.5) { return 1.0 * irg; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 1.0 * irg; } + MFEM_ABORT("Error in problem 5!"); + return 0.0; + } default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } -void display_banner(ostream & os) +static void display_banner(std::ostream &os) { os << endl << " __ __ " << endl @@ -788,3 +1003,91 @@ void display_banner(ostream & os) << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl << " /____/ " << endl << endl; } + +static long GetMaxRssMB() +{ + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { return -1; } +#ifndef __APPLE__ + const long unit = 1024; // kilo +#else + const long unit = 1024*1024; // mega +#endif + return usage.ru_maxrss/unit; // mega bytes +} + +static bool rerr(const double a, const double v, const double eps) +{ + MFEM_VERIFY(fabs(a) > eps && fabs(v) > eps, "One value is near zero!"); + const double err_a = fabs((a-v)/a); + const double err_v = fabs((a-v)/v); + return fmax(err_a, err_v) < eps; +} + +static void Checks(const int dim, const int ti, const double nrm, int &chk) +{ + const int pb = problem; + const double eps = 1.e-13; + if (dim==2) + { + const double p0_05 = 6.54653862453438e+00; + const double p0_27 = 7.58857635779292e+00; + if (pb==0 && ti==05) {chk++; 
MFEM_VERIFY(rerr(nrm,p0_05,eps),"P0, #05");} + if (pb==0 && ti==27) {chk++; MFEM_VERIFY(rerr(nrm,p0_27,eps),"P0, #27");} + const double p1_05 = 3.50825494522579e+00; + const double p1_15 = 2.75644459682321e+00; + if (pb==1 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p1_05,eps),"P1, #05");} + if (pb==1 && ti==15) {chk++; MFEM_VERIFY(rerr(nrm,p1_15,eps),"P1, #15");} + const double p2_05 = 1.02074579565124e+01; + const double p2_59 = 1.72159020590190e+01; + if (pb==2 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p2_05,eps),"P2, #05");} + if (pb==2 && ti==59) {chk++; MFEM_VERIFY(rerr(nrm,p2_59,eps),"P2, #59");} + const double p3_05 = 8.0; + const double p3_16 = 8.0; + if (pb==3 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p3_05,eps),"P3, #05");} + if (pb==3 && ti==16) {chk++; MFEM_VERIFY(rerr(nrm,p3_16,eps),"P3, #16");} + const double p4_05 = 3.436923188323578e+01; + const double p4_52 = 2.682244912720685e+01; + if (pb==4 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p4_05,eps),"P4, #05");} + if (pb==4 && ti==52) {chk++; MFEM_VERIFY(rerr(nrm,p4_52,eps),"P4, #52");} + const double p5_05 = 1.030899557252528e+01; + const double p5_36 = 1.057362418574309e+01; + if (pb==5 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p5_05,eps),"P5, #05");} + if (pb==5 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p5_36,eps),"P5, #36");} + const double p6_05 = 8.039707010835693e+00; + const double p6_36 = 8.316970976817373e+00; + if (pb==6 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p6_05,eps),"P6, #05");} + if (pb==6 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p6_36,eps),"P6, #36");} + } + if (dim==3) + { + const double p0_05 = 1.198510951452527e+03; + const double p0_188 = 1.199384410059154e+03; + if (pb==0 && ti==005) {chk++; MFEM_VERIFY(rerr(nrm,p0_05,eps),"P0, #05");} + if (pb==0 && ti==188) {chk++; MFEM_VERIFY(rerr(nrm,p0_188,eps),"P0, #188");} + const double p1_05 = 1.33916371859257e+01; + const double p1_28 = 7.52107367739800e+00; + if (pb==1 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p1_05,eps),"P1, #05");} + if (pb==1 && 
ti==28) {chk++; MFEM_VERIFY(rerr(nrm,p1_28,eps),"P1, #28");} + const double p2_05 = 2.041491591302486e+01; + const double p2_59 = 3.443180411803796e+01; + if (pb==2 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p2_05,eps),"P2, #05");} + if (pb==2 && ti==59) {chk++; MFEM_VERIFY(rerr(nrm,p2_59,eps),"P2, #59");} + const double p3_05 = 1.600000000000000e+01; + const double p3_16 = 1.600000000000000e+01; + if (pb==3 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p3_05,eps),"P3, #05");} + if (pb==3 && ti==16) {chk++; MFEM_VERIFY(rerr(nrm,p3_16,eps),"P3, #16");} + const double p4_05 = 6.873846376647157e+01; + const double p4_52 = 5.364489825441373e+01; + if (pb==4 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p4_05,eps),"P4, #05");} + if (pb==4 && ti==52) {chk++; MFEM_VERIFY(rerr(nrm,p4_52,eps),"P4, #52");} + const double p5_05 = 2.061984481890964e+01; + const double p5_36 = 2.114519664792607e+01; + if (pb==5 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p5_05,eps),"P5, #05");} + if (pb==5 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p5_36,eps),"P5, #36");} + const double p6_05 = 1.607988713996459e+01; + const double p6_36 = 1.662736010353023e+01; + if (pb==6 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p6_05,eps),"P6, #05");} + if (pb==6 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p6_36,eps),"P6, #36");} + } +} diff --git a/laghos_assembly.cpp b/laghos_assembly.cpp index 69b073bd..60ee07f3 100644 --- a/laghos_assembly.cpp +++ b/laghos_assembly.cpp @@ -15,8 +15,7 @@ // testbed platforms, in support of the nation's exascale computing imperative. 
#include "laghos_assembly.hpp" - -using namespace std; +#include namespace mfem { @@ -24,1163 +23,947 @@ namespace mfem namespace hydrodynamics { -Tensors1D::Tensors1D(int H1order, int L2order, int nqp1D, bool bernstein_v) - : HQshape1D(H1order + 1, nqp1D), - HQgrad1D(H1order + 1, nqp1D), - LQshape1D(L2order + 1, nqp1D) +void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, + ElementTransformation &Tr, + Vector &elvect) { - // In this miniapp we assume: - // - Gauss-Legendre quadrature points. - // - Gauss-Lobatto OR Bernstein continuous kinematic basis. - // - Bernstein discontinuous thermodynamic basis. - - const double *quad1D_pos = poly1d.GetPoints(nqp1D - 1, - Quadrature1D::GaussLegendre); - Poly_1D::Basis &basisH1 = poly1d.GetBasis(H1order, - Quadrature1D::GaussLobatto); - Vector col, grad_col; - for (int q = 0; q < nqp1D; q++) - { - HQshape1D.GetColumnReference(q, col); - HQgrad1D.GetColumnReference(q, grad_col); - if (bernstein_v) - { - poly1d.CalcBernstein(H1order, quad1D_pos[q], - col.GetData(), grad_col.GetData()); - } - else { basisH1.Eval(quad1D_pos[q], col, grad_col); } - } - for (int q = 0; q < nqp1D; q++) + const int nqp = IntRule->GetNPoints(); + Vector shape(fe.GetDof()); + elvect.SetSize(fe.GetDof()); + elvect = 0.0; + for (int q = 0; q < nqp; q++) { - LQshape1D.GetColumnReference(q, col); - poly1d.CalcBernstein(L2order, quad1D_pos[q], col); + fe.CalcShape(IntRule->IntPoint(q), shape); + // Note that rhoDetJ = rho0DetJ0. 
+ shape *= qdata.rho0DetJ0w(Tr.ElementNo*nqp + q); + elvect += shape; } } -void FastEvaluator::GetL2Values(const Vector &vecL2, Vector &vecQ) const +void ForceIntegrator::AssembleElementMatrix2(const FiniteElement &trial_fe, + const FiniteElement &test_fe, + ElementTransformation &Tr, + DenseMatrix &elmat) { - const int nL2dof1D = tensors1D->LQshape1D.Height(), - nqp1D = tensors1D->LQshape1D.Width(); - if (dim == 2) - { - DenseMatrix E(vecL2.GetData(), nL2dof1D, nL2dof1D); - DenseMatrix LQ(nL2dof1D, nqp1D); - - vecQ.SetSize(nqp1D * nqp1D); - DenseMatrix QQ(vecQ.GetData(), nqp1D, nqp1D); - - // LQ_j2_k1 = E_j1_j2 LQs_j1_k1 -- contract in x direction. - // QQ_k1_k2 = LQ_j2_k1 LQs_j2_k2 -- contract in y direction. - MultAtB(E, tensors1D->LQshape1D, LQ); - MultAtB(LQ, tensors1D->LQshape1D, QQ); - } - else + const int e = Tr.ElementNo; + const int nqp = IntRule->GetNPoints(); + const int dim = trial_fe.GetDim(); + const int h1dofs_cnt = test_fe.GetDof(); + const int l2dofs_cnt = trial_fe.GetDof(); + elmat.SetSize(h1dofs_cnt*dim, l2dofs_cnt); + elmat = 0.0; + DenseMatrix vshape(h1dofs_cnt, dim), loc_force(h1dofs_cnt, dim); + Vector shape(l2dofs_cnt), Vloc_force(loc_force.Data(), h1dofs_cnt*dim); + for (int q = 0; q < nqp; q++) { - DenseMatrix E(vecL2.GetData(), nL2dof1D*nL2dof1D, nL2dof1D); - DenseMatrix LL_Q(nL2dof1D * nL2dof1D, nqp1D), - L_LQ(LL_Q.GetData(), nL2dof1D, nL2dof1D*nqp1D), - Q_LQ(nqp1D, nL2dof1D*nqp1D); - - vecQ.SetSize(nqp1D * nqp1D * nqp1D); - DenseMatrix QQ_Q(vecQ.GetData(), nqp1D * nqp1D, nqp1D); - - // LLQ_j1_j2_k3 = E_j1_j2_j3 LQs_j3_k3 -- contract in z direction. - // QLQ_k1_j2_k3 = LQs_j1_k1 LLQ_j1_j2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QLQ_k1_j2_k3 LQs_j2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(E, tensors1D->LQshape1D, LL_Q); - MultAtB(tensors1D->LQshape1D, L_LQ, Q_LQ); - for (int k1 = 0; k1 < nqp1D; k1++) + const IntegrationPoint &ip = IntRule->IntPoint(q); + // Form stress:grad_shape at the current point. + test_fe.CalcDShape(ip, vshape); + for (int i = 0; i < h1dofs_cnt; i++) { - for (int k2 = 0; k2 < nqp1D; k2++) + for (int vd = 0; vd < dim; vd++) // Velocity components. { - for (int k3 = 0; k3 < nqp1D; k3++) + loc_force(i, vd) = 0.0; + for (int gd = 0; gd < dim; gd++) // Gradient components. { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int j2 = 0; j2 < nL2dof1D; j2++) - { - QQ_Q(k1 + nqp1D*k2, k3) += - Q_LQ(k1, j2 + k3*nL2dof1D) * tensors1D->LQshape1D(j2, k2); - } + const int eq = e*nqp + q; + const double stressJinvT = qdata.stressJinvT(vd)(eq, gd); + loc_force(i, vd) += stressJinvT * vshape(i,gd); } } } + trial_fe.CalcShape(ip, shape); + AddMultVWt(Vloc_force, shape, elmat); } } -void FastEvaluator::GetVectorGrad(const DenseMatrix &vec, DenseTensor &J) const +MassPAOperator::MassPAOperator(ParFiniteElementSpace &pfes, + const IntegrationRule &ir, + Coefficient &Q) : + Operator(pfes.GetTrueVSize()), + comm(pfes.GetParMesh()->GetComm()), + dim(pfes.GetMesh()->Dimension()), + NE(pfes.GetMesh()->GetNE()), + vsize(pfes.GetVSize()), + pabf(&pfes), + ess_tdofs_count(0), + ess_tdofs(0) { - const int nH1dof1D = tensors1D->HQshape1D.Height(), - nqp1D = tensors1D->LQshape1D.Width(); - DenseMatrix X; + pabf.SetAssemblyLevel(AssemblyLevel::PARTIAL); + pabf.AddDomainIntegrator(new mfem::MassIntegrator(Q, &ir)); + pabf.Assemble(); + pabf.FormSystemMatrix(mfem::Array(), mass); +} - if (dim == 2) +void MassPAOperator::SetEssentialTrueDofs(Array &dofs) +{ + ess_tdofs_count = dofs.Size(); + if (ess_tdofs.Size() == 0) { - const int nH1dof = nH1dof1D * nH1dof1D; - DenseMatrix HQ(nH1dof1D, nqp1D), QQ(nqp1D, nqp1D); - Vector x(nH1dof); + int ess_tdofs_sz; + MPI_Allreduce(&ess_tdofs_count,&ess_tdofs_sz, 1, MPI_INT, MPI_SUM, comm); + MFEM_ASSERT(ess_tdofs_sz > 0, 
"ess_tdofs_sz should be positive!"); + ess_tdofs.SetSize(ess_tdofs_sz); + } + if (ess_tdofs_count == 0) { return; } + ess_tdofs = dofs; +} - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); +void MassPAOperator::EliminateRHS(Vector &b) const +{ + if (ess_tdofs_count > 0) { b.SetSubVector(ess_tdofs, 0.0); } +} - for (int c = 0; c < 2; c++) - { - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. - for (int j = 0; j < nH1dof; j++) { x[j] = vec(dof_map[j], c); } - X.UseExternalData(x.GetData(), nH1dof1D, nH1dof1D); +void MassPAOperator::Mult(const Vector &x, Vector &y) const +{ + mass->Mult(x, y); + if (ess_tdofs_count > 0) { y.SetSubVector(ess_tdofs, 0.0); } +} - // HQ_i2_k1 = X_i1_i2 HQg_i1_k1 -- gradients in x direction. - // QQ_k1_k2 = HQ_i2_k1 HQs_i2_k2 -- contract in y direction. - MultAtB(X, tensors1D->HQgrad1D, HQ); - MultAtB(HQ, tensors1D->HQshape1D, QQ); +ForcePAOperator::ForcePAOperator(const QuadratureData &qdata, + ParFiniteElementSpace &h1, + ParFiniteElementSpace &l2, + const IntegrationRule &ir) : + Operator(), + dim(h1.GetMesh()->Dimension()), + NE(h1.GetMesh()->GetNE()), + qdata(qdata), + H1(h1), + L2(l2), + H1R(H1.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + L2R(L2.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + ir1D(IntRules.Get(Geometry::SEGMENT, ir.GetOrder())), + D1D(H1.GetFE(0)->GetOrder()+1), + Q1D(ir1D.GetNPoints()), + L1D(L2.GetFE(0)->GetOrder()+1), + H1sz(H1.GetVDim() * H1.GetFE(0)->GetDof() * NE), + L2sz(L2.GetFE(0)->GetDof() * NE), + L2D2Q(&L2.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)), + H1D2Q(&H1.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)), + X(L2sz), Y(H1sz) { } + +template static +void ForceMult2D(const int NE, + const Array &B_, + const Array &Bt_, + const Array &Gt_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) +{ + auto b = Reshape(B_.Read(), Q1D, L1D); + auto bt = Reshape(Bt_.Read(), D1D, 
Q1D); + auto gt = Reshape(Gt_.Read(), D1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, NE, DIM, DIM); + auto energy = Reshape(x.Read(), L1D, L1D, NE); + const double eps1 = std::numeric_limits::epsilon(); + const double eps2 = eps1*eps1; + auto velocity = Reshape(y.Write(), D1D, D1D, DIM, NE); + + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + const int z = MFEM_THREAD_ID(z); - // Set the (c,0) component of the Jacobians at all quadrature points. - for (int k1 = 0; k1 < nqp1D; k1++) - { - for (int k2 = 0; k2 < nqp1D; k2++) - { - const int idx = k2 * nqp1D + k1; - J(idx)(c, 0) = QQ(k1, k2); - } - } + MFEM_SHARED double B[Q1D][L1D]; + MFEM_SHARED double Bt[D1D][Q1D]; + MFEM_SHARED double Gt[D1D][Q1D]; - // HQ_i2_k1 = X_i1_i2 HQs_i1_k1 -- contract in x direction. - // QQ_k1_k2 = HQ_i2_k1 HQg_i2_k2 -- gradients in y direction. - MultAtB(X, tensors1D->HQshape1D, HQ); - MultAtB(HQ, tensors1D->HQgrad1D, QQ); + MFEM_SHARED double Ez[NBZ][L1D][L1D]; + double (*E)[L1D] = (double (*)[L1D])(Ez + z); - // Set the (c,1) component of the Jacobians at all quadrature points. 
- for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SHARED double LQz[2][NBZ][D1D][Q1D]; + double (*LQ0)[Q1D] = (double (*)[Q1D])(LQz[0] + z); + double (*LQ1)[Q1D] = (double (*)[Q1D])(LQz[1] + z); + + MFEM_SHARED double QQz[3][NBZ][Q1D][Q1D]; + double (*QQ)[Q1D] = (double (*)[Q1D])(QQz[0] + z); + double (*QQ0)[Q1D] = (double (*)[Q1D])(QQz[1] + z); + double (*QQ1)[Q1D] = (double (*)[Q1D])(QQz[2] + z); + + if (z == 0) + { + MFEM_FOREACH_THREAD(q,x,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(l,y,Q1D) { - const int idx = k2 * nqp1D + k1; - J(idx)(c, 1) = QQ(k1, k2); + if (l < L1D) { B[q][l] = b(q,l); } + if (l < D1D) { Bt[l][q] = bt(l,q); } + if (l < D1D) { Gt[l][q] = gt(l,q); } } } } - } - else - { - const int nH1dof = nH1dof1D * nH1dof1D * nH1dof1D; - DenseMatrix HH_Q(nH1dof1D * nH1dof1D, nqp1D), - H_HQ(HH_Q.GetData(), nH1dof1D, nH1dof1D * nqp1D), - Q_HQ(nqp1D, nH1dof1D*nqp1D), QQ_Q(nqp1D * nqp1D, nqp1D); - Vector x(nH1dof); - - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); + MFEM_SYNC_THREAD; - for (int c = 0; c < 3; c++) + MFEM_FOREACH_THREAD(lx,x,L1D) { - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. - for (int j = 0; j < nH1dof; j++) { x[j] = vec(dof_map[j], c); } - X.UseExternalData(x.GetData(), nH1dof1D * nH1dof1D, nH1dof1D); + MFEM_FOREACH_THREAD(ly,y,L1D) + { + E[lx][ly] = energy(lx,ly,e); + } + } + MFEM_SYNC_THREAD; - // HHQ_i1_i2_k3 = X_i1_i2_i3 HQs_i3_k3 -- contract in z direction. - // QHQ_k1_i2_k3 = HQg_i1_k1 HHQ_i1_i2_k3 -- gradients in x direction. - // QQQ_k1_k2_k3 = QHQ_k1_i2_k3 HQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(X, tensors1D->HQshape1D, HH_Q); - MultAtB(tensors1D->HQgrad1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + double u = 0.0; + for (int lx = 0; lx < L1D; ++lx) { - for (int k3 = 0; k3 < nqp1D; k3++) - { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) - { - QQ_Q(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQshape1D(i2, k2); - } - } + u += B[qx][lx] * E[lx][ly]; } + LQ0[ly][qx] = u; } - // Set the (c,0) component of the Jacobians at all quadrature points. - for (int k1 = 0; k1 < nqp1D; k1++) + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + double u = 0.0; + for (int ly = 0; ly < L1D; ++ly) { - for (int k3 = 0; k3 < nqp1D; k3++) - { - const int idx = k3*nqp1D*nqp1D + k2*nqp1D + k1; - J(idx)(c, 0) = QQ_Q(k1 + k2*nqp1D, k3); - } + u += B[qy][ly] * LQ0[ly][qx]; } + QQ[qy][qx] = u; } + } + MFEM_SYNC_THREAD; - // HHQ_i1_i2_k3 = X_i1_i2_i3 HQs_i3_k3 -- contract in z direction. - // QHQ_k1_i2_k3 = HQs_i1_k1 HHQ_i1_i2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QHQ_k1_i2_k3 HQg_i2_k2 -- gradients in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(X, tensors1D->HQshape1D, HH_Q); - MultAtB(tensors1D->HQshape1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + for (int c = 0; c < DIM; ++c) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) - { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) - { - QQ_Q(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQgrad1D(i2, k2); - } - } + const double esx = QQ[qy][qx] * sJit(qx,qy,e,0,c); + const double esy = QQ[qy][qx] * sJit(qx,qy,e,1,c); + QQ0[qy][qx] = esx; + QQ1[qy][qx] = esy; } } - // Set the (c,1) component of the Jacobians at all quadrature points. - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(dx,x,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + double u = 0.0; + double v = 0.0; + for (int qx = 0; qx < Q1D; ++qx) { - const int idx = k3*nqp1D*nqp1D + k2*nqp1D + k1; - J(idx)(c, 1) = QQ_Q(k1 + k2*nqp1D, k3); + u += Gt[dx][qx] * QQ0[qy][qx]; + v += Bt[dx][qx] * QQ1[qy][qx]; } + LQ0[dx][qy] = u; + LQ1[dx][qy] = v; } } - - // HHQ_i1_i2_k3 = X_i1_i2_i3 HQg_i3_k3 -- gradients in z direction. - // QHQ_k1_i2_k3 = HQs_i1_k1 HHQ_i1_i2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QHQ_k1_i2_k3 HQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(X, tensors1D->HQgrad1D, HH_Q); - MultAtB(tensors1D->HQshape1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dy,y,D1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(dx,x,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + double u = 0.0; + double v = 0.0; + for (int qy = 0; qy < Q1D; ++qy) { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) - { - QQ_Q(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQshape1D(i2, k2); - } + u += LQ0[dx][qy] * Bt[dy][qy]; + v += LQ1[dx][qy] * Gt[dy][qy]; } + velocity(dx,dy,c,e) = u + v; } } - // Set the (c,2) component of the Jacobians at all quadrature points. - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + } + for (int c = 0; c < DIM; ++c) + { + MFEM_FOREACH_THREAD(dy,y,D1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(dx,x,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + const double v = velocity(dx,dy,c,e); + if (fabs(v) < eps2) { - const int idx = k3*nqp1D*nqp1D + k2*nqp1D + k1; - J(idx)(c, 2) = QQ_Q(k1 + k2*nqp1D, k3); + velocity(dx,dy,c,e) = 0.0; } } } + MFEM_SYNC_THREAD; } - } + }); } -void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect) +template static +void ForceMult3D(const int NE, + const Array &B_, + const Array &Bt_, + const Array &Gt_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) { - const int ip_cnt = IntRule->GetNPoints(); - Vector shape(fe.GetDof()); + auto b = Reshape(B_.Read(), Q1D, L1D); + auto bt = Reshape(Bt_.Read(), D1D, Q1D); + auto gt = Reshape(Gt_.Read(), D1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, Q1D, NE, DIM, DIM); + auto energy = Reshape(x.Read(), L1D, L1D, L1D, NE); + const double eps1 = std::numeric_limits::epsilon(); + const double eps2 = eps1*eps1; + auto velocity = Reshape(y.Write(), D1D, D1D, D1D, 
DIM, NE); + + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + const int z = MFEM_THREAD_ID(z); - elvect.SetSize(fe.GetDof()); - elvect = 0.0; + MFEM_SHARED double B[Q1D][L1D]; + MFEM_SHARED double Bt[D1D][Q1D]; + MFEM_SHARED double Gt[D1D][Q1D]; - for (int q = 0; q < ip_cnt; q++) - { - fe.CalcShape(IntRule->IntPoint(q), shape); - // Note that rhoDetJ = rho0DetJ0. - shape *= quad_data.rho0DetJ0w(Tr.ElementNo*ip_cnt + q); - elvect += shape; - } -} + MFEM_SHARED double E[L1D][L1D][L1D]; -void ForceIntegrator::AssembleElementMatrix2(const FiniteElement &trial_fe, - const FiniteElement &test_fe, - ElementTransformation &Trans, - DenseMatrix &elmat) -{ - const int nqp = IntRule->GetNPoints(); - const int dim = trial_fe.GetDim(); - const int zone_id = Trans.ElementNo; - const int h1dofs_cnt = test_fe.GetDof(); - const int l2dofs_cnt = trial_fe.GetDof(); + MFEM_SHARED double sm0[3][Q1D*Q1D*Q1D]; + MFEM_SHARED double sm1[3][Q1D*Q1D*Q1D]; - elmat.SetSize(h1dofs_cnt*dim, l2dofs_cnt); - elmat = 0.0; + double (*MMQ0)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+0); + double (*MMQ1)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+1); + double (*MMQ2)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+2); - DenseMatrix vshape(h1dofs_cnt, dim), loc_force(h1dofs_cnt, dim); - Vector shape(l2dofs_cnt), Vloc_force(loc_force.Data(), h1dofs_cnt*dim); + double (*MQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+0); + double (*MQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+1); + double (*MQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+2); - for (int q = 0; q < nqp; q++) - { - const IntegrationPoint &ip = IntRule->IntPoint(q); + MFEM_SHARED double QQQ[Q1D][Q1D][Q1D]; + double (*QQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+0); + double (*QQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+1); + double (*QQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+2); - // Form stress:grad_shape at the current point. 
- test_fe.CalcDShape(ip, vshape); - for (int i = 0; i < h1dofs_cnt; i++) + if (z == 0) { - for (int vd = 0; vd < dim; vd++) // Velocity components. + MFEM_FOREACH_THREAD(q,x,Q1D) { - loc_force(i, vd) = 0.0; - for (int gd = 0; gd < dim; gd++) // Gradient components. + MFEM_FOREACH_THREAD(l,y,Q1D) { - loc_force(i, vd) += - quad_data.stressJinvT(vd)(zone_id*nqp + q, gd) * vshape(i,gd); + if (l < L1D) { B[q][l] = b(q,l); } + if (l < D1D) { Bt[l][q] = bt(l,q); } + if (l < D1D) { Gt[l][q] = gt(l,q); } } } } - - trial_fe.CalcShape(ip, shape); - AddMultVWt(Vloc_force, shape, elmat); - } -} - -void ForcePAOperator::Mult(const Vector &vecL2, Vector &vecH1) const -{ - if (dim == 2) { MultQuad(vecL2, vecH1); } - else if (dim == 3) { MultHex(vecL2, vecH1); } - else { MFEM_ABORT("Unsupported dimension"); } -} - -void ForcePAOperator::MultTranspose(const Vector &vecH1, Vector &vecL2) const -{ - if (dim == 2) { MultTransposeQuad(vecH1, vecL2); } - else if (dim == 3) { MultTransposeHex(vecH1, vecL2); } - else { MFEM_ABORT("Unsupported dimension"); } -} - -// Force matrix action on quadrilateral elements in 2D. -void ForcePAOperator::MultQuad(const Vector &vecL2, Vector &vecH1) const -{ - const int nH1dof1D = tensors1D->HQshape1D.Height(), - nL2dof1D = tensors1D->LQshape1D.Height(), - nqp1D = tensors1D->HQshape1D.Width(), - nqp = nqp1D * nqp1D, - nH1dof = nH1dof1D * nH1dof1D; - Array h1dofs, l2dofs; - Vector e(nL2dof1D * nL2dof1D); - DenseMatrix E(e.GetData(), nL2dof1D, nL2dof1D); - DenseMatrix LQ(nL2dof1D, nqp1D), HQ(nH1dof1D, nqp1D), QQ(nqp1D, nqp1D), - HHx(nH1dof1D, nH1dof1D), HHy(nH1dof1D, nH1dof1D); - // Quadrature data for a specific direction. - DenseMatrix QQd(nqp1D, nqp1D); - double *data_qd = QQd.GetData(), *data_q = QQ.GetData(); - - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); - - vecH1 = 0.0; - for (int z = 0; z < nzones; z++) - { - // Note that the local numbering for L2 is the tensor numbering. 
- L2FESpace.GetElementDofs(z, l2dofs); - vecL2.GetSubVector(l2dofs, e); - - // LQ_j2_k1 = E_j1_j2 LQs_j1_k1 -- contract in x direction. - // QQ_k1_k2 = LQ_j2_k1 LQs_j2_k2 -- contract in y direction. - MultAtB(E, tensors1D->LQshape1D, LQ); - MultAtB(LQ, tensors1D->LQshape1D, QQ); - - // Iterate over the components (x and y) of the result. - for (int c = 0; c < 2; c++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lx,x,L1D) { - // QQd_k1_k2 *= stress_k1_k2(c,0) -- stress that scales d[v_c]_dx. - // HQ_i2_k1 = HQs_i2_k2 QQ_k1_k2 -- contract in y direction. - // HHx_i1_i2 = HQg_i1_k1 HQ_i2_k1 -- gradients in x direction. - double *d = quad_data->stressJinvT(c).GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { data_qd[q] = data_q[q] * d[q]; } - MultABt(tensors1D->HQshape1D, QQd, HQ); - MultABt(tensors1D->HQgrad1D, HQ, HHx); - - // QQd_k1_k2 *= stress_k1_k2(c,1) -- stress that scales d[v_c]_dy. - // HQ_i2_k1 = HQg_i2_k2 QQ_k1_k2 -- gradients in y direction. - // HHy_i1_i2 = HQ_i1_k1 HQ_i2_k1 -- contract in x direction. - d = quad_data->stressJinvT(c).GetData() + 1*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { data_qd[q] = data_q[q] * d[q]; } - MultABt(tensors1D->HQgrad1D, QQd, HQ); - MultABt(tensors1D->HQshape1D, HQ, HHy); - - // Set the c-component of the result. - H1FESpace.GetElementVDofs(z, h1dofs); - for (int i1 = 0; i1 < nH1dof1D; i1++) + MFEM_FOREACH_THREAD(ly,y,L1D) { - for (int i2 = 0; i2 < nH1dof1D; i2++) + MFEM_FOREACH_THREAD(lz,z,L1D) { - // Transfer from the mfem's H1 local numbering to the tensor - // structure numbering. - const int idx = i2 * nH1dof1D + i1; - vecH1[h1dofs[c*nH1dof + dof_map[idx]]] += - HHx(i1, i2) + HHy(i1, i2); + E[lx][ly][lz] = energy(lx,ly,lz,e); } } } - } -} - -// Force matrix action on hexahedral elements in 3D. 
-void ForcePAOperator::MultHex(const Vector &vecL2, Vector &vecH1) const -{ - const int nH1dof1D = tensors1D->HQshape1D.Height(), - nL2dof1D = tensors1D->LQshape1D.Height(), - nqp1D = tensors1D->HQshape1D.Width(), - nqp = nqp1D * nqp1D * nqp1D, - nH1dof = nH1dof1D * nH1dof1D * nH1dof1D; - Array h1dofs, l2dofs; - - Vector e(nL2dof1D * nL2dof1D * nL2dof1D); - DenseMatrix E(e.GetData(), nL2dof1D*nL2dof1D, nL2dof1D); - - DenseMatrix HH_Q(nH1dof1D * nH1dof1D, nqp1D), - H_HQ(HH_Q.GetData(), nH1dof1D, nH1dof1D*nqp1D), - Q_HQ(nqp1D, nH1dof1D*nqp1D); - DenseMatrix LL_Q(nL2dof1D * nL2dof1D, nqp1D), - L_LQ(LL_Q.GetData(), nL2dof1D, nL2dof1D*nqp1D), - Q_LQ(nqp1D, nL2dof1D*nqp1D); - DenseMatrix QQ_Q(nqp1D * nqp1D, nqp1D), QQ_Qc(nqp1D * nqp1D, nqp1D); - double *qqq = QQ_Q.GetData(), *qqqc = QQ_Qc.GetData(); - DenseMatrix HHHx(nH1dof1D * nH1dof1D, nH1dof1D), - HHHy(nH1dof1D * nH1dof1D, nH1dof1D), - HHHz(nH1dof1D * nH1dof1D, nH1dof1D); - - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); - - vecH1 = 0.0; - for (int z = 0; z < nzones; z++) - { - // Note that the local numbering for L2 is the tensor numbering. - L2FESpace.GetElementDofs(z, l2dofs); - vecL2.GetSubVector(l2dofs, e); - - // LLQ_j1_j2_k3 = E_j1_j2_j3 LQs_j3_k3 -- contract in z direction. - // QLQ_k1_j2_k3 = LQs_j1_k1 LLQ_j1_j2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QLQ_k1_j2_k3 LQs_j2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(E, tensors1D->LQshape1D, LL_Q); - MultAtB(tensors1D->LQshape1D, L_LQ, Q_LQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(ly,y,L1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int j2 = 0; j2 < nL2dof1D; j2++) + double u = 0.0; + for (int lx = 0; lx < L1D; ++lx) { - QQ_Q(k1 + nqp1D*k2, k3) += - Q_LQ(k1, j2 + k3*nL2dof1D) * tensors1D->LQshape1D(j2, k2); + u += B[qx][lx] * E[lx][ly][lz]; } + MMQ0[lz][ly][qx] = u; } } } - - // Iterate over the components (x, y, z) of the result. - for (int c = 0; c < 3; c++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) { - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,0) -- stress scaling d[v_c]_dx. - double *d = quad_data->stressJinvT(c).GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] = qqq[q] * d[q]; } - - // QHQ_k1_i2_k3 = QQQc_k1_k2_k3 HQs_i2_k2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // HHQ_i1_i2_k3 = HQg_i1_k1 QHQ_k1_i2_k3 -- gradients in x direction. - // HHHx_i1_i2_i3 = HHQ_i1_i2_k3 HQs_i3_k3 -- contract in z direction. - for (int k1 = 0; k1 < nqp1D; k1++) - { - for (int i2 = 0; i2 < nH1dof1D; i2++) - { - for (int k3 = 0; k3 < nqp1D; k3++) - { - Q_HQ(k1, i2 + nH1dof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) - { - Q_HQ(k1, i2 + nH1dof1D*k3) += - QQ_Qc(k1 + nqp1D*k2, k3) * tensors1D->HQshape1D(i2, k2); - } - } - } - } - mfem::Mult(tensors1D->HQgrad1D, Q_HQ, H_HQ); - MultABt(HH_Q, tensors1D->HQshape1D, HHHx); - - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,1) -- stress scaling d[v_c]_dy. - d = quad_data->stressJinvT(c).GetData() + 1*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] = qqq[q] * d[q]; } - - // QHQ_k1_i2_k3 = QQQc_k1_k2_k3 HQg_i2_k2 -- gradients in y direction. - // The first step does some reordering (it's not product of matrices). 
- // HHQ_i1_i2_k3 = HQs_i1_k1 QHQ_k1_i2_k3 -- contract in x direction. - // HHHy_i1_i2_i3 = HHQ_i1_i2_k3 HQs_i3_k3 -- contract in z direction. - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int i2 = 0; i2 < nH1dof1D; i2++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + double u = 0.0; + for (int ly = 0; ly < L1D; ++ly) { - Q_HQ(k1, i2 + nH1dof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) - { - Q_HQ(k1, i2 + nH1dof1D*k3) += - QQ_Qc(k1 + nqp1D*k2, k3) * tensors1D->HQgrad1D(i2, k2); - } + u += B[qy][ly] * MMQ0[lz][ly][qx]; } + MQQ0[lz][qy][qx] = u; } } - mfem::Mult(tensors1D->HQshape1D, Q_HQ, H_HQ); - MultABt(HH_Q, tensors1D->HQshape1D, HHHy); - - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,2) -- stress scaling d[v_c]_dz. - d = quad_data->stressJinvT(c).GetData() + 2*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] = qqq[q] * d[q]; } - - // QHQ_k1_i2_k3 = QQQc_k1_k2_k3 HQg_i2_k2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // HHQ_i1_i2_k3 = HQs_i1_k1 QHQ_k1_i2_k3 -- contract in x direction. - // HHHz_i1_i2_i3 = HHQ_i1_i2_k3 HQs_i3_k3 -- gradients in z direction. - for (int k1 = 0; k1 < nqp1D; k1++) + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int i2 = 0; i2 < nH1dof1D; i2++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + double u = 0.0; + for (int lz = 0; lz < L1D; ++lz) { - Q_HQ(k1, i2 + nH1dof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) - { - Q_HQ(k1, i2 + nH1dof1D*k3) += - QQ_Qc(k1 + nqp1D*k2, k3) * tensors1D->HQshape1D(i2, k2); - } + u += B[qz][lz] * MQQ0[lz][qy][qx]; } + QQQ[qz][qy][qx] = u; } } - mfem::Mult(tensors1D->HQshape1D, Q_HQ, H_HQ); - MultABt(HH_Q, tensors1D->HQgrad1D, HHHz); - - // Set the c-component of the result. 
- H1FESpace.GetElementVDofs(z, h1dofs); - for (int i1 = 0; i1 < nH1dof1D; i1++) + } + MFEM_SYNC_THREAD; + for (int c = 0; c < 3; ++c) + { + MFEM_FOREACH_THREAD(qz,z,Q1D) { - for (int i2 = 0; i2 < nH1dof1D; i2++) + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int i3 = 0; i3 < nH1dof1D; i3++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - // Transfer from the mfem's H1 local numbering to the tensor - // structure numbering. - const int idx = i3*nH1dof1D*nH1dof1D + i2*nH1dof1D + i1; - vecH1[h1dofs[c*nH1dof + dof_map[idx]]] += - HHHx(i1 + i2*nH1dof1D, i3) + - HHHy(i1 + i2*nH1dof1D, i3) + - HHHz(i1 + i2*nH1dof1D, i3); + const double esx = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,0,c); + const double esy = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,1,c); + const double esz = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,2,c); + QQQ0[qz][qy][qx] = esx; + QQQ1[qz][qy][qx] = esy; + QQQ2[qz][qy][qx] = esz; } } } - } - } -} - -// Transpose force matrix action on quadrilateral elements in 2D. -void ForcePAOperator::MultTransposeQuad(const Vector &vecH1, - Vector &vecL2) const -{ - const int nH1dof1D = tensors1D->HQshape1D.Height(), - nL2dof1D = tensors1D->LQshape1D.Height(), - nqp1D = tensors1D->HQshape1D.Width(), - nqp = nqp1D * nqp1D, - nH1dof = nH1dof1D * nH1dof1D; - Array h1dofs, l2dofs; - Vector v(nH1dof * 2), e(nL2dof1D * nL2dof1D); - DenseMatrix V, E(e.GetData(), nL2dof1D, nL2dof1D); - DenseMatrix HQ(nH1dof1D, nqp1D), LQ(nL2dof1D, nqp1D), - QQc(nqp1D, nqp1D), QQ(nqp1D, nqp1D); - double *qqc = QQc.GetData(); - - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); - - for (int z = 0; z < nzones; z++) - { - H1FESpace.GetElementVDofs(z, h1dofs); - - // Form (stress:grad_v) at all quadrature points. - QQ = 0.0; - for (int c = 0; c < 2; c++) - { - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. 
- for (int j = 0; j < nH1dof; j++) - { - v[c*nH1dof + j] = vecH1[h1dofs[c*nH1dof + dof_map[j]]]; - } - // Connect to [v_c], i.e., the c-component of v. - V.UseExternalData(v.GetData() + c*nH1dof, nH1dof1D, nH1dof1D); - - // HQ_i2_k1 = V_i1_i2 HQg_i1_k1 -- gradients in x direction. - // QQc_k1_k2 = HQ_i2_k1 HQs_i2_k2 -- contract in y direction. - // QQc_k1_k2 *= stress_k1_k2(c,0) -- stress that scales d[v_c]_dx. - MultAtB(V, tensors1D->HQgrad1D, HQ); - MultAtB(HQ, tensors1D->HQshape1D, QQc); - double *d = quad_data->stressJinvT(c).GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { qqc[q] *= d[q]; } - // Add the (stress(c,0) * d[v_c]_dx) part of (stress:grad_v). - QQ += QQc; - - // HQ_i2_k1 = V_i1_i2 HQs_i1_k1 -- contract in x direction. - // QQc_k1_k2 = HQ_i2_k1 HQg_i2_k2 -- gradients in y direction. - // QQc_k1_k2 *= stress_k1_k2(c,1) -- stress that scales d[v_c]_dy. - MultAtB(V, tensors1D->HQshape1D, HQ); - MultAtB(HQ, tensors1D->HQgrad1D, QQc); - d = quad_data->stressJinvT(c).GetData() + 1*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { qqc[q] *= d[q]; } - // Add the (stress(c,1) * d[v_c]_dy) part of (stress:grad_v). - QQ += QQc; - } - - // LQ_j1_k2 = LQs_j1_k1 QQ_k1_k2 -- contract in x direction. - // E_j1_j2 = LQ_j1_k2 LQs_j2_k2 -- contract in y direction. - mfem::Mult(tensors1D->LQshape1D, QQ, LQ); - MultABt(LQ, tensors1D->LQshape1D, E); - - L2FESpace.GetElementDofs(z, l2dofs); - vecL2.SetSubVector(l2dofs, e); - } -} - -// Transpose force matrix action on hexahedral elements in 3D. 
-void ForcePAOperator::MultTransposeHex(const Vector &vecH1, Vector &vecL2) const -{ - const int nH1dof1D = tensors1D->HQshape1D.Height(), - nL2dof1D = tensors1D->LQshape1D.Height(), - nqp1D = tensors1D->HQshape1D.Width(), - nqp = nqp1D * nqp1D * nqp1D, - nH1dof = nH1dof1D * nH1dof1D * nH1dof1D; - Array h1dofs, l2dofs; - - Vector v(nH1dof * 3), e(nL2dof1D * nL2dof1D * nL2dof1D); - DenseMatrix V, E(e.GetData(), nL2dof1D * nL2dof1D, nL2dof1D); - - DenseMatrix HH_Q(nH1dof1D * nH1dof1D, nqp1D), - H_HQ(HH_Q.GetData(), nH1dof1D, nH1dof1D * nqp1D), - Q_HQ(nqp1D, nH1dof1D*nqp1D); - DenseMatrix LL_Q(nL2dof1D * nL2dof1D, nqp1D), - L_LQ(LL_Q.GetData(), nL2dof1D, nL2dof1D * nqp1D), - Q_LQ(nqp1D, nL2dof1D*nqp1D); - DenseMatrix QQ_Q(nqp1D * nqp1D, nqp1D), QQ_Qc(nqp1D * nqp1D, nqp1D); - double *qqqc = QQ_Qc.GetData(); - - const TensorBasisElement *fe = - dynamic_cast(H1FESpace.GetFE(0)); - const Array &dof_map = fe->GetDofMap(); - - for (int z = 0; z < nzones; z++) - { - H1FESpace.GetElementVDofs(z, h1dofs); - - // Form (stress:grad_v) at all quadrature points. - QQ_Q = 0.0; - for (int c = 0; c < 3; c++) - { - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. - for (int j = 0; j < nH1dof; j++) - { - v[c*nH1dof + j] = vecH1[h1dofs[c*nH1dof + dof_map[j]]]; - } - // Connect to [v_c], i.e., the c-component of v. - V.UseExternalData(v.GetData() + c*nH1dof, nH1dof1D*nH1dof1D, nH1dof1D); - - // HHQ_i1_i2_k3 = V_i1_i2_i3 HQs_i3_k3 -- contract in z direction. - // QHQ_k1_i2_k3 = HQg_i1_k1 HHQ_i1_i2_k3 -- gradients in x direction. - // QQQc_k1_k2_k3 = QHQ_k1_i2_k3 HQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(V, tensors1D->HQshape1D, HH_Q); - MultAtB(tensors1D->HQgrad1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(hx,x,D1D) { - QQ_Qc(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qx = 0; qx < Q1D; ++qx) { - QQ_Qc(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQshape1D(i2, k2); + u += Gt[hx][qx] * QQQ0[qz][qy][qx]; + v += Bt[hx][qx] * QQQ1[qz][qy][qx]; + w += Bt[hx][qx] * QQQ2[qz][qy][qx]; } + MQQ0[hx][qy][qz] = u; + MQQ1[hx][qy][qz] = v; + MQQ2[hx][qy][qz] = w; } } } - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,0) -- stress scaling d[v_c]_dx. - double *d = quad_data->stressJinvT(c).GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] *= d[q]; } - // Add the (stress(c,0) * d[v_c]_dx) part of (stress:grad_v). - QQ_Q += QQ_Qc; - - // HHQ_i1_i2_k3 = V_i1_i2_i3 HQs_i3_k3 -- contract in z direction. - // QHQ_k1_i2_k3 = HQs_i1_k1 HHQ_i1_i2_k3 -- contract in x direction. - // QQQc_k1_k2_k3 = QHQ_k1_i2_k3 HQg_i2_k2 -- gradients in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(V, tensors1D->HQshape1D, HH_Q); - MultAtB(tensors1D->HQshape1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(hy,y,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(hx,x,D1D) { - QQ_Qc(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qy = 0; qy < Q1D; ++qy) { - QQ_Qc(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQgrad1D(i2, k2); + u += MQQ0[hx][qy][qz] * Bt[hy][qy]; + v += MQQ1[hx][qy][qz] * Gt[hy][qy]; + w += MQQ2[hx][qy][qz] * Bt[hy][qy]; } + MMQ0[hx][hy][qz] = u; + MMQ1[hx][hy][qz] = v; + MMQ2[hx][hy][qz] = w; } } } - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,1) -- stress scaling d[v_c]_dy. - d = quad_data->stressJinvT(c).GetData() + 1*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] *= d[q]; } - // Add the (stress(c,1) * d[v_c]_dy) part of (stress:grad_v). - QQ_Q += QQ_Qc; - - // HHQ_i1_i2_k3 = V_i1_i2_i3 HQg_i3_k3 -- gradients in z direction. - // QHQ_k1_i2_k3 = HQs_i1_k1 HHQ_i1_i2_k3 -- contract in x direction. - // QQQc_k1_k2_k3 = QHQ_k1_i2_k3 HQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). 
- mfem::Mult(V, tensors1D->HQgrad1D, HH_Q); - MultAtB(tensors1D->HQshape1D, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(hz,z,D1D) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(hy,y,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(hx,x,D1D) { - QQ_Qc(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < nH1dof1D; i2++) + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qz = 0; qz < Q1D; ++qz) { - QQ_Qc(k1 + nqp1D*k2, k3) += Q_HQ(k1, i2 + k3*nH1dof1D) * - tensors1D->HQshape1D(i2, k2); + u += MMQ0[hx][hy][qz] * Bt[hz][qz]; + v += MMQ1[hx][hy][qz] * Bt[hz][qz]; + w += MMQ2[hx][hy][qz] * Gt[hz][qz]; } + velocity(hx,hy,hz,c,e) = u + v + w; } } } - // QQQc_k1_k2_k3 *= stress_k1_k2_k3(c,2) -- stress scaling d[v_c]_dz. - d = quad_data->stressJinvT(c).GetData() + 2*nzones*nqp + z*nqp; - for (int q = 0; q < nqp; q++) { qqqc[q] *= d[q]; } - // Add the (stress(c,2) * d[v_c]_dz) part of (stress:grad_v). - QQ_Q += QQ_Qc; + MFEM_SYNC_THREAD; } - - // QLQ_k1_j2_k3 = QQQ_k1_k2_k3 LQs_j2_k2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // LLQ_j1_j2_k3 = LQs_j1_k1 QLQ_k1_j2_k3 -- contract in x direction. - // E_j1_j2_i3 = LLQ_j1_j2_k3 LQs_j3_k3 -- contract in z direction. 
- for (int k1 = 0; k1 < nqp1D; k1++) + for (int c = 0; c < 3; ++c) { - for (int j2 = 0; j2 < nL2dof1D; j2++) + MFEM_FOREACH_THREAD(hz,z,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(hy,y,D1D) { - Q_LQ(k1, j2 + nL2dof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(hx,x,D1D) { - Q_LQ(k1, j2 + nL2dof1D*k3) += - QQ_Q(k1 + nqp1D*k2, k3) * tensors1D->LQshape1D(j2, k2); + const double v = velocity(hx,hy,hz,c,e); + if (fabs(v) < eps2) + { + velocity(hx,hy,hz,c,e) = 0.0; + } } } } + MFEM_SYNC_THREAD; } - mfem::Mult(tensors1D->LQshape1D, Q_LQ, L_LQ); - MultABt(LL_Q, tensors1D->LQshape1D, E); - - L2FESpace.GetElementDofs(z, l2dofs); - vecL2.SetSubVector(l2dofs, e); - } + }); } -void MassPAOperator::ComputeDiagonal2D(Vector &diag) const +typedef void (*fForceMult)(const int E, + const Array &B, + const Array &Bt, + const Array &Gt, + const DenseTensor &stressJinvT, + const Vector &X, Vector &Y); + +static void ForceMult(const int DIM, const int D1D, const int Q1D, + const int L1D, const int H1D, const int NE, + const Array &B, + const Array &Bt, + const Array &Gt, + const DenseTensor &stressJinvT, + const Vector &e, + Vector &v) { - const TensorBasisElement *fe_H1 = - dynamic_cast(FESpace.GetFE(0)); - const Array &dof_map = fe_H1->GetDofMap(); - const DenseMatrix &HQs = tensors1D->HQshape1D; - - const int ndof1D = HQs.Height(), nqp1D = HQs.Width(), nqp = nqp1D * nqp1D; - Vector dz(ndof1D * ndof1D); - DenseMatrix HQ(ndof1D, nqp1D), D(dz.GetData(), ndof1D, ndof1D); - Array dofs; - - diag.SetSize(height); - diag = 0.0; - - // Squares of the shape functions at all quadrature points. 
- DenseMatrix HQs_sq(ndof1D, nqp1D); - for (int i = 0; i < ndof1D; i++) + MFEM_VERIFY(D1D==H1D, "D1D!=H1D"); + MFEM_VERIFY(L1D==D1D-1,"L1D!=D1D-1"); + const int id = ((DIM)<<8)|(D1D)<<4|(Q1D); + static std::unordered_map call = { - for (int k = 0; k < nqp1D; k++) - { - HQs_sq(i, k) = HQs(i, k) * HQs(i, k); - } - } - - for (int z = 0; z < nzones; z++) + // 2D + {0x234,&ForceMult2D<2,3,4,2>}, + {0x246,&ForceMult2D<2,4,6,3>}, + {0x258,&ForceMult2D<2,5,8,4>}, + // 3D + {0x334,&ForceMult3D<3,3,4,2>}, + {0x346,&ForceMult3D<3,4,6,3>}, + {0x358,&ForceMult3D<3,5,8,4>}, + }; + if (!call[id]) { - DenseMatrix QQ(quad_data->rho0DetJ0w.GetData() + z*nqp, nqp1D, nqp1D); - - // HQ_i1_k2 = HQs_i1_k1^2 QQ_k1_k2 -- contract in x direction. - // Y_i1_i2 = HQ_i1_k2 HQs_i2_k2^2 -- contract in y direction. - mfem::Mult(HQs_sq, QQ, HQ); - MultABt(HQ, HQs_sq, D); - - // Transfer from the tensor structure numbering to mfem's H1 numbering. - FESpace.GetElementDofs(z, dofs); - for (int j = 0; j < dz.Size(); j++) - { - diag[dofs[dof_map[j]]] += dz[j]; - } + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown kernel"); } + call[id](NE, B, Bt, Gt, stressJinvT, e, v); +} - for (int i = 0; i < height / 2; i++) - { - diag(i + height / 2) = diag(i); - } +void ForcePAOperator::Mult(const Vector &x, Vector &y) const +{ + if (L2R) { L2R->Mult(x, X); } + else { X = x; } + ForceMult(dim, D1D, Q1D, L1D, D1D, NE, + L2D2Q->B, H1D2Q->Bt, H1D2Q->Gt, + qdata.stressJinvT, X, Y); + H1R->MultTranspose(Y, y); } -void MassPAOperator::ComputeDiagonal3D(Vector &diag) const +template static +void ForceMultTranspose2D(const int NE, + const Array &Bt_, + const Array &B_, + const Array &G_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) { - const TensorBasisElement *fe_H1 = - dynamic_cast(FESpace.GetFE(0)); - const Array &dof_map = fe_H1->GetDofMap(); - const DenseMatrix &HQs = tensors1D->HQshape1D; + auto b = Reshape(B_.Read(), Q1D, D1D); + auto g = Reshape(G_.Read(), Q1D, 
D1D); + auto bt = Reshape(Bt_.Read(), L1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, NE, DIM, DIM); + auto velocity = Reshape(x.Read(), D1D, D1D, DIM, NE); + auto energy = Reshape(y.Write(), L1D, L1D, NE); + + MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ, + { + const int z = MFEM_THREAD_ID(z); - const int ndof1D = HQs.Height(), nqp1D = HQs.Width(), - nqp = nqp1D * nqp1D * nqp1D; - DenseMatrix HH_Q(ndof1D * ndof1D, nqp1D), Q_HQ(nqp1D, ndof1D*nqp1D); - DenseMatrix H_HQ(HH_Q.GetData(), ndof1D, ndof1D*nqp1D); - Vector dz(ndof1D * ndof1D * ndof1D); - DenseMatrix D(dz.GetData(), ndof1D*ndof1D, ndof1D); - Array dofs; + MFEM_SHARED double Bt[L1D][Q1D]; + MFEM_SHARED double B[Q1D][D1D]; + MFEM_SHARED double G[Q1D][D1D]; - diag.SetSize(height); - diag = 0.0; + MFEM_SHARED double Vz[NBZ][D1D*D1D]; + double (*V)[D1D] = (double (*)[D1D])(Vz + z); - // Squares of the shape functions at all quadrature points. - DenseMatrix HQs_sq(ndof1D, nqp1D); - for (int i = 0; i < ndof1D; i++) - { - for (int k = 0; k < nqp1D; k++) + MFEM_SHARED double DQz[DIM][NBZ][D1D*Q1D]; + double (*DQ0)[Q1D] = (double (*)[Q1D])(DQz[0] + z); + double (*DQ1)[Q1D] = (double (*)[Q1D])(DQz[1] + z); + + MFEM_SHARED double QQz[3][NBZ][Q1D*Q1D]; + double (*QQ)[Q1D] = (double (*)[Q1D])(QQz[0] + z); + double (*QQ0)[Q1D] = (double (*)[Q1D])(QQz[1] + z); + double (*QQ1)[Q1D] = (double (*)[Q1D])(QQz[2] + z); + + MFEM_SHARED double QLz[NBZ][Q1D*L1D]; + double (*QL)[L1D] = (double (*)[L1D]) (QLz + z); + + if (z == 0) { - HQs_sq(i, k) = HQs(i, k) * HQs(i, k); + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(h,y,Q1D) + { + if (h < D1D) { B[q][h] = b(q,h); } + if (h < D1D) { G[q][h] = g(q,h); } + const int l = h; + if (l < L1D) { Bt[l][q] = bt(l,q); } + } + } } - } - - for (int z = 0; z < nzones; z++) - { - DenseMatrix QQ_Q(quad_data->rho0DetJ0w.GetData() + z*nqp, - nqp1D * nqp1D, nqp1D); + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + 
{ + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + QQ[qy][qx] = 0.0; + } + } + MFEM_SYNC_THREAD; - // QHQ_k1_i2_k3 = QQQ_k1_k2_k3 HQs_i2_k2^2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // HHQ_i1_i2_k3 = HQs_i1_k1^2 QHQ_k1_i2_k3 -- contract in x direction. - // D_i1_i2_i3 = HHQ_i1_i2_k3 HQs_i3_k3^2 -- contract in z direction. - for (int k1 = 0; k1 < nqp1D; k1++) + for (int c = 0; c < DIM; ++c) { - for (int i2 = 0; i2 < ndof1D; i2++) + + MFEM_FOREACH_THREAD(dx,x,D1D) + { + MFEM_FOREACH_THREAD(dy,y,D1D) + { + V[dx][dy] = velocity(dx,dy,c,e); + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + for (int dx = 0; dx < D1D; ++dx) + { + const double input = V[dx][dy]; + u += B[qx][dx] * input; + v += G[qx][dx] * input; + } + DQ0[dy][qx] = u; + DQ1[dy][qx] = v; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - Q_HQ(k1, i2 + ndof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) + double u = 0.0; + double v = 0.0; + for (int dy = 0; dy < D1D; ++dy) { - Q_HQ(k1, i2 + ndof1D*k3) += - QQ_Q(k1 + nqp1D*k2, k3) * HQs_sq(i2, k2); + u += DQ1[dy][qx] * B[qy][dy]; + v += DQ0[dy][qx] * G[qy][dy]; } + QQ0[qy][qx] = u; + QQ1[qy][qx] = v; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQ0[qy][qx] * sJit(qx,qy,e,0,c); + const double esy = QQ1[qy][qx] * sJit(qx,qy,e,1,c); + QQ[qy][qx] += esx + esy; } } + MFEM_SYNC_THREAD; } - mfem::Mult(HQs_sq, Q_HQ, H_HQ); - MultABt(HH_Q, HQs_sq, D); + MFEM_SYNC_THREAD; - // Transfer from the tensor structure numbering to mfem's H1 numbering. 
- FESpace.GetElementDofs(z, dofs); - for (int j = 0; j < dz.Size(); j++) + MFEM_FOREACH_THREAD(qy,y,Q1D) { - diag[dofs[dof_map[j]]] += dz[j]; + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qx = 0; qx < Q1D; ++qx) + { + u += QQ[qy][qx] * Bt[lx][qx]; + } + QL[qy][lx] = u; + } } - } - - for (int i = 0; i < height / 3; i++) - { - diag(i + height / 3) = diag(i); - diag(i + 2 * height / 3) = diag(i); - } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += QL[qy][lx] * Bt[ly][qy]; + } + energy(lx,ly,e) = u; + } + } + MFEM_SYNC_THREAD; + }); } -void MassPAOperator::Mult(const Vector &x, Vector &y) const +template static +void ForceMultTranspose3D(const int NE, + const Array &Bt_, + const Array &B_, + const Array &G_, + const DenseTensor &sJit_, + const Vector &v_, + Vector &e_) { - const int comp_size = FESpace.GetNDofs(); - for (int c = 0; c < dim; c++) + auto b = Reshape(B_.Read(), Q1D, D1D); + auto g = Reshape(G_.Read(), Q1D, D1D); + auto bt = Reshape(Bt_.Read(), L1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, Q1D, NE, DIM, DIM); + auto velocity = Reshape(v_.Read(), D1D, D1D, D1D, DIM, NE); + auto energy = Reshape(e_.Write(), L1D, L1D, L1D, NE); + + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, { - Vector x_comp(x.GetData() + c * comp_size, comp_size), - y_comp(y.GetData() + c * comp_size, comp_size); - if (dim == 2) { MultQuad(x_comp, y_comp); } - else if (dim == 3) { MultHex(x_comp, y_comp); } - else { MFEM_ABORT("Unsupported dimension"); } - } -} - -// Mass matrix action on quadrilateral elements in 2D. 
-void MassPAOperator::MultQuad(const Vector &x, Vector &y) const -{ - const TensorBasisElement *fe_H1 = - dynamic_cast(FESpace.GetFE(0)); - const DenseMatrix &HQs = tensors1D->HQshape1D; + const int z = MFEM_THREAD_ID(z); - const int ndof1D = HQs.Height(), nqp1D = HQs.Width(); - DenseMatrix HQ(ndof1D, nqp1D), QQ(nqp1D, nqp1D); - Vector xz(ndof1D * ndof1D), yz(ndof1D * ndof1D); - DenseMatrix X(xz.GetData(), ndof1D, ndof1D), - Y(yz.GetData(), ndof1D, ndof1D); - Array dofs; - double *qq = QQ.GetData(); - const int nqp = nqp1D * nqp1D; + MFEM_SHARED double Bt[L1D][Q1D]; + MFEM_SHARED double B[Q1D][D1D]; + MFEM_SHARED double G[Q1D][D1D]; - y.SetSize(x.Size()); - y = 0.0; - - for (int z = 0; z < nzones; z++) - { - FESpace.GetElementDofs(z, dofs); - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. - const Array &dof_map = fe_H1->GetDofMap(); - for (int j = 0; j < xz.Size(); j++) - { - xz[j] = x[dofs[dof_map[j]]]; - } + MFEM_SHARED double sm0[3][Q1D*Q1D*Q1D]; + MFEM_SHARED double sm1[3][Q1D*Q1D*Q1D]; + double (*V)[D1D][D1D] = (double (*)[D1D][D1D]) (sm0+0); + double (*MMQ0)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+1); + double (*MMQ1)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+2); - // HQ_i1_k2 = X_i1_i2 HQs_i2_k2 -- contract in y direction. - // QQ_k1_k2 = HQs_i1_k1 HQ_i1_k2 -- contract in x direction. - mfem::Mult(X, HQs, HQ); - MultAtB(HQs, HQ, QQ); + double (*MQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+0); + double (*MQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+1); + double (*MQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+2); - // QQ_k1_k2 *= quad_data_k1_k2 -- scaling with quadrature values. 
- double *d = quad_data->rho0DetJ0w.GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { qq[q] *= d[q]; } + double (*QQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+0); + double (*QQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+1); + double (*QQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+2); - // HQ_i1_k2 = HQs_i1_k1 QQ_k1_k2 -- contract in x direction. - // Y_i1_i2 = HQ_i1_k2 HQs_i2_k2 -- contract in y direction. - mfem::Mult(HQs, QQ, HQ); - MultABt(HQ, HQs, Y); + MFEM_SHARED double QQQ[Q1D][Q1D][Q1D]; - for (int j = 0; j < yz.Size(); j++) + if (z == 0) { - y[dofs[dof_map[j]]] += yz[j]; + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(h,y,Q1D) + { + if (h < D1D) { B[q][h] = b(q,h); } + if (h < D1D) { G[q][h] = g(q,h); } + const int l = h; + if (l < L1D) { Bt[l][q] = bt(l,q); } + } + } } - } -} - -// Mass matrix action on hexahedral elements in 3D. -void MassPAOperator::MultHex(const Vector &x, Vector &y) const -{ - const TensorBasisElement *fe_H1 = - dynamic_cast(FESpace.GetFE(0)); - const DenseMatrix &HQs = tensors1D->HQshape1D; - - const int ndof1D = HQs.Height(), nqp1D = HQs.Width(); - DenseMatrix HH_Q(ndof1D * ndof1D, nqp1D); - DenseMatrix H_HQ(HH_Q.GetData(), ndof1D, ndof1D*nqp1D); - DenseMatrix Q_HQ(nqp1D, ndof1D*nqp1D); - DenseMatrix QQ_Q(nqp1D*nqp1D, nqp1D); - double *qqq = QQ_Q.GetData(); - Vector xz(ndof1D * ndof1D * ndof1D), yz(ndof1D * ndof1D * ndof1D); - DenseMatrix X(xz.GetData(), ndof1D*ndof1D, ndof1D), - Y(yz.GetData(), ndof1D*ndof1D, ndof1D); - const int nqp = nqp1D * nqp1D * nqp1D; - Array dofs; - - y.SetSize(x.Size()); - y = 0.0; - - for (int z = 0; z < nzones; z++) - { - FESpace.GetElementDofs(z, dofs); - // Transfer from the mfem's H1 local numbering to the tensor structure - // numbering. 
- const Array &dof_map = fe_H1->GetDofMap(); - for (int j = 0; j < xz.Size(); j++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) { - xz[j] = x[dofs[dof_map[j]]]; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + QQQ[qz][qy][qx] = 0.0; + } + } } + MFEM_SYNC_THREAD; - // HHQ_i1_i2_k3 = X_i1_i2_i3 HQs_i3_k3 -- contract in z direction. - // QHQ_k1_i2_k3 = HQs_i1_k1 HHQ_i1_i2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QHQ_k1_i2_k3 HQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). - mfem::Mult(X, HQs, HH_Q); - MultAtB(HQs, H_HQ, Q_HQ); - for (int k1 = 0; k1 < nqp1D; k1++) + for (int c = 0; c < DIM; ++c) { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(dx,x,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(dy,y,D1D) { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < ndof1D; i2++) + MFEM_FOREACH_THREAD(dz,z,D1D) { - QQ_Q(k1 + nqp1D*k2, k3) += - Q_HQ(k1, i2 + k3*ndof1D) * HQs(i2, k2); + V[dx][dy][dz] = velocity(dx,dy,dz,c,e); } } } - } - - // QQQ_k1_k2_k3 *= quad_data_k1_k2_k3 -- scaling with quadrature values. - double *d = quad_data->rho0DetJ0w.GetData() + z*nqp; - for (int q = 0; q < nqp; q++) { qqq[q] *= d[q]; } - - // QHQ_k1_i2_k3 = QQQ_k1_k2_k3 HQs_i2_k2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // HHQ_i1_i2_k3 = HQs_i1_k1 QHQ_k1_i2_k3 -- contract in x direction. - // Y_i1_i2_i3 = HHQ_i1_i2_k3 HQs_i3_k3 -- contract in z direction. 
- for (int k1 = 0; k1 < nqp1D; k1++) - { - for (int i2 = 0; i2 < ndof1D; i2++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dz,z,D1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(dy,y,D1D) { - Q_HQ(k1, i2 + ndof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(qx,x,Q1D) { - Q_HQ(k1, i2 + ndof1D*k3) += - QQ_Q(k1 + nqp1D*k2, k3) * HQs(i2, k2); + double u = 0.0; + double v = 0.0; + for (int dx = 0; dx < D1D; ++dx) + { + const double input = V[dx][dy][dz]; + u += G[qx][dx] * input; + v += B[qx][dx] * input; + } + MMQ0[dz][dy][qx] = u; + MMQ1[dz][dy][qx] = v; } } } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dz,z,D1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int dy = 0; dy < D1D; ++dy) + { + u += MMQ0[dz][dy][qx] * B[qy][dy]; + v += MMQ1[dz][dy][qx] * G[qy][dy]; + w += MMQ1[dz][dy][qx] * B[qy][dy]; + } + MQQ0[dz][qy][qx] = u; + MQQ1[dz][qy][qx] = v; + MQQ2[dz][qy][qx] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int dz = 0; dz < D1D; ++dz) + { + u += MQQ0[dz][qy][qx] * B[qz][dz]; + v += MQQ1[dz][qy][qx] * B[qz][dz]; + w += MQQ2[dz][qy][qx] * G[qz][dz]; + } + QQQ0[qz][qy][qx] = u; + QQQ1[qz][qy][qx] = v; + QQQ2[qz][qy][qx] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQQ0[qz][qy][qx] * sJit(qx,qy,qz,e,0,c); + const double esy = QQQ1[qz][qy][qx] * sJit(qx,qy,qz,e,1,c); + const double esz = QQQ2[qz][qy][qx] * sJit(qx,qy,qz,e,2,c); + QQQ[qz][qy][qx] += esx + esy + esz; + } + } + } + MFEM_SYNC_THREAD; } - mfem::Mult(HQs, Q_HQ, H_HQ); - MultABt(HH_Q, HQs, Y); - - for (int j = 0; j < yz.Size(); j++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) { - 
y[dofs[dof_map[j]]] += yz[j]; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qx = 0; qx < Q1D; ++qx) + { + u += QQQ[qz][qy][qx] * Bt[lx][qx]; + } + MQQ0[qz][qy][lx] = u; + } + } } - } -} - -void LocalMassPAOperator::Mult(const Vector &x, Vector &y) const -{ - if (dim == 2) { MultQuad(x, y); } - else if (dim == 3) { MultHex(x, y); } - else { MFEM_ABORT("Unsupported dimension"); } -} - -// L2 mass matrix action on a single quadrilateral element in 2D. -void LocalMassPAOperator::MultQuad(const Vector &x, Vector &y) const -{ - const DenseMatrix &LQs = tensors1D->LQshape1D; - - y.SetSize(x.Size()); - y = 0.0; - - const int ndof1D = LQs.Height(), nqp1D = LQs.Width(); - DenseMatrix LQ(ndof1D, nqp1D), QQ(nqp1D, nqp1D); - DenseMatrix X(x.GetData(), ndof1D, ndof1D), Y(y.GetData(), ndof1D, ndof1D); - double *qq = QQ.GetData(); - const int nqp = nqp1D * nqp1D; - - // LQ_i1_k2 = X_i1_i2 LQs_i2_k2 -- contract in y direction. - // QQ_k1_k2 = LQs_i1_k1 LQ_i1_k2 -- contract in x direction. - mfem::Mult(X, LQs, LQ); - MultAtB(LQs, LQ, QQ); - - // QQ_k1_k2 *= quad_data_k1_k2 -- scaling with quadrature values. - const double *d = quad_data->rho0DetJ0w.GetData() + zone_id*nqp; - for (int q = 0; q < nqp; q++) { qq[q] *= d[q]; } - - // LQ_i1_k2 = LQs_i1_k1 QQ_k1_k2 -- contract in x direction. - // Y_i1_i2 = LQ_i1_k2 LQs_i2_k2 -- contract in y direction. - mfem::Mult(LQs, QQ, LQ); - MultABt(LQ, LQs, Y); -} - -// L2 mass matrix action on a single hexahedral element in 3D. 
-void LocalMassPAOperator::MultHex(const Vector &x, Vector &y) const -{ - const DenseMatrix &LQs = tensors1D->LQshape1D; - - y.SetSize(x.Size()); - y = 0.0; - - const int ndof1D = LQs.Height(), nqp1D = LQs.Width(); - DenseMatrix LL_Q(ndof1D * ndof1D, nqp1D); - DenseMatrix L_LQ(LL_Q.GetData(), ndof1D, ndof1D*nqp1D); - DenseMatrix Q_LQ(nqp1D, ndof1D*nqp1D); - DenseMatrix QQ_Q(nqp1D*nqp1D, nqp1D); - double *qqq = QQ_Q.GetData(); - DenseMatrix X(x.GetData(), ndof1D*ndof1D, ndof1D), - Y(y.GetData(), ndof1D*ndof1D, ndof1D); - const int nqp = nqp1D * nqp1D * nqp1D; - - // LLQ_i1_i2_k3 = X_i1_i2_i3 LQs_i3_k3 -- contract in z direction. - // QLQ_k1_i2_k3 = LQs_i1_k1 LLQ_i1_i2_k3 -- contract in x direction. - // QQQ_k1_k2_k3 = QLQ_k1_i2_k3 LQs_i2_k2 -- contract in y direction. - // The last step does some reordering (it's not product of matrices). - mfem::Mult(X, LQs, LL_Q); - MultAtB(LQs, L_LQ, Q_LQ); - for (int k1 = 0; k1 < nqp1D; k1++) - { - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(ly,y,L1D) { - QQ_Q(k1 + nqp1D*k2, k3) = 0.0; - for (int i2 = 0; i2 < ndof1D; i2++) + MFEM_FOREACH_THREAD(lx,x,L1D) { - QQ_Q(k1 + nqp1D*k2, k3) += - Q_LQ(k1, i2 + k3*ndof1D) * LQs(i2, k2); + double u = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += MQQ0[qz][qy][lx] * Bt[ly][qy]; + } + MMQ0[qz][ly][lx] = u; } } } - } - - // QQQ_k1_k2_k3 *= quad_data_k1_k2_k3 -- scaling with quadrature values. - double *d = quad_data->rho0DetJ0w.GetData() + zone_id*nqp; - for (int q = 0; q < nqp; q++) { qqq[q] *= d[q]; } - - // QLQ_k1_i2_k3 = QQQ_k1_k2_k3 LQs_i2_k2 -- contract in y direction. - // The first step does some reordering (it's not product of matrices). - // LLQ_i1_i2_k3 = LQs_i1_k1 QLQ_k1_i2_k3 -- contract in x direction. - // Y_i1_i2_i3 = LLQ_i1_i2_k3 DQs_i3_k3 -- contract in z direction. 
- for (int k1 = 0; k1 < nqp1D; k1++) - { - for (int i2 = 0; i2 < ndof1D; i2++) + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) { - for (int k3 = 0; k3 < nqp1D; k3++) + MFEM_FOREACH_THREAD(ly,y,L1D) { - Q_LQ(k1, i2 + ndof1D*k3) = 0.0; - for (int k2 = 0; k2 < nqp1D; k2++) + MFEM_FOREACH_THREAD(lx,x,L1D) { - Q_LQ(k1, i2 + ndof1D*k3) += - QQ_Q(k1 + nqp1D*k2, k3) * LQs(i2, k2); + double u = 0.0; + for (int qz = 0; qz < Q1D; ++qz) + { + u += MMQ0[qz][ly][lx] * Bt[lz][qz]; + } + energy(lx,ly,lz,e) = u; } } } + MFEM_SYNC_THREAD; + }); +} + +typedef void (*fForceMultTranspose)(const int NE, + const Array &Bt, + const Array &B, + const Array &G, + const DenseTensor &sJit, + const Vector &X, Vector &Y); + +static void ForceMultTranspose(const int DIM, const int D1D, const int Q1D, + const int L1D, const int NE, + const Array &L2Bt, + const Array &H1B, + const Array &H1G, + const DenseTensor &stressJinvT, + const Vector &v, + Vector &e) +{ + // DIM, D1D, Q1D, L1D(=D1D-1) + MFEM_VERIFY(L1D==D1D-1, "L1D!=D1D-1"); + const int id = ((DIM)<<8)|(D1D)<<4|(Q1D); + static std::unordered_map call = + { + {0x234,&ForceMultTranspose2D<2,3,4,2>}, + {0x246,&ForceMultTranspose2D<2,4,6,3>}, + {0x258,&ForceMultTranspose2D<2,5,8,4>}, + {0x334,&ForceMultTranspose3D<3,3,4,2>}, + {0x346,&ForceMultTranspose3D<3,4,6,3>}, + {0x358,&ForceMultTranspose3D<3,5,8,4>} + }; + if (!call[id]) + { + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown kernel"); } - mfem::Mult(LQs, Q_LQ, L_LQ); - MultABt(LL_Q, LQs, Y); + call[id](NE, L2Bt, H1B, H1G, stressJinvT, v, e); +} + +void ForcePAOperator::MultTranspose(const Vector &x, Vector &y) const +{ + H1R->Mult(x, Y); + ForceMultTranspose(dim, D1D, Q1D, L1D, NE, + L2D2Q->Bt, H1D2Q->B, H1D2Q->G, + qdata.stressJinvT, Y, X); + if (L2R) { L2R->MultTranspose(X, y); } + else { y = X; } } } // namespace hydrodynamics diff --git a/laghos_assembly.hpp b/laghos_assembly.hpp index b5bfce91..14e313b2 100644 --- a/laghos_assembly.hpp +++ 
b/laghos_assembly.hpp @@ -18,6 +18,8 @@ #define MFEM_LAGHOS_ASSEMBLY #include "mfem.hpp" +#include "general/forall.hpp" +#include "linalg/dtensor.hpp" namespace mfem { @@ -28,63 +30,35 @@ namespace hydrodynamics // Container for all data needed at quadrature points. struct QuadratureData { - // TODO: use QuadratureFunctions? - - // Reference to physical Jacobian for the initial mesh. These are computed - // only at time zero and stored here. + // Reference to physical Jacobian for the initial mesh. + // These are computed only at time zero and stored here. DenseTensor Jac0inv; - // Quadrature data used for full/partial assembly of the force operator. At - // each quadrature point, it combines the stress, inverse Jacobian, - // determinant of the Jacobian and the integration weight. It must be - // recomputed in every time step. + // Quadrature data used for full/partial assembly of the force operator. + // At each quadrature point, it combines the stress, inverse Jacobian, + // determinant of the Jacobian and the integration weight. + // It must be recomputed in every time step. DenseTensor stressJinvT; - // Quadrature data used for full/partial assembly of the mass matrices. At - // time zero, we compute and store (rho0 * det(J0) * qp_weight) at each + // Quadrature data used for full/partial assembly of the mass matrices. + // At time zero, we compute and store (rho0 * det(J0) * qp_weight) at each // quadrature point. Note the at any other time, we can compute // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass // conservation. Vector rho0DetJ0w; - // Initial length scale. This represents a notion of local mesh size. We - // assume that all initial zones have similar size. + // Initial length scale. This represents a notion of local mesh size. + // We assume that all initial zones have similar size. double h0; // Estimate of the minimum time step over all quadrature points. 
This is // recomputed at every time step to achieve adaptive time stepping. double dt_est; - QuadratureData(int dim, int nzones, int quads_per_zone) - : Jac0inv(dim, dim, nzones * quads_per_zone), - stressJinvT(nzones * quads_per_zone, dim, dim), - rho0DetJ0w(nzones * quads_per_zone) { } -}; - -// Stores values of the one-dimensional shape functions and gradients at all 1D -// quadrature points. All sizes are (dofs1D_cnt x quads1D_cnt). -struct Tensors1D -{ - // H1 shape functions and gradients, L2 shape functions. - DenseMatrix HQshape1D, HQgrad1D, LQshape1D; - - Tensors1D(int H1order, int L2order, int nqp1D, bool bernstein_v); -}; - -class FastEvaluator -{ - const int dim; - FiniteElementSpace &H1FESpace; - Tensors1D *tensors1D; - -public: - FastEvaluator(FiniteElementSpace &h1fes, Tensors1D *t1D) - : dim(h1fes.GetMesh()->Dimension()), H1FESpace(h1fes), tensors1D(t1D) { } - - void GetL2Values(const Vector &vecL2, Vector &vecQP) const; - // The input vec is an H1 function with dim components, over a zone. - // The output is J_ij = d(vec_i) / d(x_j) with ij = 1 .. dim. - void GetVectorGrad(const DenseMatrix &vec, DenseTensor &J) const; + QuadratureData(int dim, int NE, int quads_per_el) + : Jac0inv(dim, dim, NE * quads_per_el), + stressJinvT(NE * quads_per_el, dim, dim), + rho0DetJ0w(NE * quads_per_el) { } }; // This class is used only for visualization. It assembles (rho, phi) in each @@ -92,157 +66,67 @@ class FastEvaluator // projection of the density. class DensityIntegrator : public LinearFormIntegrator { + using LinearFormIntegrator::AssembleRHSElementVect; private: - const QuadratureData &quad_data; + const QuadratureData &qdata; public: - DensityIntegrator(QuadratureData &quad_data_) : quad_data(quad_data_) { } - + DensityIntegrator(QuadratureData &qdata) : qdata(qdata) { } virtual void AssembleRHSElementVect(const FiniteElement &fe, ElementTransformation &Tr, Vector &elvect); }; -// Assembles element contributions to the global force matrix. 
This class is -// used for the full assembly case; it's not used with partial assembly. +// Performs full assembly for the force operator. class ForceIntegrator : public BilinearFormIntegrator { private: - const QuadratureData &quad_data; - + const QuadratureData &qdata; public: - ForceIntegrator(QuadratureData &quad_data_) : quad_data(quad_data_) { } - + ForceIntegrator(QuadratureData &qdata) : qdata(qdata) { } virtual void AssembleElementMatrix2(const FiniteElement &trial_fe, const FiniteElement &test_fe, - ElementTransformation &Trans, + ElementTransformation &Tr, DenseMatrix &elmat); }; -// Performs partial assembly, which corresponds to (and replaces) the use of the -// LagrangianHydroOperator::Force global matrix. +// Performs partial assembly for the force operator. class ForcePAOperator : public Operator { private: - const int dim, nzones; - - QuadratureData *quad_data; - FiniteElementSpace &H1FESpace, &L2FESpace; - Tensors1D *tensors1D; - - // Force matrix action on quadrilateral elements in 2D. - void MultQuad(const Vector &vecL2, Vector &vecH1) const; - // Force matrix action on hexahedral elements in 3D. - void MultHex(const Vector &vecL2, Vector &vecH1) const; - - // Transpose force matrix action on quadrilateral elements in 2D. - void MultTransposeQuad(const Vector &vecH1, Vector &vecL2) const; - // Transpose force matrix action on hexahedral elements in 3D. 
- void MultTransposeHex(const Vector &vecH1, Vector &vecL2) const; - + const int dim, NE; + const QuadratureData &qdata; + const ParFiniteElementSpace &H1, &L2; + const Operator *H1R, *L2R; + const IntegrationRule &ir1D; + const int D1D, Q1D, L1D, H1sz, L2sz; + const DofToQuad *L2D2Q, *H1D2Q; + mutable Vector X, Y; public: - ForcePAOperator(QuadratureData *quad_data_, - FiniteElementSpace &h1fes, FiniteElementSpace &l2fes, - Tensors1D *t1D) - : dim(h1fes.GetMesh()->Dimension()), nzones(h1fes.GetMesh()->GetNE()), - quad_data(quad_data_), H1FESpace(h1fes), L2FESpace(l2fes), - tensors1D(t1D) { } - - virtual void Mult(const Vector &vecL2, Vector &vecH1) const; - virtual void MultTranspose(const Vector &vecH1, Vector &vecL2) const; - - ~ForcePAOperator() { } + ForcePAOperator(const QuadratureData&, + ParFiniteElementSpace&, + ParFiniteElementSpace&, + const IntegrationRule&); + virtual void Mult(const Vector&, Vector&) const; + virtual void MultTranspose(const Vector&, Vector&) const; }; // Performs partial assembly for the velocity mass matrix. class MassPAOperator : public Operator { private: - const int dim, nzones; - - QuadratureData *quad_data; - FiniteElementSpace &FESpace; - Tensors1D *tensors1D; - - // Mass matrix action on quadrilateral elements in 2D. - void MultQuad(const Vector &x, Vector &y) const; - // Mass matrix action on hexahedral elements in 3D. - void MultHex(const Vector &x, Vector &y) const; - -public: - MassPAOperator(QuadratureData *quad_data_, FiniteElementSpace &fes, - Tensors1D *t1D) - : Operator(fes.GetVSize()), - dim(fes.GetMesh()->Dimension()), nzones(fes.GetMesh()->GetNE()), - quad_data(quad_data_), FESpace(fes), tensors1D(t1D) { } - - // Mass matrix action. 
- virtual void Mult(const Vector &x, Vector &y) const; - - void ComputeDiagonal2D(Vector &diag) const; - void ComputeDiagonal3D(Vector &diag) const; - - virtual const Operator *GetProlongation() const - { return FESpace.GetProlongationMatrix(); } - virtual const Operator *GetRestriction() const - { return FESpace.GetRestrictionMatrix(); } -}; - -// Scales by the inverse diagonal of the MassPAOperator. -class DiagonalSolver : public Solver -{ -private: - Vector diag; - FiniteElementSpace &FESpace; - -public: - DiagonalSolver(FiniteElementSpace &fes) - : Solver(fes.GetVSize()), diag(), FESpace(fes) { } - - void SetDiagonal(Vector &d) - { - const Operator *P = FESpace.GetProlongationMatrix(); - - // Happens when this is called by the serial version of Laghos. - if (P == NULL) { diag = d; return; } - - diag.SetSize(P->Width()); - P->MultTranspose(d, diag); - } - - virtual void Mult(const Vector &x, Vector &y) const - { - for (int i = 0; i < x.Size(); i++) { y(i) = x(i) / diag(i); } - } - virtual void SetOperator(const Operator &op) { } -}; - -// Performs partial assembly for the energy mass matrix on a single zone. -// Used to perform local CG solves, thus avoiding unnecessary communication. -class LocalMassPAOperator : public Operator -{ -private: - const int dim; - int zone_id; - - QuadratureData *quad_data; - Tensors1D *tensors1D; - - // Mass matrix action on a quadrilateral element in 2D. - void MultQuad(const Vector &x, Vector &y) const; - // Mass matrix action on a hexahedral element in 3D. 
- void MultHex(const Vector &x, Vector &y) const; - + const MPI_Comm comm; + const int dim, NE, vsize; + ParBilinearForm pabf; + int ess_tdofs_count; + Array ess_tdofs; + OperatorPtr mass; public: - LocalMassPAOperator(QuadratureData *quad_data_, FiniteElementSpace &fes, - Tensors1D *t1D) - : Operator(fes.GetFE(0)->GetDof()), - dim(fes.GetMesh()->Dimension()), zone_id(0), - quad_data(quad_data_), tensors1D(t1D) { } - - void SetZoneId(int zid) { zone_id = zid; } - - virtual void Mult(const Vector &x, Vector &y) const; + MassPAOperator(ParFiniteElementSpace&, const IntegrationRule&, Coefficient&); + virtual void Mult(const Vector&, Vector&) const; + virtual void SetEssentialTrueDofs(Array&); + virtual void EliminateRHS(Vector&) const; + const ParBilinearForm &GetBF() const { return pabf; } }; } // namespace hydrodynamics diff --git a/laghos_solver.cpp b/laghos_solver.cpp index 4ab392e3..4e0cf7f1 100644 --- a/laghos_solver.cpp +++ b/laghos_solver.cpp @@ -14,12 +14,13 @@ // software, applications, hardware, advanced system engineering and early // testbed platforms, in support of the nation's exascale computing imperative. +#include "general/forall.hpp" #include "laghos_solver.hpp" +#include "linalg/kernels.hpp" +#include #ifdef MFEM_USE_MPI -using namespace std; - namespace mfem { @@ -30,6 +31,7 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, ParGridFunction &gf, const char *title, int x, int y, int w, int h, bool vec) { + gf.HostRead(); ParMesh &pmesh = *gf.ParFESpace()->GetParMesh(); MPI_Comm comm = pmesh.GetComm(); @@ -59,14 +61,14 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, if (myid == 0 && newly_opened) { const char* keys = (gf.FESpace()->GetMesh()->Dimension() == 2) - ? "mAcRjlPPPPPPPP" : "maaAcl"; + ? 
"mAcRjlPPPPPPPP" : "mmaaAcl"; sock << "window_title '" << title << "'\n" << "window_geometry " << x << " " << y << " " << w << " " << h << "\n" << "keys " << keys; if ( vec ) { sock << "vvv"; } - sock << endl; + sock << std::endl; } if (myid == 0) @@ -78,130 +80,295 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, while (connection_failed); } -LagrangianHydroOperator::LagrangianHydroOperator(int size, - ParFiniteElementSpace &h1_fes, - ParFiniteElementSpace &l2_fes, - Array &essential_tdofs, - ParGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, - bool visc, bool pa, - double cgt, int cgiter, - double ftz, - int h1_basis_type) - : TimeDependentOperator(size), - H1FESpace(h1_fes), L2FESpace(l2_fes), - ess_tdofs(essential_tdofs), - dim(h1_fes.GetMesh()->Dimension()), - nzones(h1_fes.GetMesh()->GetNE()), - l2dofs_cnt(l2_fes.GetFE(0)->GetDof()), - h1dofs_cnt(h1_fes.GetFE(0)->GetDof()), - source_type(source_type_), cfl(cfl_), - use_viscosity(visc), p_assembly(pa), cg_rel_tol(cgt), cg_max_iter(cgiter), - ftz_tol(ftz), - material_pcf(material_), - Mv(&h1_fes), Mv_spmat_copy(), - Me(l2dofs_cnt, l2dofs_cnt, nzones), Me_inv(l2dofs_cnt, l2dofs_cnt, nzones), - integ_rule(IntRules.Get(h1_fes.GetMesh()->GetElementBaseGeometry(0), - 3*h1_fes.GetOrder(0) + l2_fes.GetOrder(0) - 1)), - quad_data(dim, nzones, integ_rule.GetNPoints()), - quad_data_is_current(false), forcemat_is_assembled(false), - tensors1D(H1FESpace.GetFE(0)->GetOrder(), L2FESpace.GetFE(0)->GetOrder(), - int(floor(0.7 + pow(integ_rule.GetNPoints(), 1.0 / dim))), - h1_basis_type == BasisType::Positive), - evaluator(H1FESpace, &tensors1D), - Force(&l2_fes, &h1_fes), ForcePA(&quad_data, h1_fes, l2_fes, &tensors1D), - VMassPA(&quad_data, H1FESpace, &tensors1D), VMassPA_prec(H1FESpace), - locEMassPA(&quad_data, l2_fes, &tensors1D), - locCG(), timer() +static void Rho0DetJ0Vol(const int dim, const int NE, + const IntegrationRule &ir, + ParMesh *pmesh, + 
ParFiniteElementSpace &L2, + const ParGridFunction &rho0, + QuadratureData &qdata, + double &volume) { + const int NQ = ir.GetNPoints(); + const int Q1D = IntRules.Get(Geometry::SEGMENT,ir.GetOrder()).GetNPoints(); + const int flags = GeometricFactors::JACOBIANS|GeometricFactors::DETERMINANTS; + const GeometricFactors *geom = pmesh->GetGeometricFactors(ir, flags); + Vector rho0Q(NQ*NE); + rho0Q.UseDevice(true); + Vector j, detj; + const QuadratureInterpolator *qi = L2.GetQuadratureInterpolator(ir); + qi->Mult(rho0, QuadratureInterpolator::VALUES, rho0Q, j, detj); + auto W = ir.GetWeights().Read(); + auto R = Reshape(rho0Q.Read(), NQ, NE); + auto J = Reshape(geom->J.Read(), NQ, dim, dim, NE); + auto detJ = Reshape(geom->detJ.Read(), NQ, NE); + auto V = Reshape(qdata.rho0DetJ0w.Write(), NQ, NE); + Memory &Jinv_m = qdata.Jac0inv.GetMemory(); + const MemoryClass mc = Device::GetMemoryClass(); + const int Ji_total_size = qdata.Jac0inv.TotalSize(); + auto invJ = Reshape(Jinv_m.Write(mc, Ji_total_size), dim, dim, NQ, NE); + Vector vol(NE*NQ), one(NE*NQ); + auto A = Reshape(vol.Write(), NQ, NE); + auto O = Reshape(one.Write(), NQ, NE); + MFEM_ASSERT(dim==2 || dim==3, ""); + if (dim==2) + { + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const int q = qx + qy * Q1D; + const double J11 = J(q,0,0,e); + const double J12 = J(q,1,0,e); + const double J21 = J(q,0,1,e); + const double J22 = J(q,1,1,e); + const double det = detJ(q,e); + V(q,e) = W[q] * R(q,e) * det; + const double r_idetJ = 1.0 / det; + invJ(0,0,q,e) = J22 * r_idetJ; + invJ(1,0,q,e) = -J12 * r_idetJ; + invJ(0,1,q,e) = -J21 * r_idetJ; + invJ(1,1,q,e) = J11 * r_idetJ; + A(q,e) = W[q] * det; + O(q,e) = 1.0; + } + } + }); + } + else + { + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const int q = qx + (qy + qz * Q1D) * Q1D; + const double J11 = 
J(q,0,0,e), J12 = J(q,0,1,e), J13 = J(q,0,2,e); + const double J21 = J(q,1,0,e), J22 = J(q,1,1,e), J23 = J(q,1,2,e); + const double J31 = J(q,2,0,e), J32 = J(q,2,1,e), J33 = J(q,2,2,e); + const double det = detJ(q,e); + V(q,e) = W[q] * R(q,e) * det; + const double r_idetJ = 1.0 / det; + invJ(0,0,q,e) = r_idetJ * ((J22 * J33)-(J23 * J32)); + invJ(1,0,q,e) = r_idetJ * ((J32 * J13)-(J33 * J12)); + invJ(2,0,q,e) = r_idetJ * ((J12 * J23)-(J13 * J22)); + invJ(0,1,q,e) = r_idetJ * ((J23 * J31)-(J21 * J33)); + invJ(1,1,q,e) = r_idetJ * ((J33 * J11)-(J31 * J13)); + invJ(2,1,q,e) = r_idetJ * ((J13 * J21)-(J11 * J23)); + invJ(0,2,q,e) = r_idetJ * ((J21 * J32)-(J22 * J31)); + invJ(1,2,q,e) = r_idetJ * ((J31 * J12)-(J32 * J11)); + invJ(2,2,q,e) = r_idetJ * ((J11 * J22)-(J12 * J21)); + A(q,e) = W[q] * det; + O(q,e) = 1.0; + } + } + } + }); + } + qdata.rho0DetJ0w.HostRead(); + volume = vol * one; +} - GridFunctionCoefficient rho_coeff(&rho0); +LagrangianHydroOperator::LagrangianHydroOperator(const int size, + ParFiniteElementSpace &h1, + ParFiniteElementSpace &l2, + const Array &ess_tdofs, + Coefficient &rho0_coeff, + ParGridFunction &rho0_gf, + Coefficient &gamma_coeff, + ParGridFunction &gamma_gf, + const int source, + const double cfl, + const bool visc, + const bool p_assembly, + const double cgt, + const int cgiter, + double ftz, + const int oq) : + TimeDependentOperator(size), + H1(h1), L2(l2), H1c(H1.GetParMesh(), H1.FEColl(), 1), + pmesh(H1.GetParMesh()), + H1Vsize(H1.GetVSize()), + H1TVSize(H1.TrueVSize()), + H1GTVSize(H1.GlobalTrueVSize()), + L2Vsize(L2.GetVSize()), + L2TVSize(L2.TrueVSize()), + L2GTVSize(L2.GlobalTrueVSize()), + block_offsets(4), + x_gf(&H1), + ess_tdofs(ess_tdofs), + dim(pmesh->Dimension()), + NE(pmesh->GetNE()), + l2dofs_cnt(L2.GetFE(0)->GetDof()), + h1dofs_cnt(H1.GetFE(0)->GetDof()), + source_type(source), cfl(cfl), + use_viscosity(visc), + p_assembly(p_assembly), + cg_rel_tol(cgt), cg_max_iter(cgiter),ftz_tol(ftz), + gamma_coeff(gamma_coeff), + 
gamma_gf(gamma_gf), + Mv(&H1), Mv_spmat_copy(), + Me(l2dofs_cnt, l2dofs_cnt, NE), + Me_inv(l2dofs_cnt, l2dofs_cnt, NE), + ir(IntRules.Get(pmesh->GetElementBaseGeometry(0), + (oq > 0) ? oq : 3 * H1.GetOrder(0) + L2.GetOrder(0) - 1)), + Q1D(int(floor(0.7 + pow(ir.GetNPoints(), 1.0 / dim)))), + qdata(dim, NE, ir.GetNPoints()), + qdata_is_current(false), + forcemat_is_assembled(false), + Force(&L2, &H1), + ForcePA(nullptr), VMassPA(nullptr), EMassPA(nullptr), + VMassPA_Jprec(nullptr), + CG_VMass(H1.GetParMesh()->GetComm()), + CG_EMass(L2.GetParMesh()->GetComm()), + timer(p_assembly ? L2TVSize : 1), + qupdate(dim, NE, Q1D, visc, cfl, &timer, gamma_gf, ir, H1, L2), + X(H1c.GetTrueVSize()), + B(H1c.GetTrueVSize()), + one(L2Vsize), + rhs(H1Vsize), + e_rhs(L2Vsize), + rhs_c_gf(&H1c), + dvc_gf(&H1c) +{ + block_offsets[0] = 0; + block_offsets[1] = block_offsets[0] + H1Vsize; + block_offsets[2] = block_offsets[1] + H1Vsize; + block_offsets[3] = block_offsets[2] + L2Vsize; + one.UseDevice(true); + one = 1.0; - // Standard local assembly and inversion for energy mass matrices. - MassIntegrator mi(rho_coeff, &integ_rule); - for (int i = 0; i < nzones; i++) + if (p_assembly) { - DenseMatrixInverse inv(&Me(i)); - mi.AssembleElementMatrix(*l2_fes.GetFE(i), - *l2_fes.GetElementTransformation(i), Me(i)); - inv.Factor(); - inv.GetInverseMatrix(Me_inv(i)); + ForcePA = new ForcePAOperator(qdata, H1, L2, ir); + VMassPA = new MassPAOperator(H1c, ir, rho0_coeff); + EMassPA = new MassPAOperator(L2, ir, rho0_coeff); + // Inside the above constructors for mass, there is reordering of the mesh + // nodes which is performed on the host. Since the mesh nodes are a + // subvector, so we need to sync with the rest of the base vector (which + // is assumed to be in the memory space used by the mfem::Device). + H1.GetParMesh()->GetNodes()->ReadWrite(); + // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., + // we must enforce v_x/y/z = 0 for the velocity components. 
+ const int bdr_attr_max = H1.GetMesh()->bdr_attributes.Max(); + Array ess_bdr(bdr_attr_max); + for (int c = 0; c < dim; c++) + { + ess_bdr = 0; + ess_bdr[c] = 1; + H1c.GetEssentialTrueDofs(ess_bdr, c_tdofs[c]); + c_tdofs[c].Read(); + } + X.UseDevice(true); + B.UseDevice(true); + rhs.UseDevice(true); + e_rhs.UseDevice(true); + } + else + { + // Standard local assembly and inversion for energy mass matrices. + // 'Me' is used in the computation of the internal energy + // which is used twice: once at the start and once at the end of the run. + MassIntegrator mi(rho0_coeff, &ir); + for (int e = 0; e < NE; e++) + { + DenseMatrixInverse inv(&Me(e)); + const FiniteElement &fe = *L2.GetFE(e); + ElementTransformation &Tr = *L2.GetElementTransformation(e); + mi.AssembleElementMatrix(fe, Tr, Me(e)); + inv.Factor(); + inv.GetInverseMatrix(Me_inv(e)); + } + // Standard assembly for the velocity mass matrix. + VectorMassIntegrator *vmi = new VectorMassIntegrator(rho0_coeff, &ir); + Mv.AddDomainIntegrator(vmi); + Mv.Assemble(); + Mv_spmat_copy = Mv.SpMat(); } - - // Standard assembly for the velocity mass matrix. - VectorMassIntegrator *vmi = new VectorMassIntegrator(rho_coeff, &integ_rule); - Mv.AddDomainIntegrator(vmi); - Mv.Assemble(); - Mv_spmat_copy = Mv.SpMat(); // Values of rho0DetJ0 and Jac0inv at all quadrature points. - const int nqp = integ_rule.GetNPoints(); - Vector rho_vals(nqp); - for (int i = 0; i < nzones; i++) + // Initial local mesh size (assumes all mesh elements are the same). 
+ int Ne, ne = NE; + double Volume, vol = 0.0; + if (dim > 1) { Rho0DetJ0Vol(dim, NE, ir, pmesh, L2, rho0_gf, qdata, vol); } + else { - rho0.GetValues(i, integ_rule, rho_vals); - ElementTransformation *T = h1_fes.GetElementTransformation(i); - for (int q = 0; q < nqp; q++) + const int NQ = ir.GetNPoints(); + Vector rho_vals(NQ); + for (int e = 0; e < NE; e++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); - T->SetIntPoint(&ip); - - DenseMatrixInverse Jinv(T->Jacobian()); - Jinv.GetInverseMatrix(quad_data.Jac0inv(i*nqp + q)); - - const double rho0DetJ0 = T->Weight() * rho_vals(q); - quad_data.rho0DetJ0w(i*nqp + q) = rho0DetJ0 * - integ_rule.IntPoint(q).weight; + rho0_gf.GetValues(e, ir, rho_vals); + ElementTransformation &Tr = *H1.GetElementTransformation(e); + for (int q = 0; q < NQ; q++) + { + const IntegrationPoint &ip = ir.IntPoint(q); + Tr.SetIntPoint(&ip); + DenseMatrixInverse Jinv(Tr.Jacobian()); + Jinv.GetInverseMatrix(qdata.Jac0inv(e*NQ + q)); + const double rho0DetJ0 = Tr.Weight() * rho_vals(q); + qdata.rho0DetJ0w(e*NQ + q) = rho0DetJ0 * ir.IntPoint(q).weight; + } } + for (int e = 0; e < NE; e++) { vol += pmesh->GetElementVolume(e); } } - - // Initial local mesh size (assumes all mesh elements are of the same type). 
- double loc_area = 0.0, glob_area; - int loc_z_cnt = nzones, glob_z_cnt; - ParMesh *pm = H1FESpace.GetParMesh(); - for (int i = 0; i < nzones; i++) { loc_area += pm->GetElementVolume(i); } - MPI_Allreduce(&loc_area, &glob_area, 1, MPI_DOUBLE, MPI_SUM, pm->GetComm()); - MPI_Allreduce(&loc_z_cnt, &glob_z_cnt, 1, MPI_INT, MPI_SUM, pm->GetComm()); - switch (pm->GetElementBaseGeometry(0)) + MPI_Allreduce(&vol, &Volume, 1, MPI_DOUBLE, MPI_SUM, pmesh->GetComm()); + MPI_Allreduce(&ne, &Ne, 1, MPI_INT, MPI_SUM, pmesh->GetComm()); + switch (pmesh->GetElementBaseGeometry(0)) { - case Geometry::SEGMENT: - quad_data.h0 = glob_area / glob_z_cnt; break; - case Geometry::SQUARE: - quad_data.h0 = sqrt(glob_area / glob_z_cnt); break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * glob_area / glob_z_cnt); break; - case Geometry::CUBE: - quad_data.h0 = pow(glob_area / glob_z_cnt, 1.0/3.0); break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * glob_area / glob_z_cnt, 1.0/3.0); break; + case Geometry::SEGMENT: qdata.h0 = Volume / Ne; break; + case Geometry::SQUARE: qdata.h0 = sqrt(Volume / Ne); break; + case Geometry::TRIANGLE: qdata.h0 = sqrt(2.0 * Volume / Ne); break; + case Geometry::CUBE: qdata.h0 = pow(Volume / Ne, 1./3.); break; + case Geometry::TETRAHEDRON: qdata.h0 = pow(6.0 * Volume / Ne, 1./3.); break; default: MFEM_ABORT("Unknown zone type!"); } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); + qdata.h0 /= (double) H1.GetOrder(0); if (p_assembly) { // Setup the preconditioner of the velocity mass operator. - Vector d; - (dim == 2) ? VMassPA.ComputeDiagonal2D(d) : VMassPA.ComputeDiagonal3D(d); - VMassPA_prec.SetDiagonal(d); + // BC are handled by the VMassPA, so ess_tdofs here can be empty. 
+ Array ess_tdofs; + VMassPA_Jprec = new OperatorJacobiSmoother(VMassPA->GetBF(), ess_tdofs); + CG_VMass.SetPreconditioner(*VMassPA_Jprec); + + CG_VMass.SetOperator(*VMassPA); + CG_VMass.SetRelTol(cg_rel_tol); + CG_VMass.SetAbsTol(0.0); + CG_VMass.SetMaxIter(cg_max_iter); + CG_VMass.SetPrintLevel(-1); + + CG_EMass.SetOperator(*EMassPA); + CG_EMass.iterative_mode = false; + CG_EMass.SetRelTol(cg_rel_tol); + CG_EMass.SetAbsTol(0.0); + CG_EMass.SetMaxIter(cg_max_iter); + CG_EMass.SetPrintLevel(-1); } else { - ForceIntegrator *fi = new ForceIntegrator(quad_data); - fi->SetIntRule(&integ_rule); + ForceIntegrator *fi = new ForceIntegrator(qdata); + fi->SetIntRule(&ir); Force.AddDomainIntegrator(fi); // Make a dummy assembly to figure out the sparsity. Force.Assemble(0); Force.Finalize(0); } +} - locCG.SetOperator(locEMassPA); - locCG.iterative_mode = false; - locCG.SetRelTol(1e-8); - locCG.SetAbsTol(1e-8 * numeric_limits::epsilon()); - locCG.SetMaxIter(200); - locCG.SetPrintLevel(0); +LagrangianHydroOperator::~LagrangianHydroOperator() +{ + if (p_assembly) + { + delete EMassPA; + delete VMassPA; + delete VMassPA_Jprec; + delete ForcePA; + } } void LagrangianHydroOperator::Mult(const Vector &S, Vector &dS_dt) const @@ -210,23 +377,19 @@ void LagrangianHydroOperator::Mult(const Vector &S, Vector &dS_dt) const // needed only because some mfem time integrators don't update the solution // vector at every intermediate stage (hence they don't change the mesh). UpdateMesh(S); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). - Vector* sptr = (Vector*) &S; + Vector* sptr = const_cast(&S); ParGridFunction v; - const int VsizeH1 = H1FESpace.GetVSize(); - v.MakeRef(&H1FESpace, *sptr, VsizeH1); - + const int VsizeH1 = H1.GetVSize(); + v.MakeRef(&H1, *sptr, VsizeH1); // Set dx_dt = v (explicit). 
ParGridFunction dx; - dx.MakeRef(&H1FESpace, dS_dt, 0); + dx.MakeRef(&H1, dS_dt, 0); dx = v; - SolveVelocity(S, dS_dt); SolveEnergy(S, v, dS_dt); - - quad_data_is_current = false; + qdata_is_current = false; } void LagrangianHydroOperator::SolveVelocity(const Vector &S, @@ -234,48 +397,40 @@ void LagrangianHydroOperator::SolveVelocity(const Vector &S, { UpdateQuadratureData(S); AssembleForceMatrix(); - - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). ParGridFunction dv; - dv.MakeRef(&H1FESpace, dS_dt, VsizeH1); + dv.MakeRef(&H1, dS_dt, H1Vsize); dv = 0.0; - - Vector one(VsizeL2), rhs(VsizeH1), B, X; one = 1.0; if (p_assembly) { timer.sw_force.Start(); - ForcePA.Mult(one, rhs); - if (ftz_tol>0.0) - { - for (int i = 0; i < VsizeH1; i++) - { - if (fabs(rhs[i]) < ftz_tol) - { - rhs[i] = 0.0; - } - } - } + ForcePA->Mult(one, rhs); timer.sw_force.Stop(); rhs.Neg(); - Operator *cVMassPA; - VMassPA.FormLinearSystem(ess_tdofs, dv, rhs, cVMassPA, X, B); - CGSolver cg(H1FESpace.GetParMesh()->GetComm()); - cg.SetPreconditioner(VMassPA_prec); - cg.SetOperator(*cVMassPA); - cg.SetRelTol(cg_rel_tol); cg.SetAbsTol(0.0); - cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(0); - timer.sw_cgH1.Start(); - cg.Mult(B, X); - timer.sw_cgH1.Stop(); - timer.H1cg_iter += cg.GetNumIterations(); - VMassPA.RecoverFEMSolution(X, rhs, dv); - delete cVMassPA; + // Partial assembly solve for each velocity component + const int size = H1c.GetVSize(); + const Operator *Pconf = H1c.GetProlongationMatrix(); + for (int c = 0; c < dim; c++) + { + dvc_gf.MakeRef(&H1c, dS_dt, H1Vsize + c*size); + rhs_c_gf.MakeRef(&H1c, rhs, c*size); + if (Pconf) { Pconf->MultTranspose(rhs_c_gf, B); } + else { B = rhs_c_gf; } + H1c.GetRestrictionMatrix()->Mult(dvc_gf, X); + VMassPA->SetEssentialTrueDofs(c_tdofs[c]); + VMassPA->EliminateRHS(B); + timer.sw_cgH1.Start(); + 
CG_VMass.Mult(B, X); + timer.sw_cgH1.Stop(); + timer.H1iter += CG_VMass.GetNumIterations(); + if (Pconf) { Pconf->Mult(X, dvc_gf); } + else { dvc_gf = X; } + // We need to sync the subvector 'dvc_gf' with its base vector + // because it may have been moved to a different memory space. + dvc_gf.GetMemory().SyncAlias(dS_dt.GetMemory(), dvc_gf.Size()); + } } else { @@ -286,18 +441,19 @@ void LagrangianHydroOperator::SolveVelocity(const Vector &S, HypreParMatrix A; Mv.FormLinearSystem(ess_tdofs, dv, rhs, A, X, B); - CGSolver cg(H1FESpace.GetParMesh()->GetComm()); + CGSolver cg(H1.GetParMesh()->GetComm()); HypreSmoother prec; prec.SetType(HypreSmoother::Jacobi, 1); cg.SetPreconditioner(prec); cg.SetOperator(A); - cg.SetRelTol(cg_rel_tol); cg.SetAbsTol(0.0); + cg.SetRelTol(cg_rel_tol); + cg.SetAbsTol(0.0); cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(0); + cg.SetPrintLevel(-1); timer.sw_cgH1.Start(); cg.Mult(B, X); timer.sw_cgH1.Stop(); - timer.H1cg_iter += cg.GetNumIterations(); + timer.H1iter += cg.GetNumIterations(); Mv.RecoverFEMSolution(X, rhs, dv); } } @@ -308,60 +464,54 @@ void LagrangianHydroOperator::SolveEnergy(const Vector &S, const Vector &v, UpdateQuadratureData(S); AssembleForceMatrix(); - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). ParGridFunction de; - de.MakeRef(&L2FESpace, dS_dt, VsizeH1*2); + de.MakeRef(&L2, dS_dt, H1Vsize*2); de = 0.0; // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; + LinearForm *e_source = nullptr; if (source_type == 1) // 2D Taylor-Green. 
{ - e_source = new LinearForm(&L2FESpace); + e_source = new LinearForm(&L2); TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); + DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &ir); e_source->AddDomainIntegrator(d); e_source->Assemble(); } + Array l2dofs; - Vector e_rhs(VsizeL2), loc_rhs(l2dofs_cnt), loc_de(l2dofs_cnt); if (p_assembly) { timer.sw_force.Start(); - ForcePA.MultTranspose(v, e_rhs); + ForcePA->MultTranspose(v, e_rhs); timer.sw_force.Stop(); - if (e_source) { e_rhs += *e_source; } - for (int z = 0; z < nzones; z++) - { - L2FESpace.GetElementDofs(z, l2dofs); - e_rhs.GetSubVector(l2dofs, loc_rhs); - locEMassPA.SetZoneId(z); - timer.sw_cgL2.Start(); - locCG.Mult(loc_rhs, loc_de); - timer.sw_cgL2.Stop(); - timer.L2dof_iter += locCG.GetNumIterations() * l2dofs_cnt; - de.SetSubVector(l2dofs, loc_de); - } + timer.sw_cgL2.Start(); + CG_EMass.Mult(e_rhs, de); + timer.sw_cgL2.Stop(); + const HYPRE_Int cg_num_iter = CG_EMass.GetNumIterations(); + timer.L2iter += (cg_num_iter==0) ? 1 : cg_num_iter; + // Move the memory location of the subvector 'de' to the memory + // location of the base vector 'dS_dt'. 
+ de.GetMemory().SyncAlias(dS_dt.GetMemory(), de.Size()); } - else + else // not p_assembly { timer.sw_force.Start(); Force.MultTranspose(v, e_rhs); timer.sw_force.Stop(); if (e_source) { e_rhs += *e_source; } - for (int z = 0; z < nzones; z++) + Vector loc_rhs(l2dofs_cnt), loc_de(l2dofs_cnt); + for (int e = 0; e < NE; e++) { - L2FESpace.GetElementDofs(z, l2dofs); + L2.GetElementDofs(e, l2dofs); e_rhs.GetSubVector(l2dofs, loc_rhs); timer.sw_cgL2.Start(); - Me_inv(z).Mult(loc_rhs, loc_de); + Me_inv(e).Mult(loc_rhs, loc_de); timer.sw_cgL2.Stop(); - timer.L2dof_iter += l2dofs_cnt; + timer.L2iter += 1; de.SetSubVector(l2dofs, loc_de); } } @@ -370,129 +520,168 @@ void LagrangianHydroOperator::SolveEnergy(const Vector &S, const Vector &v, void LagrangianHydroOperator::UpdateMesh(const Vector &S) const { - Vector* sptr = (Vector*) &S; - x_gf.MakeRef(&H1FESpace, *sptr, 0); - H1FESpace.GetParMesh()->NewNodes(x_gf, false); + Vector* sptr = const_cast(&S); + x_gf.MakeRef(&H1, *sptr, 0); + H1.GetParMesh()->NewNodes(x_gf, false); } double LagrangianHydroOperator::GetTimeStepEstimate(const Vector &S) const { UpdateMesh(S); UpdateQuadratureData(S); - double glob_dt_est; - MPI_Allreduce(&quad_data.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, - H1FESpace.GetParMesh()->GetComm()); + const MPI_Comm comm = H1.GetParMesh()->GetComm(); + MPI_Allreduce(&qdata.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, comm); return glob_dt_est; } void LagrangianHydroOperator::ResetTimeStepEstimate() const { - quad_data.dt_est = numeric_limits::infinity(); + qdata.dt_est = std::numeric_limits::infinity(); } void LagrangianHydroOperator::ComputeDensity(ParGridFunction &rho) const { - rho.SetSpace(&L2FESpace); - + rho.SetSpace(&L2); DenseMatrix Mrho(l2dofs_cnt); Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); Array dofs(l2dofs_cnt); DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data); - di.SetIntRule(&integ_rule); - for (int i = 0; i < nzones; i++) + 
MassIntegrator mi(&ir); + DensityIntegrator di(qdata); + di.SetIntRule(&ir); + for (int e = 0; e < NE; e++) { - di.AssembleRHSElementVect(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), Mrho); + const FiniteElement &fe = *L2.GetFE(e); + ElementTransformation &eltr = *L2.GetElementTransformation(e); + di.AssembleRHSElementVect(fe, eltr, rhs); + mi.AssembleElementMatrix(fe, eltr, Mrho); inv.Factor(); inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(i, dofs); + L2.GetElementDofs(e, dofs); rho.SetSubVector(dofs, rho_z); } } -double LagrangianHydroOperator::InternalEnergy(const ParGridFunction &e) const +double LagrangianHydroOperator::InternalEnergy(const ParGridFunction &gf) const { - Vector one(l2dofs_cnt), loc_e(l2dofs_cnt); - one = 1.0; - Array l2dofs; - - double loc_ie = 0.0; - for (int z = 0; z < nzones; z++) + double glob_ie = 0.0; + // This should be turned into a kernel so that it could be displayed in pa + if (!p_assembly) { - L2FESpace.GetElementDofs(z, l2dofs); - e.GetSubVector(l2dofs, loc_e); - loc_ie += Me(z).InnerProduct(loc_e, one); + Vector one(l2dofs_cnt), loc_e(l2dofs_cnt); + one = 1.0; + Array l2dofs; + double loc_ie = 0.0; + for (int e = 0; e < NE; e++) + { + L2.GetElementDofs(e, l2dofs); + gf.GetSubVector(l2dofs, loc_e); + loc_ie += Me(e).InnerProduct(loc_e, one); + } + MPI_Comm comm = H1.GetParMesh()->GetComm(); + MPI_Allreduce(&loc_ie, &glob_ie, 1, MPI_DOUBLE, MPI_SUM, comm); } - - double glob_ie; - MPI_Allreduce(&loc_ie, &glob_ie, 1, MPI_DOUBLE, MPI_SUM, - H1FESpace.GetParMesh()->GetComm()); return glob_ie; } double LagrangianHydroOperator::KineticEnergy(const ParGridFunction &v) const { - double loc_ke = 0.5 * Mv_spmat_copy.InnerProduct(v, v); - - double glob_ke; - MPI_Allreduce(&loc_ke, &glob_ke, 1, MPI_DOUBLE, MPI_SUM, - H1FESpace.GetParMesh()->GetComm()); + double glob_ke = 0.0; + // This should be turned into a kernel so that it 
could be displayed in pa + if (!p_assembly) + { + double loc_ke = 0.5 * Mv_spmat_copy.InnerProduct(v, v); + MPI_Allreduce(&loc_ke, &glob_ke, 1, MPI_DOUBLE, MPI_SUM, + H1.GetParMesh()->GetComm()); + } return glob_ke; } -void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps) const +void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps, + const bool fom) const { - double my_rt[5], rt_max[5]; + const MPI_Comm com = H1.GetComm(); + double my_rt[5], T[5]; my_rt[0] = timer.sw_cgH1.RealTime(); my_rt[1] = timer.sw_cgL2.RealTime(); my_rt[2] = timer.sw_force.RealTime(); my_rt[3] = timer.sw_qdata.RealTime(); my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3]; - MPI_Reduce(my_rt, rt_max, 5, MPI_DOUBLE, MPI_MAX, 0, H1FESpace.GetComm()); + MPI_Reduce(my_rt, T, 5, MPI_DOUBLE, MPI_MAX, 0, com); - HYPRE_Int mydata[2], alldata[2]; - mydata[0] = timer.L2dof_iter; + HYPRE_Int mydata[3], alldata[3]; + mydata[0] = timer.L2dof * timer.L2iter; mydata[1] = timer.quad_tstep; - MPI_Reduce(mydata, alldata, 2, HYPRE_MPI_INT, MPI_SUM, 0, - H1FESpace.GetComm()); + mydata[2] = NE; + MPI_Reduce(mydata, alldata, 3, HYPRE_MPI_INT, MPI_SUM, 0, com); if (IamRoot) { - const HYPRE_Int H1gsize = H1FESpace.GlobalTrueVSize(), - L2gsize = L2FESpace.GlobalTrueVSize(); using namespace std; + // FOM = (FOM1 * T1 + FOM2 * T2 + FOM3 * T3) / (T1 + T2 + T3) + const HYPRE_Int H1iter = p_assembly ? 
(timer.H1iter/dim) : timer.H1iter; + const double FOM1 = 1e-6 * H1GTVSize * H1iter / T[0]; + const double FOM2 = 1e-6 * steps * (H1GTVSize + L2GTVSize) / T[2]; + const double FOM3 = 1e-6 * alldata[1] * ir.GetNPoints() / T[3]; + const double FOM = (FOM1 * T[0] + FOM2 * T[2] + FOM3 * T[3]) / T[4]; + const double FOM0 = 1e-6 * steps * (H1GTVSize + L2GTVSize) / T[4]; cout << endl; - cout << "CG (H1) total time: " << rt_max[0] << endl; + cout << "CG (H1) total time: " << T[0] << endl; cout << "CG (H1) rate (megadofs x cg_iterations / second): " - << 1e-6 * H1gsize * timer.H1cg_iter / rt_max[0] << endl; + << FOM1 << endl; cout << endl; - cout << "CG (L2) total time: " << rt_max[1] << endl; + cout << "CG (L2) total time: " << T[1] << endl; cout << "CG (L2) rate (megadofs x cg_iterations / second): " - << 1e-6 * alldata[0] / rt_max[1] << endl; + << 1e-6 * alldata[0] / T[1] << endl; cout << endl; - // The Force operator is applied twice per time step, on the H1 and the L2 - // vectors, respectively. 
- cout << "Forces total time: " << rt_max[2] << endl; + cout << "Forces total time: " << T[2] << endl; cout << "Forces rate (megadofs x timesteps / second): " - << 1e-6 * steps * (H1gsize + L2gsize) / rt_max[2] << endl; + << FOM2 << endl; cout << endl; - cout << "UpdateQuadData total time: " << rt_max[3] << endl; + cout << "UpdateQuadData total time: " << T[3] << endl; cout << "UpdateQuadData rate (megaquads x timesteps / second): " - << 1e-6 * alldata[1] * integ_rule.GetNPoints() / rt_max[3] << endl; + << FOM3 << endl; cout << endl; - cout << "Major kernels total time (seconds): " << rt_max[4] << endl; + cout << "Major kernels total time (seconds): " << T[4] << endl; cout << "Major kernels total rate (megadofs x time steps / second): " - << 1e-6 * steps * (H1gsize + L2gsize) / rt_max[4] << endl; + << FOM << endl; + if (!fom) { return; } + const int QPT = ir.GetNPoints(); + const HYPRE_Int GNZones = alldata[2]; + const long ndofs = 2*H1GTVSize + L2GTVSize + QPT*GNZones; + cout << endl; + cout << "| Ranks " << "| Zones " + << "| H1 dofs " << "| L2 dofs " + << "| QP " << "| N dofs " + << "| FOM0 " + << "| FOM1 " << "| T1 " + << "| FOM2 " << "| T2 " + << "| FOM3 " << "| T3 " + << "| FOM " << "| TT " + << "|" << endl; + cout << setprecision(3); + cout << "| " << setw(6) << H1.GetNRanks() + << "| " << setw(8) << GNZones + << "| " << setw(8) << H1GTVSize + << "| " << setw(8) << L2GTVSize + << "| " << setw(3) << QPT + << "| " << setw(9) << ndofs + << "| " << setw(7) << FOM0 + << "| " << setw(7) << FOM1 + << "| " << setw(5) << T[0] + << "| " << setw(7) << FOM2 + << "| " << setw(5) << T[2] + << "| " << setw(7) << FOM3 + << "| " << setw(5) << T[3] + << "| " << setw(7) << FOM + << "| " << setw(5) << T[4] + << "| " << endl; } } // Smooth transition between 0 and 1 for x in [-eps, eps]. 
-inline double smooth_step_01(double x, double eps) +MFEM_HOST_DEVICE inline double smooth_step_01(double x, double eps) { const double y = (x + eps) / (2.0 * eps); if (y < 0.0) { return 0.0; } @@ -502,28 +691,32 @@ inline double smooth_step_01(double x, double eps) void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const { - if (quad_data_is_current) { return; } - timer.sw_qdata.Start(); + if (qdata_is_current) { return; } + + qdata_is_current = true; + forcemat_is_assembled = false; - const int nqp = integ_rule.GetNPoints(); + if (dim > 1) { return qupdate.UpdateQuadratureData(S, qdata); } + // This code is only for the 1D/FA mode + timer.sw_qdata.Start(); + const int nqp = ir.GetNPoints(); ParGridFunction x, v, e; - Vector* sptr = (Vector*) &S; - x.MakeRef(&H1FESpace, *sptr, 0); - v.MakeRef(&H1FESpace, *sptr, H1FESpace.GetVSize()); - e.MakeRef(&L2FESpace, *sptr, 2*H1FESpace.GetVSize()); + Vector* sptr = const_cast(&S); + x.MakeRef(&H1, *sptr, 0); + v.MakeRef(&H1, *sptr, H1.GetVSize()); + e.MakeRef(&L2, *sptr, 2*H1.GetVSize()); Vector e_vals, e_loc(l2dofs_cnt), vector_vals(h1dofs_cnt * dim); DenseMatrix Jpi(dim), sgrad_v(dim), Jinv(dim), stress(dim), stressJiT(dim), vecvalMat(vector_vals.GetData(), h1dofs_cnt, dim); DenseTensor grad_v_ref(dim, dim, nqp); Array L2dofs, H1dofs; - // Batched computations are needed, because hydrodynamic codes usually // involve expensive computations of material properties. Although this // miniapp uses simple EOS equations, we still want to represent the batched // cycle structure. int nzones_batch = 3; - const int nbatches = nzones / nzones_batch + 1; // +1 for the remainder. + const int nbatches = NE / nzones_batch + 1; // +1 for the remainder. int nqp_batch = nqp * nzones_batch; double *gamma_b = new double[nqp_batch], *rho_b = new double[nqp_batch], @@ -537,66 +730,41 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const { int z_id = b * nzones_batch; // Global index over zones. 
// The last batch might not be full. - if (z_id == nzones) { break; } - else if (z_id + nzones_batch > nzones) + if (z_id == NE) { break; } + else if (z_id + nzones_batch > NE) { - nzones_batch = nzones - z_id; + nzones_batch = NE - z_id; nqp_batch = nqp * nzones_batch; } - - double min_detJ = numeric_limits::infinity(); + double min_detJ = std::numeric_limits::infinity(); for (int z = 0; z < nzones_batch; z++) { - ElementTransformation *T = H1FESpace.GetElementTransformation(z_id); + ElementTransformation *T = H1.GetElementTransformation(z_id); Jpr_b[z].SetSize(dim, dim, nqp); - - if (p_assembly) - { - // Energy values at quadrature point. - L2FESpace.GetElementDofs(z_id, L2dofs); - e.GetSubVector(L2dofs, e_loc); - evaluator.GetL2Values(e_loc, e_vals); - - // All reference->physical Jacobians at the quadrature points. - H1FESpace.GetElementVDofs(z_id, H1dofs); - x.GetSubVector(H1dofs, vector_vals); - evaluator.GetVectorGrad(vecvalMat, Jpr_b[z]); - } - else { e.GetValues(z_id, integ_rule, e_vals); } + e.GetValues(z_id, ir, e_vals); for (int q = 0; q < nqp; q++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); + const IntegrationPoint &ip = ir.IntPoint(q); T->SetIntPoint(&ip); - if (!p_assembly) { Jpr_b[z](q) = T->Jacobian(); } + Jpr_b[z](q) = T->Jacobian(); const double detJ = Jpr_b[z](q).Det(); - min_detJ = min(min_detJ, detJ); - + min_detJ = fmin(min_detJ, detJ); const int idx = z * nqp + q; - if (material_pcf == NULL) { gamma_b[idx] = 5./3.; } // Ideal gas. - else { gamma_b[idx] = material_pcf->Eval(*T, ip); } - rho_b[idx] = quad_data.rho0DetJ0w(z_id*nqp + q) / detJ / ip.weight; - e_b[idx] = max(0.0, e_vals(q)); + gamma_b[idx] = gamma_coeff.Eval(*T, ip); + rho_b[idx] = qdata.rho0DetJ0w(z_id*nqp + q) / detJ / ip.weight; + e_b[idx] = fmax(0.0, e_vals(q)); } ++z_id; } - // Batched computation of material properties. 
ComputeMaterialProperties(nqp_batch, gamma_b, rho_b, e_b, p_b, cs_b); - z_id -= nzones_batch; for (int z = 0; z < nzones_batch; z++) { - ElementTransformation *T = H1FESpace.GetElementTransformation(z_id); - if (p_assembly) - { - // All reference->physical Jacobians at the quadrature points. - H1FESpace.GetElementVDofs(z_id, H1dofs); - v.GetSubVector(H1dofs, vector_vals); - evaluator.GetVectorGrad(vecvalMat, grad_v_ref); - } + ElementTransformation *T = H1.GetElementTransformation(z_id); for (int q = 0; q < nqp; q++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); + const IntegrationPoint &ip = ir.IntPoint(q); T->SetIntPoint(&ip); // Note that the Jacobian was already computed above. We've chosen // not to store the Jacobians for all batched quadrature points. @@ -604,10 +772,8 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const CalcInverse(Jpr, Jinv); const double detJ = Jpr.Det(), rho = rho_b[z*nqp + q], p = p_b[z*nqp + q], sound_speed = cs_b[z*nqp + q]; - stress = 0.0; for (int d = 0; d < dim; d++) { stress(d, d) = -p; } - double visc_coeff = 0.0; if (use_viscosity) { @@ -615,30 +781,18 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const // eigenvector of the symmetric velocity gradient gives the // direction of maximal compression. This is used to define the // relative change of the initial length scale. - if (p_assembly) - { - mfem::Mult(grad_v_ref(q), Jinv, sgrad_v); - } - else - { - v.GetVectorGradient(*T, sgrad_v); - } + v.GetVectorGradient(*T, sgrad_v); sgrad_v.Symmetrize(); double eig_val_data[3], eig_vec_data[9]; - if (dim==1) - { - eig_val_data[0] = sgrad_v(0, 0); - eig_vec_data[0] = 1.; - } - else { sgrad_v.CalcEigenvalues(eig_val_data, eig_vec_data); } + eig_val_data[0] = sgrad_v(0, 0); + eig_vec_data[0] = 1.; Vector compr_dir(eig_vec_data, dim); // Computes the initial->physical transformation Jacobian. 
- mfem::Mult(Jpr, quad_data.Jac0inv(z_id*nqp + q), Jpi); + mfem::Mult(Jpr, qdata.Jac0inv(z_id*nqp + q), Jpi); Vector ph_dir(dim); Jpi.Mult(compr_dir, ph_dir); // Change of the initial mesh size in the compression direction. - const double h = quad_data.h0 * ph_dir.Norml2() / + const double h = qdata.h0 * ph_dir.Norml2() / compr_dir.Norml2(); - // Measure of maximal compression. const double mu = eig_val_data[0]; visc_coeff = 2.0 * rho * h * h * fabs(mu); @@ -649,36 +803,36 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const const double eps = 1e-12; visc_coeff += 0.5 * rho * h * sound_speed * (1.0 - smooth_step_01(mu - 2.0 * eps, eps)); - stress.Add(visc_coeff, sgrad_v); } - // Time step estimate at the point. Here the more relevant length // scale is related to the actual mesh deformation; we use the min // singular value of the ref->physical Jacobian. In addition, the // time step estimate should be aware of the presence of shocks. const double h_min = - Jpr.CalcSingularvalue(dim-1) / (double) H1FESpace.GetOrder(0); + Jpr.CalcSingularvalue(dim-1) / (double) H1.GetOrder(0); const double inv_dt = sound_speed / h_min + 2.5 * visc_coeff / rho / h_min / h_min; if (min_detJ < 0.0) { // This will force repetition of the step with smaller dt. - quad_data.dt_est = 0.0; + qdata.dt_est = 0.0; } else { - quad_data.dt_est = min(quad_data.dt_est, cfl * (1.0 / inv_dt) ); + if (inv_dt>0.0) + { + qdata.dt_est = fmin(qdata.dt_est, cfl*(1.0/inv_dt)); + } } - // Quadrature data for partial assembly of the force operator. 
MultABt(stress, Jinv, stressJiT); - stressJiT *= integ_rule.IntPoint(q).weight * detJ; + stressJiT *= ir.IntPoint(q).weight * detJ; for (int vd = 0 ; vd < dim; vd++) { for (int gd = 0; gd < dim; gd++) { - quad_data.stressJinvT(vd)(z_id*nqp + q, gd) = + qdata.stressJinvT(vd)(z_id*nqp + q, gd) = stressJiT(vd, gd); } } @@ -686,34 +840,344 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const ++z_id; } } - delete [] gamma_b; delete [] rho_b; delete [] e_b; delete [] p_b; delete [] cs_b; delete [] Jpr_b; - quad_data_is_current = true; - forcemat_is_assembled = false; - timer.sw_qdata.Stop(); - timer.quad_tstep += nzones; + timer.quad_tstep += NE; +} + +template MFEM_HOST_DEVICE static inline +void QUpdateBody(const int NE, const int e, + const int NQ, const int q, + const bool use_viscosity, + const double h0, + const double h1order, + const double cfl, + const double infinity, + double* __restrict__ Jinv, + double* __restrict__ stress, + double* __restrict__ sgrad_v, + double* __restrict__ eig_val_data, + double* __restrict__ eig_vec_data, + double* __restrict__ compr_dir, + double* __restrict__ Jpi, + double* __restrict__ ph_dir, + double* __restrict__ stressJiT, + const double* __restrict__ d_gamma, + const double* __restrict__ d_weights, + const double* __restrict__ d_Jacobians, + const double* __restrict__ d_rho0DetJ0w, + const double* __restrict__ d_e_quads, + const double* __restrict__ d_grad_v_ext, + const double* __restrict__ d_Jac0inv, + double *d_dt_est, + double *d_stressJinvT) +{ + constexpr int DIM2 = DIM*DIM; + double min_detJ = infinity; + + const int eq = e * NQ + q; + const double gamma = d_gamma[e]; + const double weight = d_weights[q]; + const double inv_weight = 1. 
/ weight; + const double *J = d_Jacobians + DIM2*(NQ*e + q); + const double detJ = kernels::Det(J); + min_detJ = fmin(min_detJ, detJ); + kernels::CalcInverse(J, Jinv); + const double R = inv_weight * d_rho0DetJ0w[eq] / detJ; + const double E = fmax(0.0, d_e_quads[eq]); + const double P = (gamma - 1.0) * R * E; + const double S = sqrt(gamma * (gamma - 1.0) * E); + for (int k = 0; k < DIM2; k++) { stress[k] = 0.0; } + for (int d = 0; d < DIM; d++) { stress[d*DIM+d] = -P; } + double visc_coeff = 0.0; + if (use_viscosity) + { + // Compression-based length scale at the point. The first + // eigenvector of the symmetric velocity gradient gives the + // direction of maximal compression. This is used to define the + // relative change of the initial length scale. + const double *dV = d_grad_v_ext + DIM2*(NQ*e + q); + kernels::Mult(DIM, DIM, DIM, dV, Jinv, sgrad_v); + kernels::Symmetrize(DIM, sgrad_v); + if (DIM == 1) + { + eig_val_data[0] = sgrad_v[0]; + eig_vec_data[0] = 1.; + } + else + { + kernels::CalcEigenvalues(sgrad_v, eig_val_data, eig_vec_data); + } + for (int k=0; kphysical transformation Jacobian. + kernels::Mult(DIM, DIM, DIM, J, d_Jac0inv + eq*DIM*DIM, Jpi); + kernels::Mult(DIM, DIM, Jpi, compr_dir, ph_dir); + // Change of the initial mesh size in the compression direction. + const double ph_dir_nl2 = kernels::Norml2(DIM, ph_dir); + const double compr_dir_nl2 = kernels::Norml2(DIM, compr_dir); + const double H = h0 * ph_dir_nl2 / compr_dir_nl2; + // Measure of maximal compression. + const double mu = eig_val_data[0]; + visc_coeff = 2.0 * R * H * H * fabs(mu); + // The following represents a "smooth" version of the statement + // "if (mu < 0) visc_coeff += 0.5 rho h sound_speed". Note that + // eps must be scaled appropriately if a different unit system is + // being used. 
+ const double eps = 1e-12; + visc_coeff += 0.5 * R * H * S * (1.0 - smooth_step_01(mu-2.0*eps, eps)); + kernels::Add(DIM, DIM, visc_coeff, stress, sgrad_v, stress); + } + // Time step estimate at the point. Here the more relevant length + // scale is related to the actual mesh deformation; we use the min + // singular value of the ref->physical Jacobian. In addition, the + // time step estimate should be aware of the presence of shocks. + const double sv = kernels::CalcSingularvalue(J, DIM - 1); + const double h_min = sv / h1order; + const double ih_min = 1. / h_min; + const double irho_ih_min_sq = ih_min * ih_min / R ; + const double idt = S * ih_min + 2.5 * visc_coeff * irho_ih_min_sq; + if (min_detJ < 0.0) + { + // This will force repetition of the step with smaller dt. + d_dt_est[eq] = 0.0; + } + else + { + if (idt > 0.0) + { + const double cfl_inv_dt = cfl / idt; + d_dt_est[eq] = fmin(d_dt_est[eq], cfl_inv_dt); + } + } + // Quadrature data for partial assembly of the force operator. 
+ kernels::MultABt(DIM, DIM, DIM, stress, Jinv, stressJiT); + for (int k = 0; k < DIM2; k++) { stressJiT[k] *= weight * detJ; } + for (int vd = 0 ; vd < DIM; vd++) + { + for (int gd = 0; gd < DIM; gd++) + { + const int offset = eq + NQ*NE*(gd + vd*DIM); + d_stressJinvT[offset] = stressJiT[vd + gd*DIM]; + } + } +} + +template static inline +void QKernel(const int NE, const int NQ, + const bool use_viscosity, + const double h0, + const double h1order, + const double cfl, + const double infinity, + const ParGridFunction &gamma_gf, + const Array &weights, + const Vector &Jacobians, + const Vector &rho0DetJ0w, + const Vector &e_quads, + const Vector &grad_v_ext, + const DenseTensor &Jac0inv, + Vector &dt_est, + DenseTensor &stressJinvT) +{ + constexpr int DIM2 = DIM*DIM; + auto d_gamma = gamma_gf.Read(); + auto d_weights = weights.Read(); + auto d_Jacobians = Jacobians.Read(); + auto d_rho0DetJ0w = rho0DetJ0w.Read(); + auto d_e_quads = e_quads.Read(); + auto d_grad_v_ext = grad_v_ext.Read(); + auto d_Jac0inv = Read(Jac0inv.GetMemory(), Jac0inv.TotalSize()); + auto d_dt_est = dt_est.ReadWrite(); + auto d_stressJinvT = Write(stressJinvT.GetMemory(), stressJinvT.TotalSize()); + if (DIM == 2) + { + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + double Jinv[DIM2]; + double stress[DIM2]; + double sgrad_v[DIM2]; + double eig_val_data[3]; + double eig_vec_data[9]; + double compr_dir[DIM]; + double Jpi[DIM2]; + double ph_dir[DIM]; + double stressJiT[DIM2]; + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + QUpdateBody(NE, e, NQ, qx + qy * Q1D, + use_viscosity, h0, h1order, cfl, infinity, + Jinv, stress, sgrad_v, eig_val_data, eig_vec_data, + compr_dir, Jpi, ph_dir, stressJiT, + d_gamma, d_weights, d_Jacobians, d_rho0DetJ0w, + d_e_quads, d_grad_v_ext, d_Jac0inv, + d_dt_est, d_stressJinvT); + } + } + MFEM_SYNC_THREAD; + }); + } + if (DIM == 3) + { + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + double Jinv[DIM2]; + double stress[DIM2]; + double sgrad_v[DIM2]; + 
double eig_val_data[3]; + double eig_vec_data[9]; + double compr_dir[DIM]; + double Jpi[DIM2]; + double ph_dir[DIM]; + double stressJiT[DIM2]; + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + QUpdateBody(NE, e, NQ, qx + Q1D * (qy + qz * Q1D), + use_viscosity, h0, h1order, cfl, infinity, + Jinv, stress, sgrad_v, eig_val_data, eig_vec_data, + compr_dir, Jpi, ph_dir, stressJiT, + d_gamma, d_weights, d_Jacobians, d_rho0DetJ0w, + d_e_quads, d_grad_v_ext, d_Jac0inv, + d_dt_est, d_stressJinvT); + } + } + } + MFEM_SYNC_THREAD; + }); + } +} + +void QUpdate::UpdateQuadratureData(const Vector &S, QuadratureData &qdata) +{ + timer->sw_qdata.Start(); + Vector* S_p = const_cast(&S); + const int H1_size = H1.GetVSize(); + const double h1order = (double) H1.GetOrder(0); + const double infinity = std::numeric_limits::infinity(); + ParGridFunction x, v, e; + x.MakeRef(&H1,*S_p, 0); + H1R->Mult(x, e_vec); + q1->SetOutputLayout(QVectorLayout::byVDIM); + q1->Derivatives(e_vec, q_dx); + v.MakeRef(&H1,*S_p, H1_size); + H1R->Mult(v, e_vec); + q1->Derivatives(e_vec, q_dv); + e.MakeRef(&L2, *S_p, 2*H1_size); + q2->SetOutputLayout(QVectorLayout::byVDIM); + q2->Values(e, q_e); + q_dt_est = qdata.dt_est; + const int id = (dim << 4) | Q1D; + typedef void (*fQKernel)(const int NE, const int NQ, + const bool use_viscosity, + const double h0, const double h1order, + const double cfl, const double infinity, + const ParGridFunction &gamma_gf, + const Array &weights, + const Vector &Jacobians, const Vector &rho0DetJ0w, + const Vector &e_quads, const Vector &grad_v_ext, + const DenseTensor &Jac0inv, + Vector &dt_est, DenseTensor &stressJinvT); + static std::unordered_map qupdate = + { + {0x24,&QKernel<2,4>}, {0x26,&QKernel<2,6>}, {0x28,&QKernel<2,8>}, + {0x34,&QKernel<3,4>}, {0x36,&QKernel<3,6>}, {0x38,&QKernel<3,8>} + }; + if (!qupdate[id]) + { + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown 
kernel"); + } + qupdate[id](NE, NQ, use_viscosity, qdata.h0, h1order, cfl, infinity, + gamma_gf, ir.GetWeights(), q_dx, + qdata.rho0DetJ0w, q_e, q_dv, + qdata.Jac0inv, q_dt_est, qdata.stressJinvT); + qdata.dt_est = q_dt_est.Min(); + timer->sw_qdata.Stop(); + timer->quad_tstep += NE; } void LagrangianHydroOperator::AssembleForceMatrix() const { if (forcemat_is_assembled || p_assembly) { return; } - Force = 0.0; timer.sw_force.Start(); Force.Assemble(); timer.sw_force.Stop(); - forcemat_is_assembled = true; } } // namespace hydrodynamics +void HydroODESolver::Init(TimeDependentOperator &tdop) +{ + ODESolver::Init(tdop); + hydro_oper = dynamic_cast(f); + MFEM_VERIFY(hydro_oper, "HydroSolvers expect LagrangianHydroOperator."); +} + +void RK2AvgSolver::Init(TimeDependentOperator &tdop) +{ + HydroODESolver::Init(tdop); + const Array &block_offsets = hydro_oper->GetBlockOffsets(); + V.SetSize(block_offsets[1], mem_type); + V.UseDevice(true); + dS_dt.Update(block_offsets, mem_type); + dS_dt = 0.0; + S0.Update(block_offsets, mem_type); +} + +void RK2AvgSolver::Step(Vector &S, double &t, double &dt) +{ + // The monolithic BlockVector stores the unknown fields as follows: + // (Position, Velocity, Specific Internal Energy). + S0.Vector::operator=(S); + Vector &v0 = S0.GetBlock(1); + Vector &dx_dt = dS_dt.GetBlock(0); + Vector &dv_dt = dS_dt.GetBlock(1); + + // In each sub-step: + // - Update the global state Vector S. + // - Compute dv_dt using S. + // - Update V using dv_dt. + // - Compute de_dt and dx_dt using S and V. + + // -- 1. + // S is S0. + hydro_oper->UpdateMesh(S); + hydro_oper->SolveVelocity(S, dS_dt); + // V = v0 + 0.5 * dt * dv_dt; + add(v0, 0.5 * dt, dv_dt, V); + hydro_oper->SolveEnergy(S, V, dS_dt); + dx_dt = V; + + // -- 2. 
+ // S = S0 + 0.5 * dt * dS_dt; + add(S0, 0.5 * dt, dS_dt, S); + hydro_oper->ResetQuadratureData(); + hydro_oper->UpdateMesh(S); + hydro_oper->SolveVelocity(S, dS_dt); + // V = v0 + 0.5 * dt * dv_dt; + add(v0, 0.5 * dt, dv_dt, V); + hydro_oper->SolveEnergy(S, V, dS_dt); + dx_dt = V; + + // -- 3. + // S = S0 + dt * dS_dt. + add(S0, dt, dS_dt, S); + hydro_oper->ResetQuadratureData(); + t += dt; +} + } // namespace mfem #endif // MFEM_USE_MPI diff --git a/laghos_solver.hpp b/laghos_solver.hpp index 8db34cb5..6886a0d9 100644 --- a/laghos_solver.hpp +++ b/laghos_solver.hpp @@ -42,13 +42,53 @@ struct TimingData // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; + // Store the number of dofs of the corresponding local CG + const HYPRE_Int L2dof; + // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solve. - // #dofs * #(CG iterations) for the L2 CG solve. + // #(CG iterations) for the L2 CG solve. // #quads * #(RK sub steps) for the quadrature data computations. 
- int H1cg_iter, L2dof_iter, quad_tstep; + HYPRE_Int H1iter, L2iter; + HYPRE_Int quad_tstep; + + TimingData(const HYPRE_Int l2d) : + L2dof(l2d), H1iter(0), L2iter(0), quad_tstep(0) { } +}; - TimingData() : H1cg_iter(0), L2dof_iter(0), quad_tstep(0) { } +class QUpdate +{ +private: + const int dim, vdim, NQ, NE, Q1D; + const bool use_viscosity; + const double cfl; + TimingData *timer; + const IntegrationRule &ir; + ParFiniteElementSpace &H1, &L2; + const Operator *H1R; + Vector q_dt_est, q_e, e_vec, q_dx, q_dv; + const QuadratureInterpolator *q1,*q2; + const ParGridFunction &gamma_gf; +public: + QUpdate(const int d, const int ne, const int q1d, const bool visc, + const double cfl, TimingData *t, + const ParGridFunction &gamma_gf, + const IntegrationRule &ir, + ParFiniteElementSpace &h1, ParFiniteElementSpace &l2): + dim(d), vdim(h1.GetVDim()), + NQ(ir.GetNPoints()), NE(ne), Q1D(q1d), + use_viscosity(visc), cfl(cfl), + timer(t), ir(ir), H1(h1), L2(l2), + H1R(H1.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + q_dt_est(NE*NQ), + q_e(NE*NQ), + e_vec(NQ*NE*vdim), + q_dx(NQ*NE*vdim*vdim), + q_dv(NQ*NE*vdim*vdim), + q1(H1.GetQuadratureInterpolator(ir)), + q2(L2.GetQuadratureInterpolator(ir)), + gamma_gf(gamma_gf) { } + + void UpdateQuadratureData(const Vector &S, QuadratureData &qdata); }; // Given a solutions state (x, v, e), this class performs all necessary @@ -56,57 +96,57 @@ struct TimingData class LagrangianHydroOperator : public TimeDependentOperator { protected: - ParFiniteElementSpace &H1FESpace, &L2FESpace; - + ParFiniteElementSpace &H1, &L2; + mutable ParFiniteElementSpace H1c; + ParMesh *pmesh; + // FE spaces local and global sizes + const int H1Vsize; + const int H1TVSize; + const HYPRE_Int H1GTVSize; + const int L2Vsize; + const int L2TVSize; + const HYPRE_Int L2GTVSize; + Array block_offsets; // Reference to the current mesh configuration. 
mutable ParGridFunction x_gf; - - Array &ess_tdofs; - - const int dim, nzones, l2dofs_cnt, h1dofs_cnt, source_type; + const Array &ess_tdofs; + const int dim, NE, l2dofs_cnt, h1dofs_cnt, source_type; const double cfl; const bool use_viscosity, p_assembly; const double cg_rel_tol; const int cg_max_iter; const double ftz_tol; - Coefficient *material_pcf; - + Coefficient &gamma_coeff; + const ParGridFunction &gamma_gf; // Velocity mass matrix and local inverses of the energy mass matrices. These // are constant in time, due to the pointwise mass conservation property. mutable ParBilinearForm Mv; SparseMatrix Mv_spmat_copy; DenseTensor Me, Me_inv; - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current, forcemat_is_assembled; - - // Structures used to perform partial assembly. - Tensors1D tensors1D; - FastEvaluator evaluator; - + const IntegrationRule &ir; + // Data associated with each quadrature point in the mesh. + // These values are recomputed at each time step. + const int Q1D; + mutable QuadratureData qdata; + mutable bool qdata_is_current, forcemat_is_assembled; // Force matrix that combines the kinematic and thermodynamic spaces. It is // assembled in each time step and then it is used to compute the final // right-hand sides for momentum and specific internal energy. mutable MixedBilinearForm Force; - // Same as above, but done through partial assembly. - ForcePAOperator ForcePA; - + ForcePAOperator *ForcePA; // Mass matrices done through partial assembly: // velocity (coupled H1 assembly) and energy (local L2 assemblies). - mutable MassPAOperator VMassPA; - mutable DiagonalSolver VMassPA_prec; - mutable LocalMassPAOperator locEMassPA; - + MassPAOperator *VMassPA, *EMassPA; + OperatorJacobiSmoother *VMassPA_Jprec; // Linear solver for energy. 
- CGSolver locCG; - + CGSolver CG_VMass, CG_EMass; mutable TimingData timer; + mutable QUpdate qupdate; + mutable Vector X, B, one, rhs, e_rhs; + mutable ParGridFunction rhs_c_gf, dvc_gf; + mutable Array c_tdofs[3]; virtual void ComputeMaterialProperties(int nvalues, const double gamma[], const double rho[], const double e[], @@ -123,38 +163,49 @@ class LagrangianHydroOperator : public TimeDependentOperator void AssembleForceMatrix() const; public: - LagrangianHydroOperator(int size, ParFiniteElementSpace &h1_fes, + LagrangianHydroOperator(const int size, + ParFiniteElementSpace &h1_fes, ParFiniteElementSpace &l2_fes, - Array &essential_tdofs, ParGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, bool visc, bool pa, - double cgt, int cgiter, double ftz_tol, - int h1_basis_type); + const Array &ess_tdofs, + Coefficient &rho0_coeff, + ParGridFunction &rho0_gf, + Coefficient &mat_gf_coeff, + ParGridFunction &gamma_gf, + const int source, + const double cfl, + const bool visc, const bool pa, + const double cgt, const int cgiter, double ftz_tol, + const int order_q); + ~LagrangianHydroOperator(); // Solve for dx_dt, dv_dt and de_dt. virtual void Mult(const Vector &S, Vector &dS_dt) const; + virtual MemoryClass GetMemoryClass() const + { return Device::GetMemoryClass(); } + void SolveVelocity(const Vector &S, Vector &dS_dt) const; void SolveEnergy(const Vector &S, const Vector &v, Vector &dS_dt) const; void UpdateMesh(const Vector &S) const; - // Calls UpdateQuadratureData to compute the new quad_data.dt_estimate. + // Calls UpdateQuadratureData to compute the new qdata.dt_estimate. double GetTimeStepEstimate(const Vector &S) const; void ResetTimeStepEstimate() const; - void ResetQuadratureData() const { quad_data_is_current = false; } + void ResetQuadratureData() const { qdata_is_current = false; } - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. 
+ // The density values, which are stored only at some quadrature points, + // are projected as a ParGridFunction. void ComputeDensity(ParGridFunction &rho) const; - double InternalEnergy(const ParGridFunction &e) const; double KineticEnergy(const ParGridFunction &v) const; - void PrintTimingData(bool IamRoot, int steps) const; + int GetH1VSize() const { return H1.GetVSize(); } + const Array &GetBlockOffsets() const { return block_offsets; } - int GetH1VSize() const { return H1FESpace.GetVSize(); } + void PrintTimingData(bool IamRoot, int steps, const bool fom) const; }; +// TaylorCoefficient used in the 2D Taylor-Green problem. class TaylorCoefficient : public Coefficient { virtual double Eval(ElementTransformation &T, @@ -169,6 +220,28 @@ class TaylorCoefficient : public Coefficient } // namespace hydrodynamics +class HydroODESolver : public ODESolver +{ +protected: + hydrodynamics::LagrangianHydroOperator *hydro_oper; +public: + HydroODESolver() : hydro_oper(NULL) { } + virtual void Init(TimeDependentOperator&); + virtual void Step(Vector&, double&, double&) + { MFEM_ABORT("Time stepping is undefined."); } +}; + +class RK2AvgSolver : public HydroODESolver +{ +protected: + Vector V; + BlockVector dS_dt, S0; +public: + RK2AvgSolver() { } + virtual void Init(TimeDependentOperator &_f); + virtual void Step(Vector &S, double &t, double &dt); +}; + } // namespace mfem #endif // MFEM_USE_MPI diff --git a/laghos_timeinteg.cpp b/laghos_timeinteg.cpp deleted file mode 100644 index 04706e29..00000000 --- a/laghos_timeinteg.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_timeinteg.hpp" -#include "laghos_solver.hpp" - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -void HydroODESolver::Init(TimeDependentOperator &_f) -{ - ODESolver::Init(_f); - - hydro_oper = dynamic_cast(f); - MFEM_VERIFY(hydro_oper, "HydroSolvers expect LagrangianHydroOperator."); -} - -void RK2AvgSolver::Step(Vector &S, double &t, double &dt) -{ - const int Vsize = hydro_oper->GetH1VSize(); - Vector V(Vsize), dS_dt(S.Size()), S0(S); - - // The monolithic BlockVector stores the unknown fields as follows: - // (Position, Velocity, Specific Internal Energy). - Vector dv_dt, v0, dx_dt; - v0.SetDataAndSize(S0.GetData() + Vsize, Vsize); - dv_dt.SetDataAndSize(dS_dt.GetData() + Vsize, Vsize); - dx_dt.SetDataAndSize(dS_dt.GetData(), Vsize); - - // In each sub-step: - // - Update the global state Vector S. - // - Compute dv_dt using S. - // - Update V using dv_dt. - // - Compute de_dt and dx_dt using S and V. - - // -- 1. - // S is S0. - hydro_oper->UpdateMesh(S); - hydro_oper->SolveVelocity(S, dS_dt); - // V = v0 + 0.5 * dt * dv_dt; - add(v0, 0.5 * dt, dv_dt, V); - hydro_oper->SolveEnergy(S, V, dS_dt); - dx_dt = V; - - // -- 2. 
- // S = S0 + 0.5 * dt * dS_dt; - add(S0, 0.5 * dt, dS_dt, S); - hydro_oper->ResetQuadratureData(); - hydro_oper->UpdateMesh(S); - hydro_oper->SolveVelocity(S, dS_dt); - // V = v0 + 0.5 * dt * dv_dt; - add(v0, 0.5 * dt, dv_dt, V); - hydro_oper->SolveEnergy(S, V, dS_dt); - dx_dt = V; - - // -- 3. - // S = S0 + dt * dS_dt. - add(S0, dt, dS_dt, S); - hydro_oper->ResetQuadratureData(); - - t += dt; -} - -} // namespace hydrodynamics - -} // namespace mfem diff --git a/laghos_timeinteg.hpp b/laghos_timeinteg.hpp deleted file mode 100644 index bb9cbc7e..00000000 --- a/laghos_timeinteg.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#ifndef MFEM_LAGHOS_TIMEINTEG -#define MFEM_LAGHOS_TIMEINTEG - -#include "mfem.hpp" - -namespace mfem -{ - -namespace hydrodynamics -{ - -class LagrangianHydroOperator; - -class HydroODESolver : public ODESolver -{ -protected: - LagrangianHydroOperator *hydro_oper; - -public: - HydroODESolver() : hydro_oper(NULL) { } - - virtual void Init(TimeDependentOperator &_f); - - virtual void Step(Vector &S, double &t, double &dt) - { MFEM_ABORT("Time stepping is undefined."); } -}; - -class RK2AvgSolver : public HydroODESolver -{ -public: - RK2AvgSolver() { } - - virtual void Step(Vector &S, double &t, double &dt); -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_LAGHOS_TIMEINTEG diff --git a/makefile b/makefile index 33366036..41cf296f 100644 --- a/makefile +++ b/makefile @@ -19,7 +19,12 @@ define LAGHOS_HELP_MSG Laghos makefile targets: make + make setup + make setup MFEM_BUILD=pcuda make status/info + make test + make tests + make checks make install make clean make distclean @@ -27,6 +32,10 @@ Laghos makefile targets: Examples: +make setup + Build Laghos third party libraries: HYPRE, METIS and MFEM + (By default MFEM will be compiled in parallel mode, but MFEM_BUILD=pcuda + will allow a parallel CUDA build.) make -j 4 Build Laghos using the current configuration options from MFEM. 
(Laghos requires the MFEM finite element library, and uses its compiler and @@ -46,116 +55,90 @@ make style endef +NPROC = $(shell getconf _NPROCESSORS_ONLN) +GOALS = help clean distclean style setup mfem metis hypre + # Default installation location PREFIX ?= ./bin INSTALL = /usr/bin/install -# Use the MFEM build directory +# Use the MFEM source, build, or install directory MFEM_DIR ?= ../mfem CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) +ifeq ($(wildcard $(CONFIG_MK)),) + CONFIG_MK = $(MFEM_DIR)/share/mfem/config.mk +endif +TEST_MK = $(MFEM_TEST_MK) # Use the compiler used by MFEM. Get the compiler and the options for compiling # and linking from MFEM's config.mk. (Skip this if the target does not require # building.) 
MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) +ifeq (,$(filter $(GOALS),$(MAKECMDGOALS))) -include $(CONFIG_MK) + ifneq ($(realpath $(MFEM_DIR)),$(MFEM_SOURCE_DIR)) + ifneq ($(realpath $(MFEM_DIR)),$(MFEM_INSTALL_DIR)) + MFEM_BUILD_DIR := $(MFEM_DIR) + override MFEM_DIR := $(MFEM_SOURCE_DIR) + endif + endif endif CXX = $(MFEM_CXX) CPPFLAGS = $(MFEM_CPPFLAGS) CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif +# Extra include dir, needed for now to include headers like "general/forall.hpp" +EXTRA_INC_DIR = $(or $(wildcard $(MFEM_DIR)/include/mfem),$(MFEM_DIR)) +CCC = $(strip $(CXX) $(LAGHOS_FLAGS) $(if $(EXTRA_INC_DIR),-I$(EXTRA_INC_DIR))) +LAGHOS_LIBS = $(MFEM_LIBS) $(MFEM_EXT_LIBS) LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) -SOURCE_FILES = laghos.cpp laghos_solver.cpp laghos_assembly.cpp laghos_timeinteg.cpp -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = laghos_solver.hpp laghos_assembly.hpp laghos_timeinteg.hpp +SOURCE_FILES = $(sort $(wildcard *.cpp)) +HEADER_FILES = $(sort $(wildcard *.hpp)) +OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) # Targets -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec +.PHONY: all clean distclean install status info opt debug test tests style \ + clean-build clean-exec clean-tests setup mfem hypre metis -.SUFFIXES: .c .cpp .o +.SUFFIXES: .cpp .o .cpp.o: cd $( /dev/null 2>&1 
&& \ + $(call COLOR_PRINT,'\033[0;32m',OK,': $(name)\n') || $(call COLOR_PRINT,'\033[1;31m',KO,': $(command)\n'); +endef +# Generate all Laghos checks template targets +$(foreach p, $(problems), $(foreach d, $(dims), $(foreach o, $(optioni), $(foreach r, $(ranks),\ + $(eval $(call laghos_checks_template,$(p),$(d),$(o),$(r))))))) +# Output info on all Laghos checks template targets +#$(foreach p, $(problems), $(foreach d, $(dims), $(foreach o, $(optioni), $(foreach r, $(ranks),\ +# $(info $(call laghos_checks_template,$(p),$(d),$(o),$(r))))))) +checks: laghos +checks: |$(foreach p,$(problems), $(foreach d,$(dims), $(foreach o,$(optioni), $(foreach r,$(ranks), laghos_$(p)_$(d)_$(o)_$(r))))) + +1:;@$(MAKE) -j $(NPROC) checks ranks=1 +2:;@$(MAKE) -j 8 checks ranks=2 +3:;@$(MAKE) -j 4 checks ranks=3 + +# Laghos run tests +tests: + cat << EOF > RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 1 -dim 2 -rs 3 -tf 0.8 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 1 -dim 3 -rs 2 -tf 0.6 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 2 -dim 1 -rs 5 -tf 0.2 -fa -vs 100 | tee RUN.dat + cat RUN.dat | tail 
-n 18 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 3.0 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(MFEM_MPIEXEC) $(MFEM_MPIEXEC_NP) $(MFEM_MPI_NP) \ + ./laghos -p 4 -m data/square_gresho.mesh -rs 3 -ok 3 \ + -ot 2 -tf 0.62831853 -s 7 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(shell cat << EOF > BASELINE.dat) + $(shell echo 'step = 0339, dt = 0.000702, |e| = 4.9695537349e+01' >> BASELINE.dat) + $(shell echo 'step = 1041, dt = 0.000121, |e| = 3.3909635545e+03' >> BASELINE.dat) + $(shell echo 'step = 1154, dt = 0.001655, |e| = 4.6303396053e+01' >> BASELINE.dat) + $(shell echo 'step = 0560, dt = 0.002449, |e| = 1.3408616722e+02' >> BASELINE.dat) + $(shell echo 'step = 0413, dt = 0.000470, |e| = 3.2012077410e+01' >> BASELINE.dat) + $(shell echo 'step = 2872, dt = 0.000064, |e| = 5.6547039096e+01' >> BASELINE.dat) + $(shell echo 'step = 0528, dt = 0.000180, |e| = 5.6505348812e+01' >> BASELINE.dat) + $(shell echo 'step = 0776, dt = 0.000045, |e| = 4.0982431726e+02' >> BASELINE.dat) + diff --report-identical-files RESULTS.dat BASELINE.dat + +# Setup: download & install third party libraries: HYPRE, METIS & MFEM + +HYPRE_URL = https://computation.llnl.gov/projects/hypre-scalable-linear-solvers-multigrid-methods +HYPRE_VER = 2.11.2 +HYPRE_DIR = hypre +hypre: + @(if [[ ! 
-e ../$(HYPRE_DIR) ]]; then cd ..; \ + wget -nc $(HYPRE_URL)/download/hypre-$(HYPRE_VER).tar.gz &&\ + tar xzvf hypre-$(HYPRE_VER).tar.gz &&\ + ln -s hypre-$(HYPRE_VER) $(HYPRE_DIR) &&\ + cd $(HYPRE_DIR)/src &&\ + ./configure --disable-fortran --without-fei CC=mpicc CXX=mpic++ &&\ + make -j $(NPROC); else echo "Using existing ../$(HYPRE_DIR)"; fi) + +METIS_URL = http://glaros.dtc.umn.edu/gkhome/fetch/sw/metis +METIS_VER = 4.0.3 +METIS_DIR = metis-4.0 +metis: + @(if [[ ! -e ../$(METIS_DIR) ]]; then cd ..; \ + wget -nc $(METIS_URL)/OLD/metis-$(METIS_VER).tar.gz &&\ + tar zxvf metis-$(METIS_VER).tar.gz &&\ + ln -s metis-$(METIS_VER) $(METIS_DIR) &&\ + cd $(METIS_DIR) &&\ + make -j $(NPROC) OPTFLAGS="-O2";\ + else echo "Using existing ../$(METIS_DIR)"; fi) + +MFEM_GIT = https://github.com/mfem/mfem.git +MFEM_BUILD ?= parallel +#MFEM_BUILD ?= pcuda -j CUDA_ARCH=sm_70 +mfem: hypre metis + @(if [[ ! -e ../mfem ]]; then cd ..; \ + git clone --single-branch --branch master --depth 1 $(MFEM_GIT) &&\ + cd mfem &&\ + make $(MFEM_BUILD) -j $(NPROC); else echo "Using existing ../mfem"; fi) + +setup: mfem diff --git a/occa/.gitignore b/occa/.gitignore deleted file mode 100644 index ba604415..00000000 --- a/occa/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.o -laghos diff --git a/occa/README.md b/occa/README.md deleted file mode 100644 index 93040f50..00000000 --- a/occa/README.md +++ /dev/null @@ -1,150 +0,0 @@ - __ __ - / / ____ ____ / /_ ____ _____ - / / / __ `/ __ `/ __ \/ __ \/ ___/ - / /___/ /_/ / /_/ / / / / /_/ (__ ) - /_____/\__,_/\__, /_/ /_/\____/____/ - /____/ - - High-order Lagrangian Hydrodynamics Miniapp - - OCCA version - -## Overview - -This directory contains the OCCA version of the **Laghos** (LAGrangian -High-Order Solver), which is provided as a reference implementation and is NOT -the official benchmark version of the miniapp. - -For more details about Laghos see the [README file](../README.md) in the -top-level directory. 
- -The Laghos miniapp is part of the [CEED software suite](http://ceed.exascaleproject.org/software), -a collection of software benchmarks, miniapps, libraries and APIs for -efficient exascale discretizations based on high-order finite element -and spectral element methods. See http://github.com/ceed for more -information and source code availability. - -The CEED research is supported by the [Exascale Computing Project](https://exascaleproject.org/exascale-computing-project) -(17-SC-20-SC), a collaborative effort of two U.S. Department of Energy -organizations (Office of Science and the National Nuclear Security -Administration) responsible for the planning and preparation of a -[capable exascale ecosystem](https://exascaleproject.org/what-is-exascale), -including software, applications, hardware, advanced system engineering and early -testbed platforms, in support of the nation’s exascale computing imperative. - -## Differences with the official benchmark version - -The OCCA version differs from the official benchmark version of Laghos (in the -top-level directory) in the following ways: - -1. Only problems 0 and 1 are defined -2. 
Final iterations (`step`), time steps (`dt`) and energies (`|e|`) differ from the original version - -## Building - -Follow the steps below to build the OCCA version - -### Environment setup -```sh -export MPI_HOME=~/usr/local/openmpi/3.0.0 -``` - -### Hypre -- -- `tar xzvf hypre-2.11.2.tar.gz` -- ` cd hypre-2.11.2/src` -- `./configure --disable-fortran --with-MPI --with-MPI-include=$MPI_HOME/include --with-MPI-lib-dirs=$MPI_HOME/lib` -- `make -j` -- `cd ../..` - -### Metis -- -- `tar xzvf metis-5.1.0.tar.gz` -- `cd metis-5.1.0` -- ``make config prefix=`pwd` `` -- `make && make install` -- `cd ..` - -### OCCA -- `git clone git@github.com:libocca/occa.git` -- `cd occa` -- `make -j` -- `export PATH+=":${PWD}/bin"` -- `export LD_LIBRARY_PATH+=":${PWD}/lib"` -- `cd ..` - -### MFEM with OCCA -- `git clone git@github.com:mfem/mfem.git` -- `cd mfem` -- `git checkout occa-dev` -- ``make config MFEM_USE_MPI=YES HYPRE_DIR=`pwd`/../hypre-2.11.2/src/hypre MFEM_USE_METIS_5=YES METIS_DIR=`pwd`/../metis-5.1.0 MFEM_USE_OCCA=YES OCCA_DIR=`pwd`/../occa`` -- `make status` to verify that all the include paths are correct -- `make -j` -- `cd ..` - -### OCCA Laghos -- `git clone git@github.com:CEED/Laghos.git` -- `cd Laghos/occa` -- `make -j` - -## Running - -The OCCA version can run the same sample test runs as the official benchmark -version of Laghos. - -### Options -- -m : Mesh file to use -- -ok : Order (degree) of the kinematic finite element space -- -rs : Number of times to refine the mesh uniformly in serial -- -p : Problem setup to use, Sedov problem is '1' -- -cfl : CFL-condition number -- -ms : Maximum number of steps (negative means no restriction) -- -d : OCCA device string (e.g. "mode: 'CUDA', device_id: 0") - -## Verification of Results - -To make sure the results are correct, we tabulate reference final iterations -(`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: - -### Serial Mode - -1. 
`mpirun -np 4 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75` -2. `mpirun -np 4 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75` -3. `mpirun -np 4 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -cfl 0.05` -4. `mpirun -np 4 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -cfl 0.08` - -### CUDA Mode - -1. `mpirun -np 4 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -d "mode: 'CUDA', device_id: 0"` -2. `mpirun -np 4 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75 -d "mode: 'CUDA', device_id: 0"` -3. `mpirun -np 4 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -cfl 0.05 -d "mode: 'CUDA', device_id: 0"` -4. `mpirun -np 4 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -cfl 0.08 -d "mode: 'CUDA', device_id: 0"` - -### Results - -| `run` | `step` | `dt` | `e` | -| ----- | ------ | ---- | --- | -| 1. | 333 | 0.000008 | 49.6955373330 | -| 2. | 1036 | 0.000093 | 3390.9635544028 | -| 3. | 1625 | 0.000309 | 19.6812117043 | -| 4. | 558 | 0.000359 | 50.4237325177 | - -An implementation is considered valid if the final energy values are all within -round-off distance from the above reference values. - -> Sedov blast example has differences from original version - -## Contact - -You can reach the Laghos team by emailing laghos@llnl.gov or by leaving a -comment in the [issue tracker](https://github.com/CEED/Laghos/issues). - -## Copyright - -The following copyright applies to each file in the CEED software suite, -unless otherwise stated in the file: - -> Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the -> Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights reserved. - -See files LICENSE and NOTICE in the top-level directory for details. 
diff --git a/occa/config.json b/occa/config.json deleted file mode 100644 index b39f8881..00000000 --- a/occa/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "devices": [ - { "mode": "Serial" } - ] -} diff --git a/occa/kernels/cpu/force.okl b/occa/kernels/cpu/force.okl deleted file mode 100644 index 07f34d03..00000000 --- a/occa/kernels/cpu/force.okl +++ /dev/null @@ -1,414 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "occa://mfem/fem/defines.okl" - -#define ELEMENT_BATCH 10 - -#if L2_DOFS_1D > NUM_QUAD_1D -# define L2_MAX_1D L2_DOFS_1D -#else -# define L2_MAX_1D NUM_QUAD_1D -#endif - -#if H1_DOFS_1D > NUM_QUAD_1D -# define H1_MAX_1D H1_DOFS_1D -#else -# define H1_MAX_1D NUM_QUAD_1D -#endif - -#if L2_DOFS_1D > H1_DOFS_1D -# define MAX_DOFS_1D L2_DOFS_1D -#else -# define MAX_DOFS_1D H1_DOFS_1D -#endif - -#if H1_MAX_1D > L2_MAX_1D -# define INNER_SIZE H1_MAX_1D -#else -# define INNER_SIZE L2_MAX_1D -#endif - -#define INNER_SIZE_2D (INNER_SIZE * INNER_SIZE) - -typedef double* L2DofToQuad_t @dim(NUM_QUAD_1D, L2_DOFS_1D); -typedef double* H1DofToQuad_t @dim(NUM_QUAD_1D, H1_DOFS_1D); - -typedef double* L2QuadToDof_t @dim(L2_DOFS_1D , NUM_QUAD_1D); -typedef double* H1QuadToDof_t @dim(H1_DOFS_1D , NUM_QUAD_1D); - -#if VDIM_ORDERING == ORDERING_BY_VDIM -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -#else -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,0); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,4,0); -#endif - -typedef double* E2D_t @dim(L2_DOFS_1D, L2_DOFS_1D, numElements); -typedef double* E3D_t @dim(L2_DOFS_1D, L2_DOFS_1D, L2_DOFS_1D, numElements); -typedef double* Stress2D_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D, numElements); -typedef double* Stress3D_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D, NUM_QUAD_1D, numElements); - -@kernel void Mult2D(const int numElements, - @restrict const L2DofToQuad_t L2DofToQuad, - @restrict const H1QuadToDof_t H1QuadToDof, - @restrict const H1QuadToDof_t H1QuadToDofD, - @restrict const Stress2D_t stressJinvT, - @restrict const E2D_t e, - @restrict V2D_t v) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, @inner, check=false)) { - double e_xy[NUM_QUAD_2D] @dim(NUM_QUAD_1D, 
NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_2D; ++i) { - e_xy[i] = 0; - } - - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - e_x[qy] = 0; - } - - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - const double r_e = e(dx, dy, el); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - e_x[qx] += L2DofToQuad(qx, dx) * r_e; - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = L2DofToQuad(qy, dy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - e_xy(qx, qy) += wy * e_x[qx]; - } - } - } - - for (int c = 0; c < 2; ++c) { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - v(c, dx, dy, el) = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double Dxy[H1_DOFS_1D]; - double xy[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dxy[dx] = 0; - xy[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const double esx = e_xy(qx, qy) * stressJinvT(0, c, qx, qy, el); - const double esy = e_xy(qx, qy) * stressJinvT(1, c, qx, qy, el); - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dxy[dx] += esx * H1QuadToDofD(dx, qx); - xy[dx] += esy * H1QuadToDof(dx, qx); - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - const double wy = H1QuadToDof(dy, qy); - const double wDy = H1QuadToDofD(dy, qy); - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - v(c, dx, dy, el) += ((wy * Dxy[dx]) + - (wDy * xy[dx])); - } - } - } - } - } -} - -@kernel void MultTranspose2D(const int numElements, - @restrict const L2QuadToDof_t L2QuadToDof, - @restrict const H1DofToQuad_t H1DofToQuad, - @restrict const H1DofToQuad_t H1DofToQuadD, - @restrict const Stress2D_t stressJinvT, - @restrict const V2D_t v, - @restrict E2D_t e) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, @inner, check=false)) { - double vStress[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_2D; ++i) { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) { - double 
v_Dxy[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - double v_xDy[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_2D; ++i) { - v_Dxy[i] = v_xDy[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - double v_x[NUM_QUAD_1D]; - double v_Dx[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - v_x[qx] = v_Dx[qx] = 0; - } - - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - const double r_v = v(c, dx, dy, el); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - v_x[qx] += r_v * H1DofToQuad(qx, dx); - v_Dx[qx] += r_v * H1DofToQuadD(qx, dx); - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = H1DofToQuad(qy, dy); - const double wDy = H1DofToQuadD(qy, dy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - v_Dxy(qx, qy) += v_Dx[qx] * wy; - v_xDy(qx, qy) += v_x[qx] * wDy; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - vStress(qx, qy) += ((v_Dxy(qx, qy) * stressJinvT(0, c, qx, qy, el)) + - (v_xDy(qx, qy) * stressJinvT(1, c, qx, qy, el))); - } - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e(dx, dy, el) = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const double r_v = vStress(qx, qy); - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e_x[dx] += r_v * L2QuadToDof(dx, qx); - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - const double w = L2QuadToDof(dy, qy); - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e(dx, dy, el) += e_x[dx] * w; - } - } - } - } -} - -@kernel void Mult3D(const int numElements, - @restrict const L2DofToQuad_t L2DofToQuad, - @restrict const H1QuadToDof_t H1QuadToDof, - @restrict const H1QuadToDof_t H1QuadToDofD, - @restrict const Stress3D_t stressJinvT, - @restrict const E3D_t e, - @restrict V3D_t v) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, 
@inner, check=false)) { - double e_xyz[NUM_QUAD_3D] @dim(NUM_QUAD_1D, NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_3D; ++i) { - e_xyz[i] = 0; - } - - for (int dz = 0; dz < L2_DOFS_1D; ++dz) { - double e_xy[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_2D; ++i) { - e_xy[i] = 0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - e_x[qy] = 0; - } - - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - const double r_e = e(dx, dy, dz, el); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - e_x[qx] += L2DofToQuad(qx, dx) * r_e; - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = L2DofToQuad(qy, dy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - e_xy(qx, qy) += wy * e_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - const double wz = L2DofToQuad(qz, dz); - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - e_xyz(qx, qy, qz) += wz * e_xy(qx, qy); - } - } - } - } - - for (int c = 0; c < 3; ++c) { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - v(c, dx, dy, dz, el) = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - double Dxy_x[H1_DOFS_1D * H1_DOFS_1D] @dim(H1_DOFS_1D, H1_DOFS_1D); - double xDy_y[H1_DOFS_1D * H1_DOFS_1D] @dim(H1_DOFS_1D, H1_DOFS_1D); - double xy_z[H1_DOFS_1D * H1_DOFS_1D] @dim(H1_DOFS_1D, H1_DOFS_1D); - for (int d = 0; d < (H1_DOFS_1D * H1_DOFS_1D); ++d) { - Dxy_x[d] = xDy_y[d] = xy_z[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double Dx_x[H1_DOFS_1D]; - double x_y[H1_DOFS_1D]; - double x_z[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dx_x[dx] = x_y[dx] = x_z[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const double r_e = e_xyz(qx, qy, qz); - const double esx = r_e * stressJinvT(0, c, qx, qy, qz, el); - const double esy = r_e * 
stressJinvT(1, c, qx, qy, qz, el); - const double esz = r_e * stressJinvT(2, c, qx, qy, qz, el); - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dx_x[dx] += esx * H1QuadToDofD(dx, qx); - x_y[dx] += esy * H1QuadToDof(dx, qx); - x_z[dx] += esz * H1QuadToDof(dx, qx); - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - const double wy = H1QuadToDof(dy, qy); - const double wDy = H1QuadToDofD(dy, qy); - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dxy_x(dx, dy) += Dx_x[dx] * wy; - xDy_y(dx, dy) += x_y[dx] * wDy; - xy_z(dx, dy) += x_z[dx] * wy; - } - } - } - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - const double wz = H1QuadToDof(dz, qz); - const double wDz = H1QuadToDofD(dz, qz); - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - v(c, dx, dy, dz, el) += ((Dxy_x(dx, dy) * wz) + - (xDy_y(dx, dy) * wz) + - (xy_z(dx, dy) * wDz)); - } - } - } - } - } - } -} - -@kernel void MultTranspose3D(const int numElements, - @restrict const L2QuadToDof_t L2QuadToDof, - @restrict const H1DofToQuad_t H1DofToQuad, - @restrict const H1DofToQuad_t H1DofToQuadD, - @restrict const Stress3D_t stressJinvT, - @restrict const V3D_t v, - @restrict E3D_t e) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, @inner, check=false)) { - double vStress[NUM_QUAD_3D] @dim(NUM_QUAD_1D, NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_3D; ++i) { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - double Dxy_x[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - double xDy_y[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - double xy_z[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < NUM_QUAD_2D; ++i) { - Dxy_x[i] = xDy_y[i] = xy_z[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - double Dx_x[NUM_QUAD_1D]; - double x_y[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - Dx_x[qx] = x_y[qx] = 0; - } - - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - const double r_v = v(c, dx, 
dy, dz, el); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - Dx_x[qx] += r_v * H1DofToQuadD(qx, dx); - x_y[qx] += r_v * H1DofToQuad(qx, dx); - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = H1DofToQuad(qy, dy); - const double wDy = H1DofToQuadD(qy, dy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - Dxy_x(qx, qy) += Dx_x[qx] * wy; - xDy_y(qx, qy) += x_y[qx] * wDy; - xy_z(qx, qy) += x_y[qx] * wy; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - const double wz = H1DofToQuad(qz, dz); - const double wDz = H1DofToQuadD(qz, dz); - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - vStress(qx, qy, qz) += ((Dxy_x(qx, qy) * wz * stressJinvT(0, c, qx, qy, qz, el)) + - (xDy_y(qx, qy) * wz * stressJinvT(1, c, qx, qy, qz, el)) + - (xy_z(qx, qy) * wDz * stressJinvT(2, c, qx, qy, qz, el))); - } - } - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) { - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e(dx, dy, dz, el) = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - double e_xy[L2_DOFS_1D * L2_DOFS_1D] @dim(L2_DOFS_1D, L2_DOFS_1D); - for (int d = 0; d < (L2_DOFS_1D * L2_DOFS_1D); ++d) { - e_xy[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const double r_v = vStress(qx, qy, qz); - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e_x[dx] += r_v * L2QuadToDof(dx, qx); - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - const double w = L2QuadToDof(dy, qy); - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e_xy(dx, dy) += e_x[dx] * w; - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) { - const double w = L2QuadToDof(dz, qz); - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - e(dx, dy, dz, el) += w * e_xy(dx, dy); - } - } - } - } - } -} diff --git 
a/occa/kernels/cpu/quadratureData.okl b/occa/kernels/cpu/quadratureData.okl deleted file mode 100644 index e51a6bcb..00000000 --- a/occa/kernels/cpu/quadratureData.okl +++ /dev/null @@ -1,459 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "occa://mfem/fem/defines.okl" - -#ifndef M_PI -# define M_PI 3.14159265358979323846264338327950288 -#endif - -#ifndef GAMMA -# if defined(VORTEX_PROBLEM) -# define GAMMA(q, el) (5.0 / 3.0) -# elif defined(SEDOV_PROBLEM) || defined(SHOCK_TUBE_PROBLEM) -# define GAMMA(q, el) 1.4 -# else -# error "Cannot handle this problem ... yet!" 
-# endif -#endif - -#if VDIM_ORDERING == ORDERING_BY_VDIM -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -#else -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,0); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,4,0); -#endif - -typedef double* QJacobian_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD, numElements); -typedef double* Stress_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD, numElements); - -@kernel void InitQuadratureData(const int numElements, - @restrict const QLocal_t rho0, - @restrict const QLocal_t detJ, - @restrict const double * quadWeights, - @restrict QLocal_t rho0DetJ0w) { - for (int el = 0; el < numElements; ++el; @outer) { - for (int q = 0; q < NUM_QUAD; ++q; @inner) { - rho0DetJ0w(q, el) = rho0(q, el) * detJ(q, el) * quadWeights[q]; - } - } -} - -@kernel void UpdateQuadratureData2D(const int numElements, - @restrict const DofToQuad_t dofToQuad, - @restrict const DofToQuad_t dofToQuadD, - @restrict const double * quadWeights, - @restrict const V2D_t v, - @restrict const QLocal_t e, - @restrict const QLocal_t rho0DetJ0w, - @restrict const QJacobian_t invJ0, - @restrict const QJacobian_t J, - @restrict const QJacobian_t invJ, - @restrict const QLocal_t detJ, - @restrict Stress_t stressJinvT, - @restrict QLocal_t dtEst) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, @inner, check=false)) { - double s_gradv[4 * NUM_QUAD_2D] @dim(2, 2, NUM_QUAD_2D); - - for (int i = 0; i < (4 * NUM_QUAD_2D); ++i) { - s_gradv[i] = 0; - } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) { - double vDx[2 * NUM_QUAD_1D] @dim(2, NUM_QUAD_1D); - double vx[2 * NUM_QUAD_1D] @dim(2, NUM_QUAD_1D); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - for (int vi = 0; vi < 2; ++vi) { - vDx(vi, qx) = 0; - vx(vi, qx) = 0; - } - } - - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) { - 
for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - for (int vi = 0; vi < 2; ++vi) { - vDx(vi, qx) += v(vi, dx, dy, el) * dofToQuadD(qx, dx); - vx(vi, qx) += v(vi, dx, dy, el) * dofToQuad(qx, dx); - } - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = dofToQuad(qy, dy); - const double wDy = dofToQuadD(qy, dy); - - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - for (int vi = 0; vi < 2; ++vi) { - s_gradv(vi, 0, qx + qy*NUM_QUAD_1D) += wy * vDx(vi, qx); - s_gradv(vi, 1, qx + qy*NUM_QUAD_1D) += wDy * vx(vi, qx); - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) { - double q_gradv[NUM_DIM * NUM_DIM] @dim(NUM_DIM, NUM_DIM); - double q_stress[NUM_DIM * NUM_DIM] @dim(NUM_DIM, NUM_DIM); - - const double invJ_00 = invJ(0, 0, q, el), invJ_10 = invJ(1, 0, q, el); - const double invJ_01 = invJ(0, 1, q, el), invJ_11 = invJ(1, 1, q, el); - - q_gradv(0, 0) = ((s_gradv(0, 0, q) * invJ_00) + (s_gradv(1, 0, q) * invJ_01)); - q_gradv(1, 0) = ((s_gradv(0, 0, q) * invJ_10) + (s_gradv(1, 0, q) * invJ_11)); - q_gradv(0, 1) = ((s_gradv(0, 1, q) * invJ_00) + (s_gradv(1, 1, q) * invJ_01)); - q_gradv(1, 1) = ((s_gradv(0, 1, q) * invJ_10) + (s_gradv(1, 1, q) * invJ_11)); - - const double q_gamma = GAMMA(q, el); - const double q_Jw = detJ(q, el) * quadWeights[q]; - - const double q_rho = rho0DetJ0w(q, el) / q_Jw; - const double q_e = max(0.0, e(q, el)); - - // TODO: Input OccaVector eos(q,e) -> (stress, soundSpeed) - const double s = -(q_gamma - 1.0) * q_rho * q_e; - q_stress(0, 0) = s; q_stress(1, 0) = 0; - q_stress(0, 1) = 0; q_stress(1, 1) = s; - - const double gradv00 = q_gradv(0, 0); - const double gradv11 = q_gradv(1, 1); - const double gradv10 = 0.5 * (q_gradv(1, 0) + q_gradv(0, 1)); - q_gradv(1, 0) = gradv10; - q_gradv(0, 1) = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) { - minEig = (gradv00 < gradv11) ? 
gradv00 : gradv11; - } else { - const double zeta = (gradv11 - gradv00) / (2.0 * gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta + sqrt(1.0 + zeta*zeta)); - if ((t < 0) != (zeta < 0)) { - t = -t; - } - - const double c = sqrt(1.0 / (1.0 + t*t)); - const double s = c * t; - t *= gradv10; - - if ((gradv00 - t) <= (gradv11 + t)) { - minEig = gradv00 - t; - comprDirX = c; - comprDirY = -s; - } else { - minEig = gradv11 + t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J(0, 0, q, el), J_10 = J(1, 0, q, el); - const double J_01 = J(0, 1, q, el), J_11 = J(1, 1, q, el); - - const double invJ0_00 = invJ0(0, 0, q, el), invJ0_10 = invJ0(1, 0, q, el); - const double invJ0_01 = invJ0(0, 1, q, el), invJ0_11 = invJ0(1, 1, q, el); - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11)); - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11)); - - const double physDirX = (Jpi_00 * comprDirX) + (Jpi_10 * comprDirY); - const double physDirY = (Jpi_01 * comprDirX) + (Jpi_11 * comprDirY); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY)); - - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(q_gamma * (q_gamma - 1.0) * q_e); - dtEst(q, el) = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) { - for (int x = 0; x < NUM_DIM; ++x) { - q_stress(x, y) += coeff * q_gradv(x, y); - } - } - } - const double S00 = q_stress(0, 0), S10 = q_stress(1, 0); - const double S01 = q_stress(0, 1), S11 = q_stress(1, 1); - - stressJinvT(0, 0, 
q, el) = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01)); - stressJinvT(1, 0, q, el) = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11)); - - stressJinvT(0, 1, q, el) = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01)); - stressJinvT(1, 1, q, el) = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11)); - } - } -} - -@kernel void UpdateQuadratureData3D(const int numElements, - @restrict const DofToQuad_t dofToQuad, - @restrict const DofToQuad_t dofToQuadD, - @restrict const double * quadWeights, - @restrict const V3D_t v, - @restrict const QLocal_t e, - @restrict const QLocal_t rho0DetJ0w, - @restrict const QJacobian_t invJ0, - @restrict const QJacobian_t J, - @restrict const QJacobian_t invJ, - @restrict const QLocal_t detJ, - @restrict Stress_t stressJinvT, - @restrict QLocal_t dtEst) { - for (int el = 0; el < numElements; ++el; @tile(1, @outer, @inner, check=false)) { - double s_gradv[9 * NUM_QUAD_3D] @dim(3, 3, NUM_QUAD_3D); - - for (int i = 0; i < (9 * NUM_QUAD_3D); ++i) { - s_gradv[i] = 0; - } - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) { - double vDxy[3 * NUM_QUAD_2D] @dim(3, NUM_QUAD_1D, NUM_QUAD_1D); - double vxDy[3 * NUM_QUAD_2D] @dim(3, NUM_QUAD_1D, NUM_QUAD_1D); - double vxy[3 * NUM_QUAD_2D] @dim(3, NUM_QUAD_1D, NUM_QUAD_1D); - for (int i = 0; i < (3 * NUM_QUAD_2D); ++i) { - vDxy[i] = 0; - vxDy[i] = 0; - vxy[i] = 0; - } - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) { - double vDx[3 * NUM_QUAD_1D] @dim(3, NUM_QUAD_1D); - double vx[3 * NUM_QUAD_1D] @dim(3, NUM_QUAD_1D); - for (int i = 0; i < (3 * NUM_QUAD_1D); ++i) { - vDx[i] = 0; - vx[i] = 0; - } - - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - for (int vi = 0; vi < 3; ++vi) { - vDx(vi, qx) += v(vi, dx, dy, dz, el) * dofToQuadD(qx, dx); - vx(vi, qx) += v(vi, dx, dy, dz, el) * dofToQuad(qx, dx); - } - } - } - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double wy = dofToQuad(qy, dy); - const double wDy = dofToQuadD(qy, dy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - for (int vi = 
0; vi < 3; ++vi) { - vDxy(vi, qx, qy) += wy * vDx(vi, qx); - vxDy(vi, qx, qy) += wDy * vx(vi, qx); - vxy(vi, qx, qy) += wy * vx(vi, qx); - } - } - } - } - for (int qz = 0; qz < NUM_DOFS_1D; ++qz) { - const double wz = dofToQuad(qz, dz); - const double wDz = dofToQuadD(qz, dz); - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const int q = qx + qy*NUM_QUAD_1D + qz*NUM_QUAD_2D; - for (int vi = 0; vi < 3; ++vi) { - s_gradv(vi, 0, q) += wz * vDxy(vi, qx, qy); - s_gradv(vi, 1, q) += wz * vxDy(vi, qx, qy); - s_gradv(vi, 2, q) += wDz * vxy(vi, qx, qy); - } - } - } - } - } - - for (int q = 0; q < NUM_QUAD; ++q) { - double q_gradv[9] @dim(3, 3); - double q_stress[9] @dim(3, 3); - - const double invJ_00 = invJ(0, 0, q, el), invJ_10 = invJ(1, 0, q, el), invJ_20 = invJ(2, 0, q, el); - const double invJ_01 = invJ(0, 1, q, el), invJ_11 = invJ(1, 1, q, el), invJ_21 = invJ(2, 1, q, el); - const double invJ_02 = invJ(0, 2, q, el), invJ_12 = invJ(1, 2, q, el), invJ_22 = invJ(2, 2, q, el); - - q_gradv(0, 0) = ((s_gradv(0, 0, q) * invJ_00) + (s_gradv(1, 0, q) * invJ_01) + (s_gradv(2, 0, q) * invJ_02)); - q_gradv(1, 0) = ((s_gradv(0, 0, q) * invJ_10) + (s_gradv(1, 0, q) * invJ_11) + (s_gradv(2, 0, q) * invJ_12)); - q_gradv(2, 0) = ((s_gradv(0, 0, q) * invJ_20) + (s_gradv(1, 0, q) * invJ_21) + (s_gradv(2, 0, q) * invJ_22)); - - q_gradv(0, 1) = ((s_gradv(0, 1, q) * invJ_00) + (s_gradv(1, 1, q) * invJ_01) + (s_gradv(2, 1, q) * invJ_02)); - q_gradv(1, 1) = ((s_gradv(0, 1, q) * invJ_10) + (s_gradv(1, 1, q) * invJ_11) + (s_gradv(2, 1, q) * invJ_12)); - q_gradv(2, 1) = ((s_gradv(0, 1, q) * invJ_20) + (s_gradv(1, 1, q) * invJ_21) + (s_gradv(2, 1, q) * invJ_22)); - - q_gradv(0, 2) = ((s_gradv(0, 2, q) * invJ_00) + (s_gradv(1, 2, q) * invJ_01) + (s_gradv(2, 2, q) * invJ_02)); - q_gradv(1, 2) = ((s_gradv(0, 2, q) * invJ_10) + (s_gradv(1, 2, q) * invJ_11) + (s_gradv(2, 2, q) * invJ_12)); - q_gradv(2, 2) = ((s_gradv(0, 2, q) * invJ_20) + (s_gradv(1, 2, q) 
* invJ_21) + (s_gradv(2, 2, q) * invJ_22)); - - const double q_gamma = GAMMA(q, el); - const double q_Jw = detJ(q, el) * quadWeights[q]; - - const double q_rho = rho0DetJ0w(q, el) / q_Jw; - const double q_e = max(0.0, e(q, el)); - - const double s = -(q_gamma - 1.0) * q_rho * q_e; - q_stress(0, 0) = s; q_stress(1, 0) = 0; q_stress(2, 0) = 0; - q_stress(0, 1) = 0; q_stress(1, 1) = s; q_stress(2, 1) = 0; - q_stress(0, 2) = 0; q_stress(1, 2) = 0; q_stress(2, 2) = s; - - const double gradv00 = q_gradv(0, 0); - const double gradv11 = q_gradv(1, 1); - const double gradv22 = q_gradv(2, 2); - const double gradv10 = 0.5 * (q_gradv(1, 0) + q_gradv(0, 1)); - const double gradv20 = 0.5 * (q_gradv(2, 0) + q_gradv(0, 2)); - const double gradv21 = 0.5 * (q_gradv(2, 1) + q_gradv(1, 2)); - q_gradv(1, 0) = gradv10; q_gradv(2, 0) = gradv20; - q_gradv(0, 1) = gradv10; q_gradv(2, 1) = gradv21; - q_gradv(0, 2) = gradv20; q_gradv(1, 2) = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00 + gradv11 + gradv22) / 3.0; - const double gradv_q00 = (gradv00 - q_); - const double gradv_q11 = (gradv11 - q_); - const double gradv_q22 = (gradv22 - q_); - - const double p1 = ((gradv10 * gradv10) + - (gradv20 * gradv20) + - (gradv21 * gradv21)); - const double p2 = ((gradv_q00 * gradv_q00) + - (gradv_q11 * gradv_q11) + - (gradv_q22 * gradv_q22) + - (2.0 * p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv * (gradv - q*I)) - const double r = (0.5 * pinv * pinv * pinv * - ((gradv_q00 * gradv_q11 * gradv_q22) + - (2.0 * gradv10 * gradv21 * gradv20) - - (gradv_q11 * gradv20 * gradv20) - - (gradv_q22 * gradv10 * gradv10) - - (gradv_q00 * gradv21 * gradv21))); - - double phi = 0; - if (r <= -1.0) { - phi = M_PI / 3.0; - } else if (r < 1.0) { - phi = acos(r) / 3.0; - } - - minEig = q_ + (2.0 * p * cos(phi + (2.0 * M_PI / 3.0))); - const 
double eig3 = q_ + (2.0 * p * cos(phi)); - const double eig2 = 3.0 * q_ - minEig - eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) { - const double x = q_gradv[i + 3*0] - (i == 0)*eig3; - const double y = q_gradv[i + 3*1] - (i == 1)*eig3; - const double z = q_gradv[i + 3*2] - (i == 2)*eig3; - const double cx = ((x * (gradv00 - eig2)) + - (y * gradv10) + - (z * gradv20)); - const double cy = ((x * gradv10) + - (y * (gradv11 - eig2)) + - (z * gradv21)); - const double cz = ((x * gradv20) + - (y * gradv21) + - (z * (gradv22 - eig2))); - const double cNorm = (cx*cx + cy*cy + cz*cz); - if ((cNorm > 1e-16) && (maxNorm < cNorm)) { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = cNorm; - } - } - if (maxNorm > 1e-16) { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J(0, 0, q, el), J_10 = J(1, 0, q, el), J_20 = J(2, 0, q, el); - const double J_01 = J(0, 1, q, el), J_11 = J(1, 1, q, el), J_21 = J(2, 1, q, el); - const double J_02 = J(0, 2, q, el), J_12 = J(1, 2, q, el), J_22 = J(2, 2, q, el); - - const double invJ0_00 = invJ0(0, 0, q, el), invJ0_10 = invJ0(1, 0, q, el), invJ0_20 = invJ0(2, 0, q, el); - const double invJ0_01 = invJ0(0, 1, q, el), invJ0_11 = invJ0(1, 1, q, el), invJ0_21 = invJ0(2, 1, q, el); - const double invJ0_02 = invJ0(0, 2, q, el), invJ0_12 = invJ0(1, 2, q, el), invJ0_22 = invJ0(2, 2, q, el); - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01) + (J_20 * invJ0_02)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11) + (J_20 * invJ0_12)); - const double Jpi_20 = ((J_00 * invJ0_20) + (J_10 * invJ0_21) + (J_20 * invJ0_22)); - - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01) + (J_21 * invJ0_02)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11) + (J_21 * invJ0_12)); - const double Jpi_21 = ((J_01 * 
invJ0_20) + (J_11 * invJ0_21) + (J_21 * invJ0_22)); - - const double Jpi_02 = ((J_02 * invJ0_00) + (J_12 * invJ0_01) + (J_22 * invJ0_02)); - const double Jpi_12 = ((J_02 * invJ0_10) + (J_12 * invJ0_11) + (J_22 * invJ0_12)); - const double Jpi_22 = ((J_02 * invJ0_20) + (J_12 * invJ0_21) + (J_22 * invJ0_22)); - - const double physDirX = ((Jpi_00 * comprDirX) + (Jpi_10 * comprDirY) + (Jpi_20 * comprDirZ)); - const double physDirY = ((Jpi_01 * comprDirX) + (Jpi_11 * comprDirY) + (Jpi_21 * comprDirZ)); - const double physDirZ = ((Jpi_02 * comprDirX) + (Jpi_12 * comprDirY) + (Jpi_22 * comprDirZ)); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY) + (physDirZ * physDirZ)); - - const double soundSpeed = sqrt(q_gamma * (q_gamma - 1.0) * q_e); - dtEst(q, el) = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < 3; ++y) { - for (int x = 0; x < 3; ++x) { - q_stress(x, y) += coeff * q_gradv(x, y); - } - } - } - - const double S00 = q_stress(0, 0), S10 = q_stress(1, 0), S20 = q_stress(2, 0); - const double S01 = q_stress(0, 1), S11 = q_stress(1, 1), S21 = q_stress(2, 1); - const double S02 = q_stress(0, 2), S12 = q_stress(1, 2), S22 = q_stress(2, 2); - - stressJinvT(0, 0, q, el) = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01) + (S20 * invJ_02)); - stressJinvT(1, 0, q, el) = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11) + (S20 * invJ_12)); - stressJinvT(2, 0, q, el) = q_Jw * ((S00 * invJ_20) + (S10 * invJ_21) + (S20 * invJ_22)); - - stressJinvT(0, 1, q, el) = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01) + (S21 * invJ_02)); - stressJinvT(1, 1, q, el) = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11) + (S21 * invJ_12)); - stressJinvT(2, 1, q, el) = q_Jw * ((S01 * invJ_20) + (S11 * invJ_21) + (S21 * invJ_22)); - - stressJinvT(0, 2, q, el) = q_Jw * ((S02 * 
invJ_00) + (S12 * invJ_01) + (S22 * invJ_02)); - stressJinvT(1, 2, q, el) = q_Jw * ((S02 * invJ_10) + (S12 * invJ_11) + (S22 * invJ_12)); - stressJinvT(2, 2, q, el) = q_Jw * ((S02 * invJ_20) + (S12 * invJ_21) + (S22 * invJ_22)); - } - } -} diff --git a/occa/kernels/force.okl b/occa/kernels/force.okl deleted file mode 100644 index 701cd429..00000000 --- a/occa/kernels/force.okl +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "occa://mfem/fem/defines.okl" - -#ifdef OCCA_USING_GPU -# include "occa://laghos/gpu/force.okl" -#else -# include "occa://laghos/cpu/force.okl" -#endif diff --git a/occa/kernels/gpu/force.okl b/occa/kernels/gpu/force.okl deleted file mode 100644 index c95c591b..00000000 --- a/occa/kernels/gpu/force.okl +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "occa://mfem/fem/defines.okl" - -#define ELEMENT_BATCH 10 - -#if L2_DOFS_1D > NUM_QUAD_1D -# define L2_MAX_1D L2_DOFS_1D -#else -# define L2_MAX_1D NUM_QUAD_1D -#endif - -#if H1_DOFS_1D > NUM_QUAD_1D -# define H1_MAX_1D H1_DOFS_1D -#else -# define H1_MAX_1D NUM_QUAD_1D -#endif - -#if L2_DOFS_1D > H1_DOFS_1D -# define MAX_DOFS_1D L2_DOFS_1D -#else -# define MAX_DOFS_1D H1_DOFS_1D -#endif - -#if H1_MAX_1D > L2_MAX_1D -# define INNER_SIZE H1_MAX_1D -#else -# define INNER_SIZE L2_MAX_1D -#endif - -#define INNER_SIZE_2D (INNER_SIZE * INNER_SIZE) - -typedef double* L2DofToQuad_t @dim(NUM_QUAD_1D, L2_DOFS_1D); -typedef double* H1DofToQuad_t @dim(NUM_QUAD_1D, H1_DOFS_1D); - -typedef double* L2QuadToDof_t @dim(L2_DOFS_1D , NUM_QUAD_1D); -typedef double* H1QuadToDof_t @dim(H1_DOFS_1D , NUM_QUAD_1D); - -#if VDIM_ORDERING == ORDERING_BY_VDIM -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -#else -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,0); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements) 
@dimOrder(1,2,3,4,0); -#endif - -typedef double* E2D_t @dim(L2_DOFS_1D, L2_DOFS_1D, numElements); -typedef double* E3D_t @dim(L2_DOFS_1D, L2_DOFS_1D, L2_DOFS_1D, numElements); -typedef double* Stress2D_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D, numElements); -typedef double* Stress3D_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D, NUM_QUAD_1D, numElements); - -@kernel void Mult2D(const int numElements, - @restrict const L2DofToQuad_t L2DofToQuad, - @restrict const H1QuadToDof_t H1QuadToDof, - @restrict const H1QuadToDof_t H1QuadToDofD, - @restrict const Stress2D_t stressJinvT, - @restrict const E2D_t e, - @restrict V2D_t v) { - for (int elBlock = 0; elBlock < numElements; elBlock += ELEMENT_BATCH; @outer) { - @shared double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D] @dim(NUM_QUAD_1D, L2_DOFS_1D); - @shared double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D] @dim(H1_DOFS_1D , NUM_QUAD_1D); - @shared double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D] @dim(H1_DOFS_1D , NUM_QUAD_1D); - - @shared double s_xy[MAX_DOFS_1D * NUM_QUAD_1D] @dim(MAX_DOFS_1D, NUM_QUAD_1D); - @shared double s_xDy[H1_DOFS_1D * NUM_QUAD_1D] @dim(H1_DOFS_1D , NUM_QUAD_1D); - @shared double s_e[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - - for (int idBlock = 0; idBlock < INNER_SIZE; ++idBlock; @inner) { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) { - s_L2DofToQuad[id] = L2DofToQuad[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) { - s_H1QuadToDof[id] = H1QuadToDof[id]; - s_H1QuadToDofD[id] = H1QuadToDofD[id]; - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) { - if (el < numElements) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if (dx < L2_DOFS_1D) { - double r_x[L2_DOFS_1D]; - - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - r_x[dy] = e(dx, dy, el); - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double xy = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - xy += r_x[dy] * s_L2DofToQuad(qy, 
dy); - } - s_xy(dx, qy) = xy; - } - } - } - for (int qy = 0; qy < INNER_SIZE; ++qy; @inner) { - if (qy < NUM_QUAD_1D) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - double r_e = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - r_e += s_xy(dx, qy) * s_L2DofToQuad(qx, dx); - } - s_e(qx, qy) = r_e; - } - } - } - - for (int c = 0; c < NUM_DIM; ++c) { - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if (qx < NUM_QUAD_1D) { - double r_x[NUM_QUAD_1D]; - double r_y[NUM_QUAD_1D]; - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const double r_e = s_e(qx, qy); - r_x[qy] = r_e * stressJinvT(0, c, qx, qy, el); - r_y[qy] = r_e * stressJinvT(1, c, qx, qy, el); - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - double xy = 0; - double xDy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - xy += r_x[qy] * s_H1QuadToDof(dy, qy); - xDy += r_y[qy] * s_H1QuadToDofD(dy, qy); - } - s_xy(dy, qx) = xy; - s_xDy(dy, qx) = xDy; - } - } - } - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if (dx < H1_DOFS_1D) { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - double r_v = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - r_v += ((s_xy(dy, qx) * s_H1QuadToDofD(dx, qx)) + - (s_xDy(dy, qx) * s_H1QuadToDof(dx, qx))); - } - v(c, dx, dy, el) = r_v; - } - } - } - } - } - } - } -} - -@kernel void MultTranspose2D(const int numElements, - @restrict const L2QuadToDof_t L2QuadToDof, - @restrict const H1DofToQuad_t H1DofToQuad, - @restrict const H1DofToQuad_t H1DofToQuadD, - @restrict const Stress2D_t stressJinvT, - @restrict const V2D_t v, - @restrict E2D_t e) { - for (int elBlock = 0; elBlock < numElements; elBlock += ELEMENT_BATCH; @outer) { - @shared double s_L2QuadToDof[NUM_QUAD_1D * L2_DOFS_1D] @dim(L2_DOFS_1D , NUM_QUAD_1D); - @shared double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, H1_DOFS_1D); - @shared double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, H1_DOFS_1D); - - @shared double s_xy[MAX_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, MAX_DOFS_1D); - @shared 
double s_xDy[H1_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, H1_DOFS_1D); - @shared double s_v[NUM_QUAD_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - - for (int idBlock = 0; idBlock < INNER_SIZE; ++idBlock; @inner) { - for (int id = idBlock; id < (L2_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) { - s_L2QuadToDof[id] = L2QuadToDof[id]; - } - for (int id = idBlock; id < (H1_DOFS_1D * NUM_QUAD_1D); id += INNER_SIZE) { - s_H1DofToQuad[id] = H1DofToQuad[id]; - s_H1DofToQuadD[id] = H1DofToQuadD[id]; - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) { - if (el < numElements) { - for (int qBlock = 0; qBlock < INNER_SIZE; ++qBlock; @inner) { - for (int q = qBlock; q < NUM_QUAD; ++q) { - s_v[q] = 0; - } - } - for (int c = 0; c < NUM_DIM; ++c) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if (dx < H1_DOFS_1D) { - double r_v[H1_DOFS_1D]; - - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - r_v[dy] = v(c, dx, dy, el); - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double xy = 0; - double xDy = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - xy += r_v[dy] * s_H1DofToQuad(qy, dy); - xDy += r_v[dy] * s_H1DofToQuadD(qy, dy); - } - s_xy(qy, dx) = xy; - s_xDy(qy, dx) = xDy; - } - } - } - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if (qx < NUM_QUAD_1D) { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double Dxy = 0; - double xDy = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - Dxy += (s_xy(qy, dx) * s_H1DofToQuadD(qx, dx)); - xDy += (s_xDy(qy, dx) * s_H1DofToQuad(qx, dx)); - } - s_v(qx, qy) += ((Dxy * stressJinvT(0, c, qx, qy, el)) + - (xDy * stressJinvT(1, c, qx, qy, el))); - } - } - } - } - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if (qx < NUM_QUAD_1D) { - double r_x[NUM_QUAD_1D]; - - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - r_x[qy] = s_v(qx, qy); - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - double xy = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - xy += r_x[qy] * s_L2QuadToDof(dy, qy); - } - s_xy(qx, dy) = xy; - } - 
} - } - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - if (dy < L2_DOFS_1D) { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - double r_e = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - r_e += s_xy(qx, dy) * s_L2QuadToDof(dx, qx); - } - e(dx, dy, el) = r_e; - } - } - } - } - } - } -} - -@kernel void Mult3D(const int numElements, - @restrict const L2DofToQuad_t L2DofToQuad, - @restrict const H1QuadToDof_t H1QuadToDof, - @restrict const H1QuadToDof_t H1QuadToDofD, - @restrict const Stress3D_t stressJinvT, - @restrict const E3D_t e, - @restrict V3D_t v) { - for (int elBlock = 0; elBlock < numElements; elBlock += ELEMENT_BATCH; @outer) { - @shared double s_L2DofToQuad[NUM_QUAD_1D * L2_DOFS_1D] @dim(NUM_QUAD_1D, L2_DOFS_1D); - @shared double s_H1QuadToDof[H1_DOFS_1D * NUM_QUAD_1D] @dim(H1_DOFS_1D , NUM_QUAD_1D); - @shared double s_H1QuadToDofD[H1_DOFS_1D * NUM_QUAD_1D] @dim(H1_DOFS_1D , NUM_QUAD_1D); - - @shared double s_Dxyz[INNER_SIZE_2D] @dim(INNER_SIZE, INNER_SIZE); - @shared double s_xDyz[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - @shared double s_xyDz[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - - @exclusive double r_z[NUM_QUAD_1D]; - - for (int y = 0; y < INNER_SIZE; ++y; @inner) { - for (int x = 0; x < INNER_SIZE; ++x; @inner) { - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) { - s_L2DofToQuad[i] = L2DofToQuad[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) { - s_H1QuadToDof[i] = H1QuadToDof[i]; - s_H1QuadToDofD[i] = H1QuadToDofD[i]; - } - } - } - - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) { - if (el < numElements) { - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) { - // Calculate D -> Q in the Z axis - const double r_e0 = e(dx, dy, 0, el); - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - r_z[qz] = r_e0 * 
s_L2DofToQuad(qz, 0); - } - - for (int dz = 1; dz < L2_DOFS_1D; ++dz) { - const double r_e = e(dx, dy, dz, el); - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - r_z[qz] += r_e * s_L2DofToQuad(qz, dz); - } - } - } - } - } - // For each xy plane - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - // Fill xy plane at given z position - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) { - s_Dxyz(dx, dy) = r_z[qz]; - } - } - } - // Calculate Dxyz, xDyz, xyDz in plane - for (int qy = 0; qy < INNER_SIZE; ++qy; @inner) { - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) { - double q_e = 0; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) { - double q_ex = 0; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) { - q_ex += s_Dxyz(dx, dy) * s_L2DofToQuad(qx, dx); - } - q_e += q_ex * s_L2DofToQuad(qy, dy); - } - r_z[qz] = q_e; - } - } - } - } - for (int c = 0; c < NUM_DIM; ++c) { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - // Fill xy plane at given z position - for (int qy = 0; qy < INNER_SIZE; ++qy; @inner) { - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) { - double r_Dxyz = 0; - double r_xDyz = 0; - double r_xyDz = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - const double r_e = r_z[qz]; - const double wz = s_H1QuadToDof(dz, qz); - const double wDz = s_H1QuadToDofD(dz, qz); - r_Dxyz += r_e * wz * stressJinvT(0, c, qx, qy, qz, el); - r_xDyz += r_e * wz * stressJinvT(1, c, qx, qy, qz, el); - r_xyDz += r_e * wDz * stressJinvT(2, c, qx, qy, qz, el); - } - s_Dxyz(qx, qy) = r_Dxyz; - s_xDyz(qx, qy) = r_xDyz; - s_xyDz(qx, qy) = r_xyDz; - } - } - } - // Finalize solution in xy plane - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) { - double r_v = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - const 
double wy = s_H1QuadToDof(dy, qy); - const double wDy = s_H1QuadToDofD(dy, qy); - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - const double wx = s_H1QuadToDof(dx, qx); - const double wDx = s_H1QuadToDofD(dx, qx); - r_v += ((wDx * wy * s_Dxyz(qx, qy)) + - (wx * wDy * s_xDyz(qx, qy)) + - (wx * wy * s_xyDz(qx, qy))); - } - } - v(c, dx, dy, dz, el) = r_v; - } - } - } - } - } - } - } - } -} - -@kernel void MultTranspose3D(const int numElements, - @restrict const L2QuadToDof_t L2QuadToDof, - @restrict const H1DofToQuad_t H1DofToQuad, - @restrict const H1DofToQuad_t H1DofToQuadD, - @restrict const Stress3D_t stressJinvT, - @restrict const V3D_t v, - @restrict E3D_t e) { - for (int elBlock = 0; elBlock < numElements; elBlock += ELEMENT_BATCH; @outer) { - @shared double s_L2QuadToDof[L2_DOFS_1D * NUM_QUAD_1D] @dim(L2_DOFS_1D , NUM_QUAD_1D); - @shared double s_H1DofToQuad[H1_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, H1_DOFS_1D); - @shared double s_H1DofToQuadD[H1_DOFS_1D * NUM_QUAD_1D] @dim(NUM_QUAD_1D, H1_DOFS_1D); - - @shared double s_xyz[NUM_QUAD_2D * NUM_DIM] @dim(NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D); - @shared double s_xyDz[NUM_QUAD_2D * NUM_DIM] @dim(NUM_DIM, NUM_QUAD_1D, NUM_QUAD_1D); - @shared double s_v[NUM_QUAD_2D] @dim(NUM_QUAD_1D, NUM_QUAD_1D); - - @exclusive double r_xyz[NUM_QUAD_1D * NUM_DIM] @dim(NUM_DIM, NUM_QUAD_1D); - @exclusive double r_xyDz[NUM_QUAD_1D * NUM_DIM] @dim(NUM_DIM, NUM_QUAD_1D); - - for (int y = 0; y < INNER_SIZE; ++y; @inner) { - for (int x = 0; x < INNER_SIZE; ++x; @inner) { - const int id = (y * INNER_SIZE) + x; - for (int i = id; i < (L2_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) { - s_L2QuadToDof[i] = L2QuadToDof[i]; - } - for (int i = id; i < (H1_DOFS_1D * NUM_QUAD_1D); i += (INNER_SIZE*INNER_SIZE)) { - s_H1DofToQuad[i] = H1DofToQuad[i]; - s_H1DofToQuadD[i] = H1DofToQuadD[i]; - } - } - } - for (int el = elBlock; el < (elBlock + ELEMENT_BATCH); ++el) { - if (el < numElements) { - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - 
for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) { - double r_v[NUM_DIM][H1_DOFS_1D]; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - for (int c = 0; c < NUM_DIM; ++c) { - r_v[c][dz] = v(c, dx, dy, dz, el); - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - for (int c = 0; c < NUM_DIM; ++c) { - double xyz = 0; - double xyDz = 0; - for (int dz = 0; dz < H1_DOFS_1D; ++dz) { - xyz += r_v[c][dz] * s_H1DofToQuad(qz, dz); - xyDz += r_v[c][dz] * s_H1DofToQuadD(qz, dz); - } - r_xyz(c, qz) = xyz; - r_xyDz(c, qz) = xyDz; - } - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - // Finalize solution in xy plane - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < H1_DOFS_1D) && (dy < H1_DOFS_1D)) { - for (int c = 0; c < NUM_DIM; ++c) { - s_xyz(c, dx, dy) = r_xyz(c, qz); - s_xyDz(c, dx, dy) = r_xyDz(c, qz); - } - } - } - } - // Finalize solution in xy plane - for (int qy = 0; qy < INNER_SIZE; ++qy; @inner) { - for (int qx = 0; qx < INNER_SIZE; ++qx; @inner) { - if ((qx < NUM_QUAD_1D) && (qy < NUM_QUAD_1D)) { - double r_qv = 0; - for (int c = 0; c < NUM_DIM; ++c) { - double Dxyz = 0; - double xDyz = 0; - double xyDz = 0; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) { - const double wy = s_H1DofToQuad(qy, dy); - const double wDy = s_H1DofToQuadD(qy, dy); - double Dxz = 0; - double xz = 0; - double xDz = 0; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) { - const double wx = s_H1DofToQuad(qx, dx); - const double wDx = s_H1DofToQuadD(qx, dx); - Dxz += wDx * s_xyz(c, dx, dy); - xz += wx * s_xyz(c, dx, dy); - xDz += wx * s_xyDz(c, dx, dy); - } - Dxyz += wy * Dxz; - xDyz += wDy * xz; - xyDz += wy * xDz; - } - r_qv += ((Dxyz * stressJinvT(0, c, qx, qy, qz, el)) + - (xDyz * stressJinvT(1, c, qx, qy, qz, el)) + - (xyDz * stressJinvT(2, c, qx, qy, qz, el))); - } - s_v(qx, qy) = r_qv; - } - } - } - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < 
INNER_SIZE; ++dx; @inner) { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) { - double r_e = 0; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double r_ex = 0; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) { - r_ex += s_v(qx, qy) * s_L2QuadToDof(dx, qx); - } - r_e += r_ex * s_L2QuadToDof(dy, qy); - } - r_xyz[qz] = r_e; - } - } - } - } - for (int dy = 0; dy < INNER_SIZE; ++dy; @inner) { - for (int dx = 0; dx < INNER_SIZE; ++dx; @inner) { - if ((dx < L2_DOFS_1D) && (dy < L2_DOFS_1D)) { - for (int dz = 0; dz < L2_DOFS_1D; ++dz) { - double r_e = 0; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - r_e += r_xyz[qz] * s_L2QuadToDof(dz, qz); - } - e(dx, dy, dz, el) = r_e; - } - } - } - } - } - } - } -} diff --git a/occa/kernels/gpu/quadratureData.okl b/occa/kernels/gpu/quadratureData.okl deleted file mode 100644 index 3463da9c..00000000 --- a/occa/kernels/gpu/quadratureData.okl +++ /dev/null @@ -1,498 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "occa://mfem/fem/defines.okl" - -#ifndef M_PI -# define M_PI 3.14159265358979323846264338327950288 -#endif - -#ifndef GAMMA -# if defined(VORTEX_PROBLEM) -# define GAMMA(q, el) (5.0 / 3.0) -# elif defined(SEDOV_PROBLEM) || defined(SHOCK_TUBE_PROBLEM) -# define GAMMA(q, el) 1.4 -# else -# error "Cannot handle this problem ... yet!" -# endif -#endif - -#if VDIM_ORDERING == ORDERING_BY_VDIM -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements); -#else -typedef double* V2D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,0); -typedef double* V3D_t @dim(NUM_DIM, NUM_DOFS_1D, NUM_DOFS_1D, NUM_DOFS_1D, numElements) @dimOrder(1,2,3,4,0); -#endif - -typedef double* QJacobian_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD, numElements); -typedef double* Stress_t @dim(NUM_DIM, NUM_DIM, NUM_QUAD, numElements); - -@kernel void InitQuadratureData(const int numElements, - @restrict const QLocal_t rho0, - @restrict const QLocal_t detJ, - @restrict const double * quadWeights, - @restrict QLocal_t rho0DetJ0w) { - for (int el = 0; el < numElements; ++el; @outer) { - for (int q = 0; q < NUM_QUAD; ++q; @inner) { - rho0DetJ0w(q, el) = rho0(q, el) * detJ(q, el) * quadWeights[q]; - } - } -} - -@kernel void UpdateQuadratureData2D(const int numElements, - @restrict const DofToQuad_t dofToQuad, - @restrict const DofToQuad_t dofToQuadD, - @restrict const double * quadWeights, - @restrict const V2D_t v, - @restrict const QLocal_t e, - @restrict const QLocal_t rho0DetJ0w, - @restrict const QJacobian_t invJ0, - @restrict const QJacobian_t J, - @restrict const QJacobian_t invJ, - @restrict const QLocal_t detJ, - @restrict Stress_t stressJinvT, - @restrict QLocal_t dtEst) { - for (int el = 0; el < numElements; ++el; @outer) { - @shared double s_dofToQuad[NUM_QUAD_DOFS_1D] @dim(NUM_QUAD_1D, NUM_DOFS_1D); - @shared double s_dofToQuadD[NUM_QUAD_DOFS_1D] 
@dim(NUM_QUAD_1D, NUM_DOFS_1D); - - @shared double s_xy[NUM_DIM * NUM_QUAD_DOFS_1D] @dim(NUM_DIM, NUM_DOFS_1D, NUM_QUAD_1D); - @shared double s_xDy[NUM_DIM * NUM_QUAD_DOFS_1D] @dim(NUM_DIM, NUM_DOFS_1D, NUM_QUAD_1D); - - @shared double s_gradv[NUM_DIM * NUM_DIM * NUM_QUAD_2D] @dim(NUM_DIM, NUM_DIM, NUM_QUAD_2D); - - @exclusive double r_v[NUM_DIM * NUM_DOFS_1D] @dim(NUM_DIM, NUM_DOFS_1D); - - for (int x = 0; x < NUM_MAX_1D; ++x; @inner) { - for (int id = x; id < NUM_QUAD_DOFS_1D; id += NUM_MAX_1D) { - s_dofToQuad[id] = dofToQuad[id]; - s_dofToQuadD[id] = dofToQuadD[id]; - } - } - - for (int dx = 0; dx < NUM_MAX_1D; ++dx; @inner) { - if (dx < NUM_DOFS_1D) { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - for (int vi = 0; vi < NUM_DIM; ++vi) { - s_xy(vi, dx, qy) = 0; - s_xDy(vi, dx, qy) = 0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) { - for (int vi = 0; vi < NUM_DIM; ++vi) { - r_v(vi, dy) = v(vi, dx, dy, el); - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) { - double xy[NUM_DIM]; - double xDy[NUM_DIM]; - for (int vi = 0; vi < NUM_DIM; ++vi) { - xy[vi] = 0; - xDy[vi] = 0; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) { - for (int vi = 0; vi < NUM_DIM; ++vi) { - xy[vi] += r_v(vi, dy) * s_dofToQuad(qy, dy); - xDy[vi] += r_v(vi, dy) * s_dofToQuadD(qy, dy); - } - } - for (int vi = 0; vi < NUM_DIM; ++vi) { - s_xy(vi, dx, qy) = xy[vi]; - s_xDy(vi, dx, qy) = xDy[vi]; - } - } - } - } - - for (int qy = 0; qy < NUM_MAX_1D; ++qy; @inner) { - if (qy < NUM_QUAD_1D) { - for (int qx = 0; qx < NUM_MAX_1D; ++qx) { - double gradX[NUM_DIM]; - double gradY[NUM_DIM]; - for (int vi = 0; vi < NUM_DIM; ++vi) { - gradX[vi] = 0; - gradY[vi] = 0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) { - for (int vi = 0; vi < NUM_DIM; ++vi) { - gradX[vi] += s_xy(vi, dx, qy) * s_dofToQuadD(qx, dx); - gradY[vi] += s_xDy(vi, dx, qy) * s_dofToQuad(qx, dx); - } - } - for (int vi = 0; vi < NUM_DIM; ++vi) { - s_gradv(vi, 0, qx + qy*NUM_QUAD_1D) = gradX[vi]; - s_gradv(vi, 1, qx + qy*NUM_QUAD_1D) = 
gradY[vi]; - } - } - } - } - - for (int qBlock = 0; qBlock < NUM_MAX_1D; ++qBlock; @inner) { - for (int q = qBlock; q < NUM_QUAD; q += NUM_MAX_1D) { - double q_gradv[NUM_DIM * NUM_DIM] @dim(NUM_DIM, NUM_DIM); - double q_stress[NUM_DIM * NUM_DIM] @dim(NUM_DIM, NUM_DIM); - - const double invJ_00 = invJ(0, 0, q, el), invJ_10 = invJ(1, 0, q, el); - const double invJ_01 = invJ(0, 1, q, el), invJ_11 = invJ(1, 1, q, el); - - q_gradv(0, 0) = ((s_gradv(0, 0, q) * invJ_00) + (s_gradv(1, 0, q) * invJ_01)); - q_gradv(1, 0) = ((s_gradv(0, 0, q) * invJ_10) + (s_gradv(1, 0, q) * invJ_11)); - q_gradv(0, 1) = ((s_gradv(0, 1, q) * invJ_00) + (s_gradv(1, 1, q) * invJ_01)); - q_gradv(1, 1) = ((s_gradv(0, 1, q) * invJ_10) + (s_gradv(1, 1, q) * invJ_11)); - - const double q_gamma = GAMMA(q, el); - const double q_Jw = detJ(q, el) * quadWeights[q]; - - const double q_rho = rho0DetJ0w(q, el) / q_Jw; - const double q_e = max(0.0, e(q, el)); - - // TODO: Input OccaVector eos(q,e) -> (stress, soundSpeed) - const double s = -(q_gamma - 1.0) * q_rho * q_e; - q_stress(0, 0) = s; q_stress(1, 0) = 0; - q_stress(0, 1) = 0; q_stress(1, 1) = s; - - const double gradv00 = q_gradv(0, 0); - const double gradv11 = q_gradv(1, 1); - const double gradv10 = 0.5 * (q_gradv(1, 0) + q_gradv(0, 1)); - q_gradv(1, 0) = gradv10; - q_gradv(0, 1) = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) { - minEig = (gradv00 < gradv11) ? 
gradv00 : gradv11; - } else { - const double zeta = (gradv11 - gradv00) / (2.0 * gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta + sqrt(1.0 + zeta*zeta)); - if ((t < 0) != (zeta < 0)) { - t = -t; - } - - const double c = sqrt(1.0 / (1.0 + t*t)); - const double s = c * t; - t *= gradv10; - - if ((gradv00 - t) <= (gradv11 + t)) { - minEig = gradv00 - t; - comprDirX = c; - comprDirY = -s; - } else { - minEig = gradv11 + t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J(0, 0, q, el), J_10 = J(1, 0, q, el); - const double J_01 = J(0, 1, q, el), J_11 = J(1, 1, q, el); - - const double invJ0_00 = invJ0(0, 0, q, el), invJ0_10 = invJ0(1, 0, q, el); - const double invJ0_01 = invJ0(0, 1, q, el), invJ0_11 = invJ0(1, 1, q, el); - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11)); - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11)); - - const double physDirX = (Jpi_00 * comprDirX) + (Jpi_10 * comprDirY); - const double physDirY = (Jpi_01 * comprDirX) + (Jpi_11 * comprDirY); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY)); - - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(q_gamma * (q_gamma - 1.0) * q_e); - dtEst(q, el) = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < NUM_DIM; ++y) { - for (int x = 0; x < NUM_DIM; ++x) { - q_stress(x, y) += coeff * q_gradv(x, y); - } - } - } - const double S00 = q_stress(0, 0), S10 = q_stress(1, 0); - const double S01 = q_stress(0, 1), S11 = q_stress(1, 1); - - stressJinvT(0, 0, 
q, el) = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01)); - stressJinvT(1, 0, q, el) = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11)); - - stressJinvT(0, 1, q, el) = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01)); - stressJinvT(1, 1, q, el) = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11)); - } - } - } -} - -@kernel void UpdateQuadratureData3D(const int numElements, - @restrict const DofToQuad_t dofToQuad, - @restrict const DofToQuad_t dofToQuadD, - @restrict const double * quadWeights, - @restrict const V3D_t v, - @restrict const QLocal_t e, - @restrict const QLocal_t rho0DetJ0w, - @restrict const QJacobian_t invJ0, - @restrict const QJacobian_t J, - @restrict const QJacobian_t invJ, - @restrict const QLocal_t detJ, - @restrict Stress_t stressJinvT, - @restrict QLocal_t dtEst) { - for (int el = 0; el < numElements; ++el; @outer) { - @shared double s_dofToQuad[NUM_QUAD_DOFS_1D] @dim(NUM_QUAD_1D, NUM_DOFS_1D); - @shared double s_dofToQuadD[NUM_QUAD_DOFS_1D] @dim(NUM_QUAD_1D, NUM_DOFS_1D); - - for (int y = 0; y < NUM_QUAD_1D; ++y; @inner) { - for (int x = 0; x < NUM_QUAD_1D; ++x; @inner) { - const int id = (y * NUM_QUAD_1D) + x; - for (int i = id; i < (NUM_DOFS_1D * NUM_QUAD_1D); i += NUM_QUAD_2D) { - s_dofToQuad[id] = dofToQuad[id]; - s_dofToQuadD[id] = dofToQuadD[id]; - } - } - } - - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy; @inner) { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx; @inner) { - const int q = qx + qy*NUM_QUAD_1D + qz*NUM_QUAD_2D; - double gradv[9] @dim(3, 3); - double q_gradv[9] @dim(3, 3); - double q_stress[9] @dim(3, 3); - - // Brute-force convertion of dof -> quad for now - for (int i = 0; i < 9; ++i) { - gradv[i] = 0; - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) { - double xy[3]; - double Dxy[3]; - double xDy[3]; - for (int vi = 0; vi < 3; ++vi) { - xy[vi] = Dxy[vi] = xDy[vi] = 0; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) { - double x[3]; - double Dx[3]; - for (int vi = 0; vi < 3; ++vi) { - x[vi] = Dx[vi] = 0; - } - 
for (int dx = 0; dx < NUM_DOFS_1D; ++dx) { - const double wx = s_dofToQuad(qx, dx); - const double wDx = s_dofToQuadD(qx, dx); - for (int vi = 0; vi < 3; ++vi) { - const double r_v = v(vi, dx, dy, dz, el); - x[vi] += wx * r_v; - Dx[vi] += wDx * r_v; - } - } - const double wy = s_dofToQuad(qy, dy); - const double wDy = s_dofToQuadD(qy, dy); - for (int vi = 0; vi < 3; ++vi) { - xy[vi] += wy * x[vi]; - Dxy[vi] += wy * Dx[vi]; - xDy[vi] += wDy * x[vi]; - } - } - const double wz = s_dofToQuad(qz, dz); - const double wDz = s_dofToQuadD(qz, dz); - for (int vi = 0; vi < 3; ++vi) { - gradv(vi, 0) += wz * Dxy[vi]; - gradv(vi, 1) += wz * xDy[vi]; - gradv(vi, 2) += wDz * xy[vi]; - } - } - - const double invJ_00 = invJ(0, 0, q, el), invJ_10 = invJ(1, 0, q, el), invJ_20 = invJ(2, 0, q, el); - const double invJ_01 = invJ(0, 1, q, el), invJ_11 = invJ(1, 1, q, el), invJ_21 = invJ(2, 1, q, el); - const double invJ_02 = invJ(0, 2, q, el), invJ_12 = invJ(1, 2, q, el), invJ_22 = invJ(2, 2, q, el); - - q_gradv(0, 0) = ((gradv(0, 0) * invJ_00) + (gradv(1, 0) * invJ_01) + (gradv(2, 0) * invJ_02)); - q_gradv(1, 0) = ((gradv(0, 0) * invJ_10) + (gradv(1, 0) * invJ_11) + (gradv(2, 0) * invJ_12)); - q_gradv(2, 0) = ((gradv(0, 0) * invJ_20) + (gradv(1, 0) * invJ_21) + (gradv(2, 0) * invJ_22)); - - q_gradv(0, 1) = ((gradv(0, 1) * invJ_00) + (gradv(1, 1) * invJ_01) + (gradv(2, 1) * invJ_02)); - q_gradv(1, 1) = ((gradv(0, 1) * invJ_10) + (gradv(1, 1) * invJ_11) + (gradv(2, 1) * invJ_12)); - q_gradv(2, 1) = ((gradv(0, 1) * invJ_20) + (gradv(1, 1) * invJ_21) + (gradv(2, 1) * invJ_22)); - - q_gradv(0, 2) = ((gradv(0, 2) * invJ_00) + (gradv(1, 2) * invJ_01) + (gradv(2, 2) * invJ_02)); - q_gradv(1, 2) = ((gradv(0, 2) * invJ_10) + (gradv(1, 2) * invJ_11) + (gradv(2, 2) * invJ_12)); - q_gradv(2, 2) = ((gradv(0, 2) * invJ_20) + (gradv(1, 2) * invJ_21) + (gradv(2, 2) * invJ_22)); - - const double q_gamma = GAMMA(q, el); - const double q_Jw = detJ(q, el) * quadWeights[q]; - - const double q_rho = 
rho0DetJ0w(q, el) / q_Jw; - const double q_e = max(0.0, e(q, el)); - - const double s = -(q_gamma - 1.0) * q_rho * q_e; - q_stress(0, 0) = s; q_stress(1, 0) = 0; q_stress(2, 0) = 0; - q_stress(0, 1) = 0; q_stress(1, 1) = s; q_stress(2, 1) = 0; - q_stress(0, 2) = 0; q_stress(1, 2) = 0; q_stress(2, 2) = s; - - const double gradv00 = q_gradv(0, 0); - const double gradv11 = q_gradv(1, 1); - const double gradv22 = q_gradv(2, 2); - const double gradv10 = 0.5 * (q_gradv(1, 0) + q_gradv(0, 1)); - const double gradv20 = 0.5 * (q_gradv(2, 0) + q_gradv(0, 2)); - const double gradv21 = 0.5 * (q_gradv(2, 1) + q_gradv(1, 2)); - q_gradv(1, 0) = gradv10; q_gradv(2, 0) = gradv20; - q_gradv(0, 1) = gradv10; q_gradv(2, 1) = gradv21; - q_gradv(0, 2) = gradv20; q_gradv(1, 2) = gradv21; - - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00 + gradv11 + gradv22) / 3.0; - const double gradv_q00 = (gradv00 - q_); - const double gradv_q11 = (gradv11 - q_); - const double gradv_q22 = (gradv22 - q_); - - const double p1 = ((gradv10 * gradv10) + - (gradv20 * gradv20) + - (gradv21 * gradv21)); - const double p2 = ((gradv_q00 * gradv_q00) + - (gradv_q11 * gradv_q11) + - (gradv_q22 * gradv_q22) + - (2.0 * p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv * (gradv - q*I)) - const double r = (0.5 * pinv * pinv * pinv * - ((gradv_q00 * gradv_q11 * gradv_q22) + - (2.0 * gradv10 * gradv21 * gradv20) - - (gradv_q11 * gradv20 * gradv20) - - (gradv_q22 * gradv10 * gradv10) - - (gradv_q00 * gradv21 * gradv21))); - - double phi = 0; - if (r <= -1.0) { - phi = M_PI / 3.0; - } else if (r < 1.0) { - phi = acos(r) / 3.0; - } - - minEig = q_ + (2.0 * p * cos(phi + (2.0 * M_PI / 3.0))); - const double eig3 = q_ + (2.0 * p * cos(phi)); - const double eig2 = 3.0 * q_ - minEig - eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) { - const double x 
= q_gradv[i + 3*0] - (i == 0)*eig3; - const double y = q_gradv[i + 3*1] - (i == 1)*eig3; - const double z = q_gradv[i + 3*2] - (i == 2)*eig3; - const double cx = ((x * (gradv00 - eig2)) + - (y * gradv10) + - (z * gradv20)); - const double cy = ((x * gradv10) + - (y * (gradv11 - eig2)) + - (z * gradv21)); - const double cz = ((x * gradv20) + - (y * gradv21) + - (z * (gradv22 - eig2))); - const double cNorm = (cx*cx + cy*cy + cz*cz); - if ((cNorm > 1e-16) && (maxNorm < cNorm)) { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = cNorm; - } - } - if (maxNorm > 1e-16) { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J(0, 0, q, el), J_10 = J(1, 0, q, el), J_20 = J(2, 0, q, el); - const double J_01 = J(0, 1, q, el), J_11 = J(1, 1, q, el), J_21 = J(2, 1, q, el); - const double J_02 = J(0, 2, q, el), J_12 = J(1, 2, q, el), J_22 = J(2, 2, q, el); - - const double invJ0_00 = invJ0(0, 0, q, el), invJ0_10 = invJ0(1, 0, q, el), invJ0_20 = invJ0(2, 0, q, el); - const double invJ0_01 = invJ0(0, 1, q, el), invJ0_11 = invJ0(1, 1, q, el), invJ0_21 = invJ0(2, 1, q, el); - const double invJ0_02 = invJ0(0, 2, q, el), invJ0_12 = invJ0(1, 2, q, el), invJ0_22 = invJ0(2, 2, q, el); - - const double Jpi_00 = ((J_00 * invJ0_00) + (J_10 * invJ0_01) + (J_20 * invJ0_02)); - const double Jpi_10 = ((J_00 * invJ0_10) + (J_10 * invJ0_11) + (J_20 * invJ0_12)); - const double Jpi_20 = ((J_00 * invJ0_20) + (J_10 * invJ0_21) + (J_20 * invJ0_22)); - - const double Jpi_01 = ((J_01 * invJ0_00) + (J_11 * invJ0_01) + (J_21 * invJ0_02)); - const double Jpi_11 = ((J_01 * invJ0_10) + (J_11 * invJ0_11) + (J_21 * invJ0_12)); - const double Jpi_21 = ((J_01 * invJ0_20) + (J_11 * invJ0_21) + (J_21 * invJ0_22)); - - const double Jpi_02 = ((J_02 * invJ0_00) + (J_12 * invJ0_01) + (J_22 * invJ0_02)); - const double Jpi_12 = ((J_02 
* invJ0_10) + (J_12 * invJ0_11) + (J_22 * invJ0_12)); - const double Jpi_22 = ((J_02 * invJ0_20) + (J_12 * invJ0_21) + (J_22 * invJ0_22)); - - const double physDirX = ((Jpi_00 * comprDirX) + (Jpi_10 * comprDirY) + (Jpi_20 * comprDirZ)); - const double physDirY = ((Jpi_01 * comprDirX) + (Jpi_11 * comprDirY) + (Jpi_21 * comprDirZ)); - const double physDirZ = ((Jpi_02 * comprDirX) + (Jpi_12 * comprDirY) + (Jpi_22 * comprDirZ)); - - const double q_h = H0 * sqrt((physDirX * physDirX) + (physDirY * physDirY) + (physDirZ * physDirZ)); - - const double soundSpeed = sqrt(q_gamma * (q_gamma - 1.0) * q_e); - dtEst(q, el) = CFL * q_h / soundSpeed; - - if (USE_VISCOSITY) { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0 * q_rho * q_h * q_h * fabs(mu); - if (mu < 0) { - coeff += 0.5 * q_rho * q_h * soundSpeed; - } - for (int y = 0; y < 3; ++y) { - for (int x = 0; x < 3; ++x) { - q_stress(x, y) += coeff * q_gradv(x, y); - } - } - } - - const double S00 = q_stress(0, 0), S10 = q_stress(1, 0), S20 = q_stress(2, 0); - const double S01 = q_stress(0, 1), S11 = q_stress(1, 1), S21 = q_stress(2, 1); - const double S02 = q_stress(0, 2), S12 = q_stress(1, 2), S22 = q_stress(2, 2); - - stressJinvT(0, 0, q, el) = q_Jw * ((S00 * invJ_00) + (S10 * invJ_01) + (S20 * invJ_02)); - stressJinvT(1, 0, q, el) = q_Jw * ((S00 * invJ_10) + (S10 * invJ_11) + (S20 * invJ_12)); - stressJinvT(2, 0, q, el) = q_Jw * ((S00 * invJ_20) + (S10 * invJ_21) + (S20 * invJ_22)); - - stressJinvT(0, 1, q, el) = q_Jw * ((S01 * invJ_00) + (S11 * invJ_01) + (S21 * invJ_02)); - stressJinvT(1, 1, q, el) = q_Jw * ((S01 * invJ_10) + (S11 * invJ_11) + (S21 * invJ_12)); - stressJinvT(2, 1, q, el) = q_Jw * ((S01 * invJ_20) + (S11 * invJ_21) + (S21 * invJ_22)); - - stressJinvT(0, 2, q, el) = q_Jw * ((S02 * invJ_00) + (S12 * invJ_01) + (S22 * invJ_02)); - stressJinvT(1, 2, q, el) = q_Jw * ((S02 * invJ_10) + (S12 * invJ_11) + (S22 * invJ_12)); - stressJinvT(2, 2, q, el) = q_Jw 
* ((S02 * invJ_20) + (S12 * invJ_21) + (S22 * invJ_22)); - } - } - } - } -} diff --git a/occa/kernels/quadratureData.okl b/occa/kernels/quadratureData.okl deleted file mode 100644 index 0f2c2a11..00000000 --- a/occa/kernels/quadratureData.okl +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "occa://mfem/fem/defines.okl" - -#ifdef OCCA_USING_GPU -# include "occa://laghos/gpu/quadratureData.okl" -#else -# include "occa://laghos/cpu/quadratureData.okl" -#endif diff --git a/occa/laghos.cpp b/occa/laghos.cpp deleted file mode 100644 index 5c9470e1..00000000 --- a/occa/laghos.cpp +++ /dev/null @@ -1,759 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -// -// __ __ -// / / ____ ____ / /_ ____ _____ -// / / / __ `/ __ `/ __ \/ __ \/ ___/ -// / /___/ /_/ / /_/ / / / / /_/ (__ ) -// /_____/\__,_/\__, /_/ /_/\____/____/ -// /____/ -// -// High-order Lagrangian Hydrodynamics Miniapp -// -// OCCA version -// -// Laghos(LAGrangian High-Order Solver) is a miniapp that solves the -// time-dependent Euler equation of compressible gas dynamics in a moving -// Lagrangian frame using unstructured high-order finite element spatial -// discretization and explicit high-order time-stepping. Laghos is based on the -// numerical algorithm described in the following article: -// -// V. Dobrev, Tz. Kolev and R. Rieben, "High-order curvilinear finite element -// methods for Lagrangian hydrodynamics", SIAM Journal on Scientific -// Computing, (34) 2012, pp. B606–B641, https://doi.org/10.1137/120864672. 
-// -// Sample runs: -// mpirun -np 8 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m ../data/square01_tri.mesh -rs 1 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 2.0 -// mpirun -np 8 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -// mpirun -np 8 laghos -p 1 -m ../data/square01_quad.mesh -rs 0 -tf 0.8 -ok 7 -ot 6 -// mpirun -np 8 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -// -// Test problems: -// p = 0 --> Taylor-Green vortex (smooth problem). -// p = 1 --> Sedov blast. - - -#include "laghos_solver.hpp" -#include -#include -#include - -using namespace std; -using namespace mfem; -using namespace mfem::hydrodynamics; - -ProblemOption problem = sedov; - -void display_banner(ostream & os); - -int main(int argc, char *argv[]) -{ - // Initialize MPI. - MPI_Session mpi(argc, argv); - int myid = mpi.WorldRank(); - - mpiout.setup(); - - // Print the banner. - if (mpi.Root()) { display_banner(cout); } - - // Parse command-line options. 
- const char *mesh_file = "../data/cube01_hex.mesh"; - int rs_levels = 2; - int rp_levels = 0; - int order_v = 2; - int order_e = 1; - ODESolverOption odeSolverType = RK4; - double t_final = 0.6; - double cfl = 0.5; - double cg_tol = 1e-8; - int cg_max_iter = 300; - int max_tsteps = -1; - bool visualization = false; - int vis_steps = 5; - bool visit = false; - bool gfprint = false; - const char *basename = "results/Laghos"; - int partition_type = 111; - - std::string device_info_str = "mode: 'Serial'"; - const char *device_info = ""; - const char *occa_config = ""; - bool occa_verbose = false; - - OptionsParser args(argc, argv); - args.AddOption(&mesh_file, "-m", "--mesh", - "Mesh file to use."); - args.AddOption(&rs_levels, "-rs", "--refine-serial", - "Number of times to refine the mesh uniformly in serial."); - args.AddOption(&rp_levels, "-rp", "--refine-parallel", - "Number of times to refine the mesh uniformly in parallel."); - args.AddOption((int*) &problem, "-p", "--problem", "Problem setup to use."); - args.AddOption(&order_v, "-ok", "--order-kinematic", - "Order (degree) of the kinematic finite element space."); - args.AddOption(&order_e, "-ot", "--order-thermo", - "Order (degree) of the thermodynamic finite element space."); - args.AddOption((int*) &odeSolverType, "-s", "--ode-solver", - "ODE solver: 1 - Forward Euler,\n\t" - " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6."); - args.AddOption(&t_final, "-tf", "--t-final", - "Final time; start time is 0."); - args.AddOption(&cfl, "-cfl", "--cfl", "CFL-condition number."); - args.AddOption(&cg_tol, "-cgt", "--cg-tol", - "Relative CG tolerance (velocity linear solve)."); - args.AddOption(&cg_max_iter, "-cgm", "--cg-max-steps", - "Maximum number of CG iterations (velocity linear solve)."); - args.AddOption(&max_tsteps, "-ms", "--max-steps", - "Maximum number of steps (negative means no restriction)."); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or 
disable GLVis visualization."); - args.AddOption(&vis_steps, "-vs", "--visualization-steps", - "Visualize every n-th timestep."); - args.AddOption(&visit, "-visit", "--visit", "-no-visit", "--no-visit", - "Enable or disable VisIt visualization."); - args.AddOption(&gfprint, "-print", "--print", "-no-print", "--no-print", - "Enable or disable result output (files in mfem format)."); - args.AddOption(&basename, "-k", "--outputfilename", - "Name of the visit dump files"); - args.AddOption(&partition_type, "-pt", "--partition", - "Customized x/y/z Cartesian MPI partitioning of the serial mesh.\n\t" - "Here x,y,z are relative task ratios in each direction.\n\t" - "Example: with 48 mpi tasks and -pt 321, one would get a Cartesian\n\t" - "partition of the serial mesh by (6,4,2) MPI tasks in (x,y,z).\n\t" - "NOTE: the serially refined mesh must have the appropriate number\n\t" - "of zones in each direction, e.g., the number of zones in direction x\n\t" - "must be divisible by the number of MPI tasks in direction x.\n\t" - "Available options: 11, 21, 111, 211, 221, 311, 321, 322, 432."); - args.AddOption(&device_info, "-d", "--device-info", - "Device information to run example on (default: \"mode: 'Serial'\")."); - args.AddOption(&occa_config, - "-oc", "--occa-config", - "Load OCCA information from the .json config file. 
--device-info overrides the config"); - args.AddOption(&occa_verbose, - "-ov", "--occa-verbose", - "--no-ov", "--no-occa-verbose", - "Print verbose information about OCCA kernel compilation."); - args.Parse(); - if (!args.Good()) - { - if (mpi.Root()) { args.PrintUsage(cout); } - return 1; - } - if (mpi.Root()) { args.PrintOptions(cout); } - - if (strlen(occa_config)) - { - occa::json config = occa::json::parse(occa_config); - if (!config.has("devices")) - { - std::cout << "Config file \"" << occa_config << "\" does not have 'devices'.\n"; - return 1; - } - - occa::json devices = config["devices"]; - occa::json specificDevices = config["specificDevices"]; - - const std::string mpiRankStr = occa::toString(myid); - - if (specificDevices.has(mpiRankStr)) - { - device_info_str = specificDevices[mpiRankStr].toString(); - } - else - { - const int procsPerNode = devices.size(); - const int deviceID = (myid % procsPerNode); - device_info_str = devices[deviceID].toString(); - } - - device_info = device_info_str.c_str(); - } - else if (!strlen(device_info)) - { - device_info = device_info_str.c_str(); - } - - mfem::Init(); - occa::io::addLibraryPath( - "laghos", - occa::io::dirname(__FILE__) + "/kernels" - ); - - // Set the OCCA device to run example in - occa::setDevice(device_info); - - // Load cached kernels - occa::loadKernels(); - occa::loadKernels("mfem"); - occa::loadKernels("laghos"); - - // Set as the background device - occa::settings()["verboseCompilation"] = occa_verbose; - - // Set properties that determine the problem - occa::properties props = GetProblemProperties(); - props["defines/PROBLEM"] = problem; - - // Read the serial mesh from the given mesh file on all processors. - // Refine the mesh in serial to increase the resolution. - Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } - - // Parallel partitioning of the mesh. 
- ParMesh *pmesh = NULL; - const int num_tasks = mpi.WorldSize(); int unit; - int *nxyz = new int[dim]; - switch (partition_type) - { - case 11: - case 111: - unit = floor(pow(num_tasks, 1.0 / dim) + 1e-2); - for (int d = 0; d < dim; d++) { nxyz[d] = unit; } - if (dim == 2) { nxyz[2] = 0; } - break; - case 21: // 2D - unit = floor(pow(num_tasks / 2, 1.0 / 2) + 1e-2); - nxyz[0] = 2 * unit; nxyz[1] = unit; nxyz[2] = 0; - break; - case 211: // 3D. - unit = floor(pow(num_tasks / 2, 1.0 / 3) + 1e-2); - nxyz[0] = 2 * unit; nxyz[1] = unit; nxyz[2] = unit; - break; - case 221: // 3D. - unit = floor(pow(num_tasks / 4, 1.0 / 3) + 1e-2); - nxyz[0] = 2 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; - break; - case 311: // 3D. - unit = floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2); - nxyz[0] = 3 * unit; nxyz[1] = unit; nxyz[2] = unit; - break; - case 321: // 3D. - unit = floor(pow(num_tasks / 6, 1.0 / 3) + 1e-2); - nxyz[0] = 3 * unit; nxyz[1] = 2 * unit; nxyz[2] = unit; - break; - case 322: // 3D. - unit = floor(pow(2 * num_tasks / 3, 1.0 / 3) + 1e-2); - nxyz[0] = 3 * unit / 2; nxyz[1] = unit; nxyz[2] = unit; - break; - case 432: // 3D. - unit = floor(pow(num_tasks / 3, 1.0 / 3) + 1e-2); - nxyz[0] = 2 * unit; nxyz[1] = 3 * unit / 2; nxyz[2] = unit; - break; - default: - if (myid == 0) - { - cout << "Unknown partition type: " << partition_type << '\n'; - } - delete mesh; - MPI_Finalize(); - return 3; - } - int product = 1; - for (int d = 0; d < dim; d++) { product *= nxyz[d]; } - if (product == num_tasks) - { - int *partitioning = mesh->CartesianPartitioning(nxyz); - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); - delete partitioning; - } - else - { - if (myid == 0) - { - cout << "Non-Cartesian partitioning through METIS will be used.\n"; -#ifndef MFEM_USE_METIS - cout << "MFEM was built without METIS. " - << "Adjust the number of tasks to use a Cartesian split." 
<< endl; -#endif - } -#ifndef MFEM_USE_METIS - return 1; -#endif - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh); - } - delete [] nxyz; - delete mesh; - - // Refine the mesh further in parallel to increase the resolution. - for (int lev = 0; lev < rp_levels; lev++) { pmesh->UniformRefinement(); } - - int nzones = pmesh->GetNE(), nzones_min, nzones_max; - MPI_Reduce(&nzones, &nzones_min, 1, MPI_INT, MPI_MIN, 0, pmesh->GetComm()); - MPI_Reduce(&nzones, &nzones_max, 1, MPI_INT, MPI_MAX, 0, pmesh->GetComm()); - if (myid == 0) - { cout << "Zones min/max: " << nzones_min << " " << nzones_max << endl; } - - // Define the parallel finite element spaces. We use: - // - H1 (Gauss-Lobatto, continuous) for position and velocity. - // - L2 (Bernstein, discontinuous) for specific internal energy. - L2_FECollection L2FEC(order_e, dim, BasisType::Positive); - H1_FECollection H1FEC(order_v, dim); - OccaFiniteElementSpace o_L2FESpace(pmesh, &L2FEC, Ordering::byNODES); - OccaFiniteElementSpace o_H1FESpace(pmesh, &H1FEC, pmesh->Dimension(), - Ordering::byNODES); - - // Boundary conditions: all tests use v.n = 0 on the boundary, and we assume - // that the boundaries are straight. - Array ess_tdofs; - { - Array ess_bdr(pmesh->bdr_attributes.Max()), tdofs1d; - for (int d = 0; d < pmesh->Dimension(); d++) - { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[d] = 1; - o_H1FESpace.GetFESpace()->GetEssentialTrueDofs(ess_bdr, tdofs1d, d); - ess_tdofs.Append(tdofs1d); - } - } - - // Define the explicit ODE solver used for time integration. 
- ODESolver *ode_solver = NULL; - switch (odeSolverType) - { - case ForwardEuler: ode_solver = new OccaForwardEulerSolver; break; - case RK2: ode_solver = new OccaRK2Solver(0.5); break; - case RK3: ode_solver = new OccaRK3SSPSolver; break; - case RK4: ode_solver = new OccaRK4Solver; break; - case RK6: ode_solver = new OccaRK6Solver; break; - default: - if (myid == 0) - { - cout << "Unknown ODE solver type: " << odeSolverType << '\n'; - } - delete pmesh; - MPI_Finalize(); - return 3; - } - - HYPRE_Int glob_size_l2 = o_L2FESpace.GetGlobalTrueVSize(); - HYPRE_Int glob_size_h1 = o_H1FESpace.GetGlobalTrueVSize(); - - if (mpi.Root()) - { - cout << "Number of kinematic (position, velocity) dofs: " - << glob_size_h1 << endl; - cout << "Number of specific internal energy dofs: " - << glob_size_l2 << endl; - } - - const int Vsize_l2 = o_L2FESpace.GetVSize(); - const int Vsize_h1 = o_H1FESpace.GetVSize(); - - // The monolithic BlockVector stores unknown fields as: - // - (H1) position - // - (H1) velocity - // - (L2) specific internal energy - OccaVector S(2*Vsize_h1 + Vsize_l2); - - // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always - // compute the density values given the current mesh position, using the - // property of pointwise mass conservation. - ParGridFunction x_gf((ParFiniteElementSpace*) o_H1FESpace.GetFESpace()); - ParGridFunction v_gf((ParFiniteElementSpace*) o_H1FESpace.GetFESpace()); - ParGridFunction e_gf((ParFiniteElementSpace*) o_L2FESpace.GetFESpace()); - - OccaGridFunction o_x_gf(&o_H1FESpace, S.GetRange(0 , Vsize_h1)); - OccaGridFunction o_v_gf(&o_H1FESpace, S.GetRange(Vsize_h1 , Vsize_h1)); - OccaGridFunction o_e_gf(&o_L2FESpace, S.GetRange(2*Vsize_h1, Vsize_l2)); - - // Initialize x_gf using the starting mesh coordinates. This also links the - // mesh positions to the values in x_gf. 
- pmesh->SetNodalGridFunction(&x_gf); - o_x_gf = x_gf; - - // Initialize the velocity. - VectorFunctionCoefficient v_coeff(pmesh->Dimension(), v0); - v_gf.ProjectCoefficient(v_coeff); - o_v_gf = v_gf; - - // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. Then we do an - // L2 projection to the positive basis in which we actually compute. The goal - // is to get a high-order representation of the initial condition. Note that - // this density is a temporary function and it will not be updated during the - // time evolution. - ParGridFunction rho((ParFiniteElementSpace*) o_L2FESpace.GetFESpace()); - FunctionCoefficient rho_coeff(hydrodynamics::rho0); - L2_FECollection l2_fec(order_e, pmesh->Dimension()); - OccaFiniteElementSpace o_l2_fes(pmesh, &l2_fec, Ordering::byNODES); - - ParFiniteElementSpace *l2_fes = (ParFiniteElementSpace*) o_l2_fes.GetFESpace(); - ParGridFunction l2_rho(l2_fes), l2_e(l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); - if (problem == sedov) - { - // For the Sedov test, we use a delta function at the origin. - DeltaCoefficient e_coeff(0, 0, 0.25); - l2_e.ProjectCoefficient(e_coeff); - } - else - { - FunctionCoefficient e_coeff(e0); - l2_e.ProjectCoefficient(e_coeff); - } - e_gf.ProjectGridFunction(l2_e); - - OccaGridFunction o_rho(&o_L2FESpace); - o_rho = rho; - o_e_gf = e_gf; - - // Piecewise constant ideal gas coefficient over the Lagrangian mesh. The - // gamma values are projected on a function that stays constant on the moving - // mesh. - L2_FECollection mat_fec(0, pmesh->Dimension()); - ParFiniteElementSpace mat_fes(pmesh, &mat_fec); - ParGridFunction mat_gf(&mat_fes); - FunctionCoefficient mat_coeff(hydrodynamics::gamma); - mat_gf.ProjectCoefficient(mat_coeff); - GridFunctionCoefficient *mat_gf_coeff = new GridFunctionCoefficient(&mat_gf); - - // Additional details, depending on the problem. 
- bool use_viscosity; double gamma; - switch (problem) - { - case vortex: - use_viscosity = false; break; - case sedov: - case shockTube: - case triplePoint: - use_viscosity = true; break; - default: - MFEM_ABORT("Wrong problem specification!"); - } - - LagrangianHydroOperator oper(problem, o_H1FESpace, o_L2FESpace, - ess_tdofs, o_rho, cfl, mat_gf_coeff, - use_viscosity, cg_tol, cg_max_iter, - props); - - socketstream vis_rho, vis_v, vis_e; - char vishost[] = "localhost"; - int visport = 19916; - - ParGridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - if (visualization) - { - // Make sure all MPI ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - vis_rho.precision(8); - vis_v.precision(8); - vis_e.precision(8); - - int Wx = 0, Wy = 0; // window position - const int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets - - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww, Wh); - } - - // Save data for VisIt visualization. - VisItDataCollection visit_dc(basename, pmesh); - if (visit) - { - visit_dc.RegisterField("Density", &rho_gf); - visit_dc.RegisterField("Velocity", &v_gf); - visit_dc.RegisterField("Specific Internal Energy", &e_gf); - visit_dc.SetCycle(0); - visit_dc.SetTime(0.0); - visit_dc.Save(); - } - - // Perform time-integration (looping over the time iterations, ti, with a - // time-step dt). The object oper is of type LagrangianHydroOperator that - // defines the Mult() method that used by the time integrators. 
- ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; - bool last_step = false; - int steps = 0; - OccaVector S_old(S); - for (int ti = 1; !last_step; ti++) - { - if (t + dt >= t_final) - { - dt = t_final - t; - last_step = true; - } - if (steps == max_tsteps) { last_step = true; } - - S_old = S; - t_old = t; - oper.ResetTimeStepEstimate(); - - // S is the vector of dofs, t is the current time, and dt is the time step - // to advance. - ode_solver->Step(S, t, dt); - steps++; - - // Adaptive time step control. - const double dt_est = oper.GetTimeStepEstimate(S); - if (dt_est < dt) - { - // Repeat (solve again) with a decreased time step - decrease of the - // time estimate suggests appearance of oscillations. - dt *= 0.85; - if (dt < numeric_limits::epsilon()) - { MFEM_ABORT("The time step crashed!"); } - t = t_old; - S = S_old; - oper.ResetQuadratureData(); - if (mpi.Root()) { cout << "Repeating step " << ti << endl; } - ti--; continue; - } - else if (dt_est > 1.25 * dt) { dt *= 1.02; } - - // Make sure that the mesh corresponds to the new solution state. 
- x_gf = o_x_gf; - pmesh->NewNodes(x_gf, false); - - if (last_step || (ti % vis_steps) == 0) - { - double loc_norm = o_e_gf * o_e_gf; - double tot_norm; - MPI_Allreduce(&loc_norm, &tot_norm, 1, MPI_DOUBLE, MPI_SUM, - pmesh->GetComm()); - if (mpi.Root()) - { - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(tot_norm) << endl; - } - - // Make sure all ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - if (visualization || visit || gfprint) { oper.ComputeDensity(rho_gf); } - if (visualization) - { - int Wx = 0, Wy = 0; // window position - int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets - - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww,Wh); - Wx += offx; - } - - if (visit) - { - visit_dc.SetCycle(ti); - visit_dc.SetTime(t); - visit_dc.Save(); - } - - if (gfprint) - { - ostringstream mesh_name, rho_name, v_name, e_name; - mesh_name << basename << "_" << ti - << "_mesh." << setfill('0') << setw(6) << myid; - rho_name << basename << "_" << ti - << "_rho." << setfill('0') << setw(6) << myid; - v_name << basename << "_" << ti - << "_v." << setfill('0') << setw(6) << myid; - e_name << basename << "_" << ti - << "_e." 
<< setfill('0') << setw(6) << myid; - - ofstream mesh_ofs(mesh_name.str().c_str()); - mesh_ofs.precision(8); - pmesh->Print(mesh_ofs); - mesh_ofs.close(); - - ofstream rho_ofs(rho_name.str().c_str()); - rho_ofs.precision(8); - rho_gf.Save(rho_ofs); - rho_ofs.close(); - - ofstream v_ofs(v_name.str().c_str()); - v_ofs.precision(8); - v_gf.Save(v_ofs); - v_ofs.close(); - - ofstream e_ofs(e_name.str().c_str()); - e_ofs.precision(8); - e_gf.Save(e_ofs); - e_ofs.close(); - } - } - } - - switch (odeSolverType) - { - case RK2: steps *= 2; break; - case RK3: steps *= 3; break; - case RK4: steps *= 4; break; - case RK6: steps *= 6; break; - default:; - } - oper.PrintTimingData(mpi.Root(), steps); - - if (visualization) - { - vis_v.close(); - vis_e.close(); - } - - // Free the used memory. - delete ode_solver; - delete pmesh; - delete mat_gf_coeff; - - return 0; -} - -namespace mfem -{ - -namespace hydrodynamics -{ - -double rho0(const Vector &x) -{ - switch (problem) - { - case vortex: - return 1.0; - case sedov: - return 1.0; - case shockTube: - if (x(0) < 0.5) { return 1.0; } - return 0.1; - case triplePoint: - if (x(0) > 1.0 && x(1) <= 1.5) { return 1.0; } - return 0.125; - default: - MFEM_ABORT("Bad number given for problem id!"); - return 0.0; - } -} - -double gamma(const Vector &x) -{ - switch (problem) - { - case vortex: - return 5./3.; - case sedov: - return 1.4; - case shockTube: - return 1.4; - case triplePoint: - if (x(0) > 1.0 && x(1) <= 1.5) { return 1.4; } - return 1.5; - default: - MFEM_ABORT("Bad number given for problem id!"); - return 0.0; - } -} - -void v0(const Vector &x, Vector &v) -{ - switch (problem) - { - case vortex: - v(0) = sin(M_PI*x(0)) * cos(M_PI*x(1)); - v(1) = -cos(M_PI*x(0)) * sin(M_PI*x(1)); - if (x.Size() == 3) - { - v(0) *= cos(M_PI*x(2)); - v(1) *= cos(M_PI*x(2)); - v(2) = 0.0; - } - break; - case sedov: - case shockTube: - case triplePoint: - v = 0.0; break; - default: - MFEM_ABORT("Bad number given for problem id!"); - } -} - -double 
e0(const Vector &x) -{ - switch (problem) - { - case vortex: - { - const double denom = 2.0 / 3.0; // (5/3 - 1) * density. - double val; - if (x.Size() == 2) - { - val = 1.0 + (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) / 4.0; - } - else - { - val = 100.0 + ((cos(2*M_PI*x(2)) + 2) * - (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) - 2) / 16.0; - } - return val/denom; - } - case sedov: - // This case in initialized in main(). - return 0.0; - case shockTube: - if (x(0) < 0.5) { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - return 0.1 / rho0(x) / (gamma(x) - 1.0); - case triplePoint: - if (x(0) > 1.0) { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - return 1.0 / rho0(x) / (gamma(x) - 1.0); - default: - MFEM_ABORT("Bad number given for problem id!"); - return 0.0; - } -} - -} // namespace hydrodynamics - -} // namespace mfem - -void display_banner(ostream & os) -{ - os << endl - << " __ __ " << endl - << " / / ____ ____ / /_ ____ _____ " << endl - << " / / / __ `/ __ `/ __ \\/ __ \\/ ___/ " << endl - << " / /___/ /_/ / /_/ / / / / /_/ (__ ) " << endl - << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl - << " /____/ " << endl << endl; -} diff --git a/occa/laghos_assembly.cpp b/occa/laghos_assembly.cpp deleted file mode 100644 index 589b6813..00000000 --- a/occa/laghos_assembly.cpp +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project (17-SC-20-SC) -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -MpiOstream mpiout; - -namespace mfem -{ -namespace hydrodynamics -{ -QuadratureData::QuadratureData(int dim, - int elements, - int nqp) -{ - Setup(occa::getDevice(), dim, elements, nqp); -} - -QuadratureData::QuadratureData(occa::device device_, - int dim, - int elements, - int nqp) -{ - Setup(device_, dim, elements, nqp); -} - -void QuadratureData::Setup(occa::device device_, - int dim, - int elements, - int nqp) -{ - device = device_; - - rho0DetJ0w.SetSize(device, nqp * elements); - stressJinvT.SetSize(device, dim * dim * nqp * elements); - dtEst.SetSize(device, nqp * elements); -} - -void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - const IntegrationRule &integ_rule, - Vector &rho0DetJ0w, - Vector &elvect) -{ - const int ip_cnt = integ_rule.GetNPoints(); - Vector shape(fe.GetDof()); - - elvect.SetSize(fe.GetDof()); - elvect = 0.0; - - for (int q = 0; q < ip_cnt; q++) - { - fe.CalcShape(integ_rule.IntPoint(q), shape); - // Note that rhoDetJ = rho0DetJ0. 
- shape *= rho0DetJ0w(Tr.ElementNo*ip_cnt + q); - elvect += shape; - } -} - -OccaMassOperator::OccaMassOperator(OccaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : Operator(fes_.GetTrueVSize()), - device(occa::getDevice()), - fes(fes_), - integ_rule(integ_rule_), - bilinearForm(&fes), - quad_data(quad_data_), - x_gf(device, &fes), - y_gf(device, &fes) {} - -OccaMassOperator::OccaMassOperator(occa::device device_, - OccaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : Operator(fes_.GetTrueVSize()), - device(device_), - fes(fes_), - integ_rule(integ_rule_), - bilinearForm(&fes), - quad_data(quad_data_), - x_gf(device, &fes), - y_gf(device, &fes) {} - -void OccaMassOperator::Setup() -{ - dim = fes.GetMesh()->Dimension(); - elements = fes.GetMesh()->GetNE(); - - ess_tdofs_count = 0; - - OccaMassIntegrator &massInteg = *(new OccaMassIntegrator()); - massInteg.SetIntegrationRule(integ_rule); - massInteg.SetOperator(quad_data->rho0DetJ0w); - - bilinearForm.AddDomainIntegrator(&massInteg); - bilinearForm.Assemble(); - - bilinearForm.FormOperator(Array(), massOperator); -} - -void OccaMassOperator::SetEssentialTrueDofs(Array &dofs) -{ - ess_tdofs_count = dofs.Size(); - if (ess_tdofs_count == 0) - { - return; - } - if (ess_tdofs.size() < ess_tdofs_count) - { - ess_tdofs = device.malloc(ess_tdofs_count * sizeof(int), - dofs.GetData()); - } - else - { - ess_tdofs.copyFrom(dofs.GetData(), - ess_tdofs_count * sizeof(int)); - } -} - -void OccaMassOperator::Mult(const OccaVector &x, OccaVector &y) const -{ - distX = x; - if (ess_tdofs_count) - { - distX.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } - - massOperator->Mult(distX, y); - - if (ess_tdofs_count) - { - y.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - -void OccaMassOperator::EliminateRHS(OccaVector &b) -{ - if (ess_tdofs_count) - { - b.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - 
-OccaForceOperator::OccaForceOperator(OccaFiniteElementSpace &h1fes_, - OccaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : Operator(l2fes_.GetTrueVSize(), h1fes_.GetTrueVSize()), - device(occa::getDevice()), - dim(h1fes_.GetMesh()->Dimension()), - elements(h1fes_.GetMesh()->GetNE()), - h1fes(h1fes_), - l2fes(l2fes_), - integ_rule(integ_rule_), - quad_data(quad_data_), - gVecL2(device, l2fes.GetLocalDofs() * elements), - gVecH1(device, h1fes.GetVDim() * h1fes.GetLocalDofs() * elements) {} - -OccaForceOperator::OccaForceOperator(occa::device device_, - OccaFiniteElementSpace &h1fes_, - OccaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : Operator(l2fes_.GetTrueVSize(), h1fes_.GetTrueVSize()), - device(device_), - dim(h1fes_.GetMesh()->Dimension()), - elements(h1fes_.GetMesh()->GetNE()), - h1fes(h1fes_), - l2fes(l2fes_), - integ_rule(integ_rule_), - quad_data(quad_data_), - gVecL2(device, l2fes.GetLocalDofs() * elements), - gVecH1(device, h1fes.GetVDim() * h1fes.GetLocalDofs() * elements) {} - -void OccaForceOperator::Setup() -{ - occa::properties h1Props, l2Props, props; - SetProperties(h1fes, integ_rule, h1Props); - SetProperties(l2fes, integ_rule, l2Props); - - props = h1Props; - props["defines/L2_DOFS_1D"] = l2Props["defines/NUM_DOFS_1D"]; - props["defines/H1_DOFS_1D"] = h1Props["defines/NUM_DOFS_1D"]; - - multKernel = device.buildKernel("occa://laghos/force.okl", - stringWithDim("Mult", dim), - props); - - multTransposeKernel = device.buildKernel("occa://laghos/force.okl", - stringWithDim("MultTranspose", dim), - props); - - h1D2Q = OccaDofQuadMaps::Get(device, h1fes, integ_rule); - l2D2Q = OccaDofQuadMaps::Get(device, l2fes, integ_rule); -} - -void OccaForceOperator::Mult(const OccaVector &vecL2, OccaVector &vecH1) const -{ - l2fes.GlobalToLocal(vecL2, gVecL2); - - multKernel(elements, - l2D2Q.dofToQuad, - h1D2Q.quadToDof, - h1D2Q.quadToDofD, - 
quad_data->stressJinvT, - gVecL2, - gVecH1); - - h1fes.LocalToGlobal(gVecH1, vecH1); -} - -void OccaForceOperator::MultTranspose(const OccaVector &vecH1, - OccaVector &vecL2) const -{ - h1fes.GlobalToLocal(vecH1, gVecH1); - - multTransposeKernel(elements, - l2D2Q.quadToDof, - h1D2Q.dofToQuad, - h1D2Q.dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - - l2fes.LocalToGlobal(gVecL2, vecL2); -} -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/occa/laghos_assembly.hpp b/occa/laghos_assembly.hpp deleted file mode 100644 index 80644bc1..00000000 --- a/occa/laghos_assembly.hpp +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_ASSEMBLY -#define MFEM_LAGHOS_ASSEMBLY - -#include "mfem.hpp" - - -#ifdef MFEM_USE_MPI - -#include -#include -#include "mpi_utils.hpp" - -namespace mfem -{ -namespace hydrodynamics -{ -// Container for all data needed at quadrature points. -struct QuadratureData -{ - // TODO: use QuadratureFunctions? 
- occa::device device; - - // Reference to physical Jacobian for the initial mesh. These are computed - // only at time zero and stored here. - OccaVector Jac0inv; - - // Quadrature data used for full/partial assembly of the force operator. At - // each quadrature point, it combines the stress, inverse Jacobian, - // determinant of the Jacobian and the integration weight. It must be - // recomputed in every time step. - OccaVector stressJinvT; - - // Quadrature data used for full/partial assembly of the mass matrices. At - // time zero, we compute and store (rho0 * det(J0) * qp_weight) at each - // quadrature point. Note the at any other time, we can compute - // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass - // conservation. - OccaVector rho0DetJ0w; - - // Initial length scale. This represents a notion of local mesh size. We - // assume that all initial zones have similar size. - double h0; - - // Estimate of the minimum time step over all quadrature points. This is - // recomputed at every time step to achieve adaptive time stepping. - double dt_est; - - // Occa stuff - occa::properties props; - - OccaDofQuadMaps dqMaps; - OccaGeometry geom; - OccaVector dtEst; - - QuadratureData(int dim, - int elements, - int nqp); - - QuadratureData(occa::device device_, - int dim, - int elements, - int nqp); - - void Setup(occa::device device_, - int dim, - int elements, - int nqp); -}; - -// This class is used only for visualization. It assembles (rho, phi) in each -// zone, which is used by LagrangianHydroOperator::ComputeDensity to do an L2 -// projection of the density. 
-class DensityIntegrator -{ -private: - const QuadratureData &quad_data; - -public: - DensityIntegrator(QuadratureData &quad_data_) : quad_data(quad_data_) { } - - void AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - const IntegrationRule &integ_rule, - Vector &rho0DetJ0w, - Vector &elvect); -}; - -class OccaMassOperator : public Operator -{ -private: - occa::device device; - - int dim, elements; - OccaFiniteElementSpace &fes; - - const IntegrationRule &integ_rule; - - int ess_tdofs_count; - occa::memory ess_tdofs; - - OccaBilinearForm bilinearForm; - Operator *massOperator; - - QuadratureData *quad_data; - - // For distributing X - mutable OccaVector distX; - mutable OccaGridFunction x_gf, y_gf; - -public: - OccaMassOperator(OccaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_); - - OccaMassOperator(occa::device device_, - OccaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_); - - void Setup(); - - void SetEssentialTrueDofs(Array &dofs); - - // Can be used for both velocity and specific internal energy. For the case - // of velocity, we only work with one component at a time. - virtual void Mult(const OccaVector &x, OccaVector &y) const; - - void EliminateRHS(OccaVector &b); -}; - -// Performs partial assembly, which corresponds to (and replaces) the use of the -// LagrangianHydroOperator::Force global matrix. 
-class OccaForceOperator : public Operator -{ -private: - occa::device device; - int dim, elements; - - OccaFiniteElementSpace &h1fes, &l2fes; - const IntegrationRule &integ_rule; - - QuadratureData *quad_data; - - occa::kernel multKernel, multTransposeKernel; - - OccaDofQuadMaps l2D2Q, h1D2Q; - mutable OccaVector gVecL2, gVecH1; - - void MultHex(const Vector &vecL2, Vector &vecH1) const; - void MultTransposeHex(const Vector &vecH1, Vector &vecL2) const; - -public: - OccaForceOperator(OccaFiniteElementSpace &h1fes_, - OccaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule, - QuadratureData *quad_data_); - - OccaForceOperator(occa::device device_, - OccaFiniteElementSpace &h1fes_, - OccaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule, - QuadratureData *quad_data_); - - void Setup(); - - virtual void Mult(const OccaVector &vecL2, OccaVector &vecH1) const; - virtual void MultTranspose(const OccaVector &vecH1, OccaVector &vecL2) const; - - ~OccaForceOperator() { } -}; - -} // namespace hydrodynamics -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS_ASSEMBLY diff --git a/occa/laghos_solver.cpp b/occa/laghos_solver.cpp deleted file mode 100644 index 68293e11..00000000 --- a/occa/laghos_solver.cpp +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_solver.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ -namespace hydrodynamics -{ -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x, int y, int w, int h, bool vec) -{ - ParMesh &pmesh = *gf.ParFESpace()->GetParMesh(); - MPI_Comm comm = pmesh.GetComm(); - - int num_procs, myid; - MPI_Comm_size(comm, &num_procs); - MPI_Comm_rank(comm, &myid); - - bool newly_opened = false; - int connection_failed; - - do - { - if (myid == 0) - { - if (!sock.is_open() || !sock) - { - sock.open(vishost, visport); - sock.precision(8); - newly_opened = true; - } - sock << "solution\n"; - } - - pmesh.PrintAsOne(sock); - gf.SaveAsOne(sock); - - if (myid == 0 && newly_opened) - { - sock << "window_title '" << title << "'\n" - << "window_geometry " - << x << " " << y << " " << w << " " << h << "\n" - << "keys maaAcl"; - if ( vec ) { sock << "vvv"; } - sock << endl; - } - - if (myid == 0) - { - connection_failed = !sock && !newly_opened; - } - MPI_Bcast(&connection_failed, 1, MPI_INT, 0, comm); - } - while (connection_failed); -} - -LagrangianHydroOperator::LagrangianHydroOperator(ProblemOption problem_, - OccaFiniteElementSpace &o_H1FESpace_, - OccaFiniteElementSpace &o_L2FESpace_, - Array &essential_tdofs_, - OccaGridFunction &rho0, - double cfl_, - Coefficient *material_, - bool use_viscosity_, - double cg_tol_, - int cg_max_iter_, - occa::properties &props) - : TimeDependentOperator(o_L2FESpace_.GetVSize() + 2*o_H1FESpace_.GetVSize()), - problem(problem_), - device(o_H1FESpace_.GetDevice()), - o_H1FESpace(o_H1FESpace_), 
- o_L2FESpace(o_L2FESpace_), - o_H1compFESpace(o_H1FESpace.GetMesh(), - o_H1FESpace.FEColl(), - 1), - H1FESpace(*((ParFiniteElementSpace*) o_H1FESpace.GetFESpace())), - L2FESpace(*((ParFiniteElementSpace*) o_L2FESpace.GetFESpace())), - essential_tdofs(essential_tdofs_), - dim(H1FESpace.GetMesh()->Dimension()), - elements(H1FESpace.GetMesh()->GetNE()), - l2dofs_cnt(L2FESpace.GetFE(0)->GetDof()), - h1dofs_cnt(H1FESpace.GetFE(0)->GetDof()), - cfl(cfl_), - use_viscosity(use_viscosity_), - Mv(&H1FESpace), - Me_inv(l2dofs_cnt, l2dofs_cnt, elements), - integ_rule(IntRules.Get(H1FESpace.GetMesh()->GetElementBaseGeometry(0), - 3*H1FESpace.GetOrder(0) + L2FESpace.GetOrder(0) - 1)), - cg_max_iter(cg_max_iter_), - cg_rel_tol(cg_tol_), - cg_abs_tol(0.0), - cg_print_level(0), - quad_data(dim, elements, integ_rule.GetNPoints()), - quad_data_is_current(false), - VMass(o_H1compFESpace, integ_rule, &quad_data), - EMass(o_L2FESpace, integ_rule, &quad_data), - Force(o_H1FESpace, o_L2FESpace, integ_rule, &quad_data), - timer() -{ - - Vector rho0_ = rho0; - GridFunction rho0_gf(&L2FESpace, rho0_.GetData()); - GridFunctionCoefficient rho_coeff(&rho0_gf); - - // Standard local assembly and inversion for energy mass matrices. - DenseMatrix Me(l2dofs_cnt); - DenseMatrixInverse inv(&Me); - MassIntegrator mi(rho_coeff, &integ_rule); - for (int el = 0; el < elements; ++el) - { - mi.AssembleElementMatrix(*L2FESpace.GetFE(el), - *L2FESpace.GetElementTransformation(el), Me); - inv.Factor(); - inv.GetInverseMatrix(Me_inv(el)); - } - - // Standard assembly for the velocity mass matrix. - VectorMassIntegrator *vmi = new VectorMassIntegrator(rho_coeff, &integ_rule); - Mv.AddDomainIntegrator(vmi); - Mv.Assemble(); - - // Initial local mesh size (assumes similar cells). 
- double loc_area = 0.0, glob_area; - int glob_z_cnt; - ParMesh *pm = H1FESpace.GetParMesh(); - for (int el = 0; el < elements; ++el) - { - loc_area += pm->GetElementVolume(el); - } - MPI_Allreduce(&loc_area, &glob_area, 1, MPI_DOUBLE, MPI_SUM, pm->GetComm()); - MPI_Allreduce(&elements, &glob_z_cnt, 1, MPI_INT, MPI_SUM, pm->GetComm()); - switch (pm->GetElementBaseGeometry(0)) - { - case Geometry::SQUARE: - quad_data.h0 = sqrt(glob_area / glob_z_cnt); - break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * glob_area / glob_z_cnt); - break; - case Geometry::CUBE: - quad_data.h0 = pow(glob_area / glob_z_cnt, 1.0/3.0); - break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * glob_area / glob_z_cnt, 1.0/3.0); - break; - default: MFEM_ABORT("Unknown zone type!"); - } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); - - // Setup OCCA QuadratureData - quad_data.device = device; - - quad_data.dqMaps = OccaDofQuadMaps::Get(device, - o_H1FESpace, - integ_rule); - quad_data.geom = OccaGeometry::Get(device, - o_H1FESpace, - integ_rule); - - quad_data.Jac0inv = quad_data.geom.invJ; - - OccaVector rhoValues; - rho0.ToQuad(integ_rule, rhoValues); - - SetProperties(o_H1FESpace, integ_rule, quad_data.props); - quad_data.props["defines/H0"] = quad_data.h0; - quad_data.props["defines/CFL"] = cfl; - quad_data.props["defines/USE_VISCOSITY"] = use_viscosity; - quad_data.props += props; - - occa::kernel initKernel = device.buildKernel("occa://laghos/quadratureData.okl", - "InitQuadratureData", - quad_data.props); - - initKernel(elements, - rhoValues, - quad_data.geom.detJ, - quad_data.dqMaps.quadWeights, - quad_data.rho0DetJ0w); - - updateKernel = device.buildKernel("occa://laghos/quadratureData.okl", - stringWithDim("UpdateQuadratureData", dim), - quad_data.props); - - // Needs quad_data.rho0DetJ0w - Force.Setup(); - VMass.Setup(); - EMass.Setup(); -} - -void LagrangianHydroOperator::Mult(const OccaVector &S, - OccaVector &dS_dt) const -{ - dS_dt = 0.0; - - // Make sure 
that the mesh positions correspond to the ones in S. This is - // needed only because some mfem time integrators don't update the solution - // vector at every intermediate stage (hence they don't change the mesh). - const int Vsize_h1 = H1FESpace.GetVSize(); - const int Vsize_l2 = L2FESpace.GetVSize(); - - // The monolithic BlockVector stores the unknown fields as follows: - // - Position - // - Velocity - // - Specific Internal Energy - OccaVector x = S.GetRange(0 , Vsize_h1); - OccaVector v = S.GetRange(Vsize_h1 , Vsize_h1); - OccaVector e = S.GetRange(2*Vsize_h1, Vsize_l2); - - OccaVector dx = dS_dt.GetRange(0 , Vsize_h1); - OccaVector dv = dS_dt.GetRange(Vsize_h1 , Vsize_h1); - OccaVector de = dS_dt.GetRange(2*Vsize_h1, Vsize_l2); - - Vector h_x = x; - ParGridFunction h_px(&H1FESpace, h_x.GetData()); - - o_H1FESpace.GetMesh()->NewNodes(h_px, false); - UpdateQuadratureData(S); - - // Set dx_dt = v (explicit). - dx = v; - - // Solve for velocity. - OccaVector one(Vsize_l2); - OccaVector rhs(Vsize_h1); - one = 1.0; - - timer.sw_force.Start(); - Force.Mult(one, rhs); - timer.sw_force.Stop(); - rhs.Neg(); - - OccaVector B(o_H1compFESpace.GetTrueVSize()); - OccaVector X(o_H1compFESpace.GetTrueVSize()); - - // Partial assembly solve for each velocity component. - dv = 0.0; - - for (int c = 0; c < dim; c++) - { - const int size = o_H1compFESpace.GetVSize(); - OccaVector rhs_c = rhs.GetRange(c*size, size); - OccaVector dv_c = dv.GetRange(c*size, size); - - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., - // we must enforce v_x/y/z = 0 for the velocity components. - Array ess_bdr(H1FESpace.GetParMesh()->bdr_attributes.Max()); - ess_bdr = 0; - ess_bdr[c] = 1; - - o_H1compFESpace.GetProlongationOperator()->MultTranspose(rhs_c, B); - o_H1compFESpace.GetRestrictionOperator()->Mult(dv_c, X); - - // True dofs as if there's only one component. 
- Array c_tdofs; - o_H1compFESpace.GetFESpace()->GetEssentialTrueDofs(ess_bdr, c_tdofs); - - VMass.SetEssentialTrueDofs(c_tdofs); - VMass.EliminateRHS(B); - - { - OccaCGSolver cg(H1FESpace.GetParMesh()->GetComm()); - cg.SetOperator(VMass); - cg.SetRelTol(cg_rel_tol); - cg.SetAbsTol(cg_abs_tol); - cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(0); - timer.sw_cgH1.Start(); - - cg.Mult(B, X); - - timer.sw_cgH1.Stop(); - timer.H1cg_iter += cg.GetNumIterations(); - } - - o_H1compFESpace.GetProlongationOperator()->Mult(X, dv_c); - } - - // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; - if ((problem == vortex) && - (dim == 2)) - { - e_source = new LinearForm(&L2FESpace); - TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); - e_source->AddDomainIntegrator(d); - e_source->Assemble(); - } - - OccaVector forceRHS(Vsize_l2); - timer.sw_force.Start(); - Force.MultTranspose(v, forceRHS); - timer.sw_force.Stop(); - - if (e_source) - { - forceRHS += *e_source; - } - - { - // TODO: Local element-wise CG - OccaCGSolver cg(L2FESpace.GetParMesh()->GetComm()); - cg.SetOperator(EMass); - cg.SetRelTol(sqrt(cg_rel_tol)); - cg.SetAbsTol(sqrt(cg_abs_tol)); - cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(cg_print_level); - - timer.sw_cgL2.Start(); - cg.Mult(forceRHS, de); - timer.sw_cgL2.Stop(); - timer.L2dof_iter += cg.GetNumIterations() * l2dofs_cnt; - } - - delete e_source; - quad_data_is_current = false; -} - -double LagrangianHydroOperator::GetTimeStepEstimate(const OccaVector &S) const -{ - OccaVector x = S.GetRange(0, H1FESpace.GetVSize()); - Vector h_x = x; - ParGridFunction h_px(&H1FESpace, h_x.GetData()); - o_H1FESpace.GetMesh()->NewNodes(h_px, false); - - UpdateQuadratureData(S); - - double glob_dt_est; - MPI_Allreduce(&quad_data.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, - H1FESpace.GetParMesh()->GetComm()); - return glob_dt_est; -} - -void 
LagrangianHydroOperator::ResetTimeStepEstimate() const -{ - quad_data.dt_est = numeric_limits::infinity(); -} - -void LagrangianHydroOperator::ComputeDensity(ParGridFunction &rho) -{ - rho.SetSpace(&L2FESpace); - - DenseMatrix Mrho(l2dofs_cnt); - Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); - Array dofs(l2dofs_cnt); - DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data); - - Vector rho0DetJ0w = quad_data.rho0DetJ0w; - - for (int el = 0; el < elements; ++el) - { - di.AssembleRHSElementVect(*L2FESpace.GetFE(el), - *L2FESpace.GetElementTransformation(el), - integ_rule, - rho0DetJ0w, - rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(el), - *L2FESpace.GetElementTransformation(el), - Mrho); - inv.Factor(); - inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(el, dofs); - rho.SetSubVector(dofs, rho_z); - } -} -void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps) -{ - double my_rt[5], rt_max[5]; - my_rt[0] = timer.sw_cgH1.RealTime(); - my_rt[1] = timer.sw_cgL2.RealTime(); - my_rt[2] = timer.sw_force.RealTime(); - my_rt[3] = timer.sw_qdata.RealTime(); - my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3]; - MPI_Reduce(my_rt, rt_max, 5, MPI_DOUBLE, MPI_MAX, 0, H1FESpace.GetComm()); - - HYPRE_Int mydata[2], alldata[2]; - mydata[0] = timer.L2dof_iter; - mydata[1] = timer.quad_tstep; - MPI_Reduce(mydata, alldata, 2, HYPRE_MPI_INT, MPI_SUM, 0, - H1FESpace.GetComm()); - - if (IamRoot) - { - const HYPRE_Int H1gsize = H1FESpace.GlobalTrueVSize(), - L2gsize = L2FESpace.GlobalTrueVSize(); - using namespace std; - cout << endl; - cout << "CG (H1) total time: " << rt_max[0] << endl; - cout << "CG (H1) rate (megadofs x cg_iterations / second): " - << 1e-6 * H1gsize * timer.H1cg_iter / rt_max[0] << endl; - cout << endl; - cout << "CG (L2) total time: " << rt_max[1] << endl; - cout << "CG (L2) rate (megadofs x cg_iterations / second): " - << 1e-6 * alldata[0] / rt_max[1] << endl; - cout << endl; - // The Force operator is applied twice 
per time step, on the H1 and the L2 - // vectors, respectively. - cout << "Forces total time: " << rt_max[2] << endl; - cout << "Forces rate (megadofs x timesteps / second): " - << 1e-6 * steps * (H1gsize + L2gsize) / rt_max[2] << endl; - cout << endl; - cout << "UpdateQuadData total time: " << rt_max[3] << endl; - cout << "UpdateQuadData rate (megaquads x timesteps / second): " - << 1e-6 * alldata[1] * integ_rule.GetNPoints() / rt_max[3] << endl; - cout << endl; - cout << "Major kernels total time (seconds): " << rt_max[4] << endl; - cout << "Major kernels total rate (megadofs x time steps / second): " - << 1e-6 * H1gsize * steps / rt_max[4] << endl; - } -} - -LagrangianHydroOperator::~LagrangianHydroOperator() {} - -void LagrangianHydroOperator::UpdateQuadratureData(const OccaVector &S) const -{ - if (quad_data_is_current) - { - return; - } - - timer.sw_qdata.Start(); - quad_data_is_current = true; - - const int vSize = o_H1FESpace.GetVSize(); - const int eSize = o_L2FESpace.GetVSize(); - - OccaGridFunction v(&o_H1FESpace, S.GetRange(vSize , vSize)); - OccaGridFunction e(&o_L2FESpace, S.GetRange(2*vSize, eSize)); - - quad_data.geom = OccaGeometry::Get(device, - o_H1FESpace, - integ_rule); - - OccaVector v2(device, - o_H1FESpace.GetVDim() * o_H1FESpace.GetLocalDofs() * elements); - o_H1FESpace.GlobalToLocal(v, v2); - - OccaVector eValues; - e.ToQuad(integ_rule, eValues); - - updateKernel(elements, - quad_data.dqMaps.dofToQuad, - quad_data.dqMaps.dofToQuadD, - quad_data.dqMaps.quadWeights, - v2, - eValues, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom.J, - quad_data.geom.invJ, - quad_data.geom.detJ, - quad_data.stressJinvT, - quad_data.dtEst); - - quad_data.dt_est = quad_data.dtEst.Min(); - - timer.sw_qdata.Stop(); - timer.quad_tstep += elements; -} -} // namespace hydrodynamics -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/occa/laghos_solver.hpp b/occa/laghos_solver.hpp deleted file mode 100644 index 6f73a42d..00000000 --- 
a/occa/laghos_solver.hpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_SOLVER -#define MFEM_LAGHOS_SOLVER - -#include "mfem.hpp" -#include "laghos_assembly.hpp" -#include "laghos_utils.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include -#include - -namespace mfem -{ -namespace hydrodynamics -{ -/// Visualize the given parallel grid function, using a GLVis server on the -/// specified host and port. Set the visualization window title, and optionally, -/// its geometry. -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x = 0, int y = 0, int w = 400, int h = 400, - bool vec = false); - -// These are defined in laghos.cpp -double rho0(const Vector &); -void v0(const Vector &, Vector &); -double e0(const Vector &); -double gamma(const Vector &); - -struct TimingData -{ - // Total times for all major computations: - // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. 
- StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; - - // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solve. - // #dofs * #(CG iterations) for the L2 CG solve. - // #quads * #(RK sub steps) for the quadrature data computations. - int H1cg_iter, L2dof_iter, quad_tstep; - - TimingData() - : H1cg_iter(0), L2dof_iter(0), quad_tstep(0) { } -}; - -// Given a solutions state (x, v, e), this class performs all necessary -// computations to evaluate the new slopes (dx_dt, dv_dt, de_dt). -class LagrangianHydroOperator : public TimeDependentOperator -{ -protected: - const ProblemOption problem; - - occa::device device; - OccaFiniteElementSpace &o_H1FESpace; - OccaFiniteElementSpace &o_L2FESpace; - mutable OccaFiniteElementSpace o_H1compFESpace; - - ParFiniteElementSpace &H1FESpace; - ParFiniteElementSpace &L2FESpace; - - Array &essential_tdofs; - - int dim, elements, l2dofs_cnt, h1dofs_cnt; - double cfl; - bool use_viscosity; - - // Velocity mass matrix and local inverses of the energy mass matrices. These - // are constant in time, due to the pointwise mass conservation property. - mutable ParBilinearForm Mv; - DenseTensor Me_inv; - - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - int cg_print_level, cg_max_iter; - double cg_rel_tol, cg_abs_tol; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current; - - // Force matrix that combines the kinematic and thermodynamic spaces. It is - // assembled in each time step and then it's used to compute the final - // right-hand sides for momentum and specific internal energy. - mutable OccaMassOperator VMass, EMass; - mutable OccaForceOperator Force; - - occa::kernel updateKernel; - - // Linear solver for energy. 
- OccaCGSolver locCG; - - mutable TimingData timer; - - void UpdateQuadratureData(const OccaVector &S) const; - -public: - LagrangianHydroOperator(ProblemOption problem_, - OccaFiniteElementSpace &o_H1FESpace_, - OccaFiniteElementSpace &o_L1FESpace_, - Array &essential_tdofs, - OccaGridFunction &rho0, - double cfl_, - Coefficient *material_, bool use_viscosity_, - double cg_tol_, int cg_max_iter_, - occa::properties &props); - - // Solve for dx_dt, dv_dt and de_dt. - virtual void Mult(const OccaVector &S, OccaVector &dS_dt) const; - - // Calls UpdateQuadratureData to compute the new quad_data.dt_est. - double GetTimeStepEstimate(const OccaVector &S) const; - void ResetTimeStepEstimate() const; - void ResetQuadratureData() const - { - quad_data_is_current = false; - } - - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. - void ComputeDensity(ParGridFunction &rho); - - void PrintTimingData(bool IamRoot, int steps); - - ~LagrangianHydroOperator(); -}; - -class TaylorCoefficient : public Coefficient -{ - virtual double Eval(ElementTransformation &T, - const IntegrationPoint &ip) - { - Vector x(2); - T.Transform(ip, x); - return 3.0 / 8.0 * M_PI * ( cos(3.0*M_PI*x(0)) * cos(M_PI*x(1)) - - cos(M_PI*x(0)) * cos(3.0*M_PI*x(1)) ); - } -}; -} // namespace hydrodynamics -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS diff --git a/occa/laghos_utils.hpp b/occa/laghos_utils.hpp deleted file mode 100644 index d3804c63..00000000 --- a/occa/laghos_utils.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See file LICENSE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "occa.hpp" - -namespace mfem { - namespace hydrodynamics { - enum ProblemOption { - vortex = 0, - sedov = 1, - shockTube = 2, - triplePoint = 3 - }; - - enum ODESolverOption { - ForwardEuler = 1, - RK2 = 2, - RK3 = 3, - RK4 = 4, - RK6 = 6 - }; - - static occa::properties GetProblemProperties() { - occa::properties props; - props["defines/VORTEX_PROBLEM"] = vortex; - props["defines/SEDOV_PROBLEM"] = sedov; - props["defines/SHOCK_TUBE_PROBLEM"] = shockTube; - props["defines/TRIPLE_POINT_PROBLEM"] = triplePoint; - return props; - } - } -} diff --git a/occa/makefile b/occa/makefile deleted file mode 100644 index 707c45fd..00000000 --- a/occa/makefile +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -# reserved. See files LICENSE and NOTICE for details. -# -# This file is part of CEED, a collection of benchmarks, miniapps, software -# libraries and APIs for efficient high-order finite element and spectral -# element discretizations for exascale applications. For more information and -# source code availability see http://github.com/ceed. -# -# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -# a collaborative effort of two U.S. 
Department of Energy organizations (Office -# of Science and the National Nuclear Security Administration) responsible for -# the planning and preparation of a capable exascale ecosystem, including -# software, applications, hardware, advanced system engineering and early -# testbed platforms, in support of the nation's exascale computing imperative. - -define LAGHOS_HELP_MSG - -Laghos makefile targets: - - make - make status/info - make install - make clean - make distclean - make style - -Examples: - -make -j 4 - Build Laghos using the current configuration options from MFEM. - (Laghos requires the MFEM finite element library, and uses its compiler and - linker options in its build process.) -make status - Display information about the current configuration. -make install PREFIX= - Install the Laghos executable in . -make clean - Clean the Laghos executable, library and object files. -make distclean - In addition to "make clean", remove the local installation directory and some - run-time generated files. -make style - Format the Laghos C++ source files using the Artistic Style (astyle) settings - from MFEM. - -endef - -PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) - -# Default installation location -PREFIX = ./bin -INSTALL = /usr/bin/install - -# Use the MFEM build directory -MFEM_DIR = ../../mfem -CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) - -# Use the compiler used by MFEM. Get the compiler and the options for compiling -# and linking from MFEM's config.mk. (Skip this if the target does not require -# building.) 
-MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) - -include $(CONFIG_MK) -endif - -CXX = $(MFEM_CXX) -CPPFLAGS = $(MFEM_CPPFLAGS) -CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - -LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif - -LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) - -SOURCE_FILES = laghos.cpp laghos_solver.cpp laghos_assembly.cpp -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = laghos_solver.hpp laghos_assembly.hpp - -# Targets - -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec - -.SUFFIXES: .c .cpp .o -.cpp.o: - cd $( -#include -#include - -class MpiFlush { -public: - inline MpiFlush() {} -}; - -class MpiOstream { -public: - int rank, procs; - std::stringstream ss; - - static MpiFlush flush; - - inline MpiOstream() {} - - inline void setup() { - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &procs); - } - - inline void _flush() { - std::string message = ss.str(); - ss.str(""); - ss << "[" << rank << "/" << procs << "] "; - std::string prefix = ss.str(); - ss.str(""); - std::string indent = "\n" + std::string(prefix.length(), ' '); - for (int i = 0; i < (int) message.size(); ++i) { - if ((message[i] != '\n') || (i == (message.size() - 1))) { - ss << message[i]; - } else { - ss << indent; - } - } - message = ss.str(); - ss.str(""); - for (int i = 0; i < procs; ++i) { - MPI_Barrier(MPI_COMM_WORLD); - if (i == rank) { 
- std::cout << prefix << message; - } - } - ss.str(""); - } - - template - MpiOstream& operator << (const TM &t) { - ss << t; - return *this; - } -}; - -template <> -inline MpiOstream& MpiOstream::operator << (const MpiFlush &t) { - _flush(); - return *this; -} - -extern MpiOstream mpiout; diff --git a/raja/README.md b/raja/README.md deleted file mode 100644 index e73d1195..00000000 --- a/raja/README.md +++ /dev/null @@ -1,131 +0,0 @@ - __ __ - / / ____ ____ / /_ ____ _____ - / / / __ `/ __ `/ __ \/ __ \/ ___/ - / /___/ /_/ / /_/ / / / / /_/ (__ ) - /_____/\__,_/\__, /_/ /_/\____/____/ - /____/ - - High-order Lagrangian Hydrodynamics Miniapp - - RAJA version - -## Overview - -This directory contains the RAJA version of the **Laghos** (LAGrangian -High-Order Solver), which is provided as a reference implementation and is NOT -the official benchmark version of the miniapp. - -For more details about Laghos see the [README file](../README.md) in the -top-level directory. - -The Laghos miniapp is part of the [CEED software suite](http://ceed.exascaleproject.org/software), -a collection of software benchmarks, miniapps, libraries and APIs for -efficient exascale discretizations based on high-order finite element -and spectral element methods. See http://github.com/ceed for more -information and source code availability. - -The CEED research is supported by the [Exascale Computing Project](https://exascaleproject.org/exascale-computing-project) -(17-SC-20-SC), a collaborative effort of two U.S. Department of Energy -organizations (Office of Science and the National Nuclear Security -Administration) responsible for the planning and preparation of a -[capable exascale ecosystem](https://exascaleproject.org/what-is-exascale), -including software, applications, hardware, advanced system engineering and early -testbed platforms, in support of the nation’s exascale computing imperative. 
- -## Differences with the official benchmark version - -The RAJA version differs from the official benchmark version of Laghos (in the -top-level directory) in the following ways: - -1. Only problems 0 and 1 are defined -2. Final iterations (`step`), time steps (`dt`) and energies (`|e|`) differ from the original version - -## Building - -Follow the steps below to build the RAJA version with GPU acceleration. - -### Environment setup -```sh -export MPI_HOME=~/usr/local/openmpi/3.0.0 -``` - -### Hypre -- -- `tar xzvf hypre-2.11.2.tar.gz` -- ` cd hypre-2.11.2/src` -- `./configure --disable-fortran --with-MPI --with-MPI-include=$MPI_HOME/include --with-MPI-lib-dirs=$MPI_HOME/lib` -- `make -j` -- `cd ../..` - -### Metis -- -- `tar xzvf metis-5.1.0.tar.gz` -- `cd metis-5.1.0` -- ``make config prefix=`pwd` `` -- `make && make install` -- `cd ..` - -### MFEM -- `git clone git@github.com:mfem/mfem.git` -- `cd mfem` -- `git checkout laghos-v2.0` -- ``make config MFEM_USE_MPI=YES HYPRE_DIR=`pwd`/../hypre-2.11.2/src/hypre MFEM_USE_METIS_5=YES METIS_DIR=`pwd`/../metis-5.1.0`` -- `make status` to verify that all the include paths are correct -- `make -j` -- `cd ..` - -### RAJA Laghos -- `git clone git@github.com:CEED/Laghos.git` -- `cd Laghos/raja` -- edit the `makefile`, set NV\_ARCH to the desired architecture and the absolute paths to RAJA\_DIR, CUDA\_DIR, MFEM\_DIR, MPI\_HOME -- `make` to build the RAJA version - -## Running - -The RAJA version can run the same sample test runs as the official benchmark -version of Laghos. 
- -### Options -- -m : Mesh file to use -- -ok : Order (degree) of the kinematic finite element space -- -rs : Number of times to refine the mesh uniformly in serial -- -p : Problem setup to use, Sedov problem is '1' -- -cfl : CFL-condition number -- -ms : Maximum number of steps (negative means no restriction) -- -uvm: Enable or disable Unified Memory -- -aware: Enable or disable MPI CUDA Aware - -## Verification of Results - -To make sure the results are correct, we tabulate reference final iterations -(`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: - -1. `mpirun -np 4 laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -pa` -2. `mpirun -np 4 laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75 -pa` -3. `mpirun -np 4 laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -pa -cfl 0.05` -4. `mpirun -np 4 laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -pa -cfl 0.08` - -| `run` | `step` | `dt` | `e` | -| ----- | ------ | ---- | --- | -| 1. | 333 | 0.000008 | 49.6955373330 | -| 2. | 1036 | 0.000093 | 3390.9635544029 | -| 3. | 1570 | 0.000768 | 46.2901037375 | -| 4. | 486 | 0.000864 | 135.1267396160 | - -An implementation is considered valid if the final energy values are all within -round-off distance from the above reference values. - -## Contact - -You can reach the Laghos team by emailing laghos@llnl.gov or by leaving a -comment in the [issue tracker](https://github.com/CEED/Laghos/issues). - -## Copyright - -The following copyright applies to each file in the CEED software suite, -unless otherwise stated in the file: - -> Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the -> Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights reserved. - -See files LICENSE and NOTICE in the top-level directory for details. 
diff --git a/raja/laghos.cpp b/raja/laghos.cpp deleted file mode 100644 index f708c66c..00000000 --- a/raja/laghos.cpp +++ /dev/null @@ -1,679 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -// -// __ __ -// / / ____ ____ / /_ ____ _____ -// / / / __ `/ __ `/ __ \/ __ \/ ___/ -// / /___/ /_/ / /_/ / / / / /_/ (__ ) -// /_____/\__,_/\__, /_/ /_/\____/____/ -// /____/ -// -// High-order Lagrangian Hydrodynamics Miniapp -// -// RAJA version -// -// Laghos(LAGrangian High-Order Solver) is a miniapp that solves the -// time-dependent Euler equation of compressible gas dynamics in a moving -// Lagrangian frame using unstructured high-order finite element spatial -// discretization and explicit high-order time-stepping. Laghos is based on the -// numerical algorithm described in the following article: -// -// V. Dobrev, Tz. Kolev and R. Rieben, "High-order curvilinear finite element -// methods for Lagrangian hydrodynamics", SIAM Journal on Scientific -// Computing, (34) 2012, pp.B606–B641, https://doi.org/10.1137/120864672. 
-// -// Sample runs: -// mpirun -np 8 laghos -p 0 -m data/square01_quad.mesh -rs 3 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/square01_tri.mesh -rs 1 -tf 0.75 -// mpirun -np 8 laghos -p 0 -m data/cube01_hex.mesh -rs 1 -tf 2.0 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 3 -tf 0.8 -// mpirun -np 8 laghos -p 1 -m data/square01_quad.mesh -rs 0 -tf 0.8 -ok 7 -ot 6 -// mpirun -np 8 laghos -p 1 -m data/cube01_hex.mesh -rs 2 -tf 0.6 -// mpirun -np 8 laghos -p 2 -m data/segment01.mesh -rs 5 -tf 0.2 -// mpirun -np 8 laghos -p 3 -m data/rectangle01_quad.mesh -rs 2 -tf 2.5 -// mpirun -np 8 laghos -p 3 -m data/box01_hex.mesh -rs 1 -tf 2.5 -// -// Test problems: -// p = 0 --> Taylor-Green vortex (smooth problem). -// p = 1 --> Sedov blast. -// p = 2 --> 1D Sod shock tube. -// p = 3 --> Triple point. - - -#include "laghos_solver.hpp" -#include -#include -#include -#include - -using namespace std; -using namespace mfem; -using namespace mfem::hydrodynamics; - -// Choice for the problem setup. -int problem = 0; - -void display_banner(ostream & os); - -int main(int argc, char *argv[]) -{ - // Initialize MPI. - MPI_Session mpi(argc, argv); - int myid = mpi.WorldRank(); - - // Print the banner. - if (mpi.Root()) { display_banner(cout); } - - // Parse command-line options. 
- const char *mesh_file = "../data/square01_quad.mesh"; - int rs_levels = 0; - int rp_levels = 0; - int order_v = 2; - int order_e = 1; - int ode_solver_type = 4; - double t_final = 0.5; - double cfl = 0.5; - double cg_tol = 1e-8; - int cg_max_iter = 300; - int max_tsteps = -1; - bool p_assembly = true; - bool visualization = false; - int vis_steps = 5; - bool visit = false; - bool gfprint = false; - bool cuda = false; - bool hip = false; - bool uvm = false; - bool aware = false; - bool hcpo = false; // do Host Conforming Prolongation Operation - bool sync = false; - - const char *basename = "results/Laghos"; - OptionsParser args(argc, argv); - // Standard Options ********************************************************* - args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); - args.AddOption(&rs_levels, "-rs", "--refine-serial", - "Number of times to refine the mesh uniformly in serial."); - args.AddOption(&rp_levels, "-rp", "--refine-parallel", - "Number of times to refine the mesh uniformly in parallel."); - args.AddOption(&problem, "-p", "--problem", "Problem setup to use."); - args.AddOption(&order_v, "-ok", "--order-kinematic", - "Order (degree) of the kinematic finite element space."); - args.AddOption(&order_e, "-ot", "--order-thermo", - "Order (degree) of the thermodynamic finite element space."); - args.AddOption(&ode_solver_type, "-s", "--ode-solver", - "ODE solver: 1 - Forward Euler,\n\t" - " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6."); - args.AddOption(&t_final, "-tf", "--t-final", - "Final time; start time is 0."); - args.AddOption(&cfl, "-cfl", "--cfl", "CFL-condition number."); - args.AddOption(&cg_tol, "-cgt", "--cg-tol", - "Relative CG tolerance (velocity linear solve)."); - args.AddOption(&cg_max_iter, "-cgm", "--cg-max-steps", - "Maximum number of CG iterations (velocity linear solve)."); - args.AddOption(&max_tsteps, "-ms", "--max-steps", - "Maximum number of steps (negative means no restriction)."); - args.AddOption(&p_assembly, 
"-pa", "--partial-assembly", "-fa", - "--full-assembly", - "Activate 1D tensor-based assembly (partial assembly)."); - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", - "Enable or disable GLVis visualization."); - args.AddOption(&vis_steps, "-vs", "--visualization-steps", - "Visualize every n-th timestep."); - args.AddOption(&visit, "-visit", "--visit", "-no-visit", "--no-visit", - "Enable or disable VisIt visualization."); - args.AddOption(&gfprint, "-print", "--print", "-no-print", "--no-print", - "Enable or disable result output (files in mfem format)."); - args.AddOption(&basename, "-k", "--outputfilename", - "Name of the visit dump files"); - // RAJA Options ************************************************************* - args.AddOption(&cuda, "-cuda", "--cuda", "-no-cuda", "--no-cuda", - "Enable or disable CUDA kernels if you are using RAJA."); - args.AddOption(&hip, "-hip", "--hip", "-no-hip", "--no-hip", - "Enable or disable HIP kernels if you are using RAJA."); - // CUDA Options ************************************************************* - args.AddOption(&uvm, "-uvm", "--uvm", "-no-uvm", "--no-uvm", - "Enable or disable Unified Memory."); - args.AddOption(&aware, "-aware", "--aware", "-no-aware", "--no-aware", - "Enable or disable MPI GPU Aware (GPUDirect)."); - args.AddOption(&hcpo, "-hcpo", "--hcpo", "-not-hcpo", "--no-hcpo", - "Enable or disable Host Conforming Prolongation Operations,\n" - "\twhich transfers ALL the data to the host before communications."); - args.AddOption(&sync, "-sync", "--sync", "-no-sync", "--no-sync", - "Enable or disable Enforced Kernel Synchronization."); - args.Parse(); - if (!args.Good()) - { - if (mpi.Root()) { args.PrintUsage(cout); } - return 1; - } - if (mpi.Root()) { args.PrintOptions(cout); } - - // CUDA set device & options - // ************************************************************************** - rconfig::Get().Setup(mpi.WorldRank(),mpi.WorldSize(), - 
cuda,hip,uvm,aware,hcpo,sync); - - // Read the serial mesh from the given mesh file on all processors. - // Refine the mesh in serial to increase the resolution. - Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } - - if (p_assembly && dim == 1) - { - p_assembly = false; - if (mpi.Root()) - { - cout << "Laghos does not support PA in 1D. Switching to FA." << endl; - } - } - - // Parallel partitioning of the mesh. - // ************************************************************************** - ParMesh *pmesh = NULL; - const int num_tasks = mpi.WorldSize(); - const int partitions = floor(pow(num_tasks, 1.0 / dim) + 1e-2); - int *nxyz = new int[dim]; - int product = 1; - for (int d = 0; d < dim; d++) - { - nxyz[d] = partitions; - product *= partitions; - } - if (product == num_tasks) - { - if (myid == 0) - { - printf("\033[32m[laghos] \033[32;1mCartesian\033[m\033[32m partitioning will be used\033[m\n"); - } - int *partitioning = mesh->CartesianPartitioning(nxyz); - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh, partitioning); - delete[] partitioning; - } - else - { - if (myid == 0) - { - printf("\033[32m[laghos] Non-Cartesian partitioning through METIS will be used\033[m\n"); -#ifndef MFEM_USE_METIS - cout << "MFEM was built without METIS. " - << "Adjust the number of tasks to use a Cartesian split." 
<< endl; -#endif - } -#ifndef MFEM_USE_METIS - return 1; -#endif - pmesh = new ParMesh(MPI_COMM_WORLD, *mesh); - } - delete [] nxyz; - delete mesh; - - // ************************************************************************** - // We need at least some elements in each partition for now -#ifdef MFEM_USE_MPI - int global_pmesh_NE; - const int pmesh_NE=pmesh->GetNE(); - MPI_Allreduce(&pmesh_NE,&global_pmesh_NE,1,MPI_INT,MPI_MIN,pmesh->GetComm()); - if (global_pmesh_NE==0) { - printf("[Laghos] ERROR: pmesh->GetNE()==0!"); - return 1; - } - else { printf("\033[32m[laghos] pmesh->GetNE()=%d\033[m\n",global_pmesh_NE); } - assert(pmesh->GetNE()>0); -#endif - - // Refine the mesh further in parallel to increase the resolution. - for (int lev = 0; lev < rp_levels; lev++) { pmesh->UniformRefinement(); } - - // Define the parallel finite element spaces. We use: - // - H1 (Gauss-Lobatto, continuous) for position and velocity. - // - L2 (Bernstein, discontinuous) for specific internal energy. - L2_FECollection L2FEC(order_e, dim, BasisType::Positive); - H1_FECollection H1FEC(order_v, dim); - RajaFiniteElementSpace L2FESpace(pmesh, &L2FEC); - RajaFiniteElementSpace H1FESpace(pmesh, &H1FEC, pmesh->Dimension()); - - // Boundary conditions: all tests use v.n = 0 on the boundary, - // and we assume that the boundaries are straight. - Array essential_tdofs; - { - Array ess_bdr(pmesh->bdr_attributes.Max()), tdofs1d; - for (int d = 0; d < pmesh->Dimension(); d++) - { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[d] = 1; - H1FESpace.GetEssentialTrueDofs(ess_bdr, tdofs1d, d); - essential_tdofs.Append(tdofs1d); - } - } - - // Define the explicit ODE solver used for time integration. 
- RajaODESolver *ode_solver = NULL; - switch (ode_solver_type) - { - case 1: ode_solver = new RajaForwardEulerSolver; break; - case 2: ode_solver = new RajaRK2Solver(0.5); break; - case 3: ode_solver = new RajaRK3SSPSolver; break; - case 4: ode_solver = new RajaRK4Solver; break; - case 6: ode_solver = new RajaRK6Solver; break; - default: - if (myid == 0) - { - cout << "Unknown ODE solver type: " << ode_solver_type << '\n'; - } - delete pmesh; - MPI_Finalize(); - return 3; - } - - HYPRE_Int glob_size_l2 = L2FESpace.GlobalTrueVSize(); - HYPRE_Int glob_size_h1 = H1FESpace.GlobalTrueVSize(); - - if (mpi.Root()) - { - cout << "Number of kinematic (position, velocity) dofs: " - << glob_size_h1 << endl; - cout << "Number of specific internal energy dofs: " - << glob_size_l2 << endl<< endl; - } - - int Vsize_l2 = L2FESpace.GetVSize(); - int Vsize_h1 = H1FESpace.GetVSize(); - - // The monolithic BlockVector stores unknown fields as: - // - 0 -> position - // - 1 -> velocity - // - 2 -> specific internal energy - Array true_offset(4); - true_offset[0] = 0; - true_offset[1] = true_offset[0] + Vsize_h1; - true_offset[2] = true_offset[1] + Vsize_h1; - true_offset[3] = true_offset[2] + Vsize_l2; - RajaVector S(true_offset[3]); - - // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always - // compute the density values given the current mesh position, using the - // property of pointwise mass conservation. - ParGridFunction x_gf(&H1FESpace); - ParGridFunction v_gf(&H1FESpace); - ParGridFunction e_gf(&L2FESpace); - - RajaGridFunction d_x_gf(H1FESpace, S.GetRange(true_offset[0], true_offset[1])); - RajaGridFunction d_v_gf(H1FESpace, S.GetRange(true_offset[1], true_offset[2])); - RajaGridFunction d_e_gf(L2FESpace, S.GetRange(true_offset[2], true_offset[3])); - - // Initialize x_gf using the starting mesh coordinates. This also links the - // mesh positions to the values in x_gf. 
- pmesh->SetNodalGridFunction(&x_gf); - d_x_gf = x_gf; - - // Initialize the velocity. - VectorFunctionCoefficient v_coeff(pmesh->Dimension(), v0); - v_gf.ProjectCoefficient(v_coeff); - d_v_gf = v_gf; - - // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. Then we do an - // L2 projection to the positive basis in which we actually compute. The goal - // is to get a high-order representation of the initial condition. Note that - // this density is a temporary function and it will not be updated during the - // time evolution. - ParGridFunction rho(&L2FESpace); - FunctionCoefficient rho_coeff(hydrodynamics::rho0); - L2_FECollection l2_fec(order_e, pmesh->Dimension()); - RajaFiniteElementSpace l2_fes(pmesh, &l2_fec); - ParGridFunction l2_rho(&l2_fes), l2_e(&l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); - RajaGridFunction d_rho(L2FESpace); - d_rho = rho; - if (problem == 1) - { - // For the Sedov test, we use a delta function at the origin. - DeltaCoefficient e_coeff(0, 0, 0.25); - l2_e.ProjectCoefficient(e_coeff); - } - else - { - FunctionCoefficient e_coeff(e0); - l2_e.ProjectCoefficient(e_coeff); - } - e_gf.ProjectGridFunction(l2_e); - d_e_gf = e_gf; - - Coefficient *material_pcf = new FunctionCoefficient(hydrodynamics::gamma); - // Piecewise constant ideal gas coefficient over the Lagrangian mesh. The - // gamma values are projected on a function that stays constant on the moving - // mesh. - /*L2_FECollection mat_fec(0, pmesh->Dimension()); - RajaFiniteElementSpace mat_fes(pmesh, &mat_fec); - ParGridFunction mat_gf(&mat_fes); - FunctionCoefficient mat_coeff(hydrodynamics::gamma); - mat_gf.ProjectCoefficient(mat_coeff); - GridFunctionCoefficient *mat_gf_coeff = new GridFunctionCoefficient(&mat_gf); - RajaGridFunction d_mat_gf_coeff(mat_fes); - d_mat_gf_coeff=mat_gf_coeff;*/ - - // Additional details, depending on the problem. 
- int source = 0; bool visc=false; - switch (problem) - { - case 0: if (pmesh->Dimension() == 2) { source = 1; } - visc = false; break; - case 1: visc = true; break; - case 2: visc = true; break; - case 3: visc = true; break; - default: MFEM_ABORT("Wrong problem specification!"); - } - - LagrangianHydroOperator oper(S.Size(), H1FESpace, L2FESpace, - essential_tdofs, d_rho, source, cfl, material_pcf, - visc, p_assembly, cg_tol, cg_max_iter); - - socketstream vis_rho, vis_v, vis_e; - char vishost[] = "localhost"; - int visport = 19916; - - ParGridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - if (visualization) - { - // Make sure all MPI ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - vis_rho.precision(8); - vis_v.precision(8); - vis_e.precision(8); - - int Wx = 0, Wy = 0; // window position - const int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets - - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww, Wh); - } - - // Save data for VisIt visualization - VisItDataCollection visit_dc(basename, pmesh); - if (visit) - { - visit_dc.RegisterField("Density", &rho_gf); - visit_dc.RegisterField("Velocity", &v_gf); - visit_dc.RegisterField("Specific Internal Energy", &e_gf); - visit_dc.SetCycle(0); - visit_dc.SetTime(0.0); - visit_dc.Save(); - } - - // Perform time-integration (looping over the time iterations, ti, with a - // time-step dt). The object oper is of type LagrangianHydroOperator that - // defines the Mult() method that used by the time integrators. 
- ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; - bool last_step = false; - int steps = 0; - RajaVector S_old(S); - - for (int ti = 1; !last_step; ti++) - { - if (t + dt >= t_final) - { - dt = t_final - t; - last_step = true; - } - if (steps == max_tsteps) { last_step = true; } - - S_old = S; - t_old = t; - oper.ResetTimeStepEstimate(); - - // S is the vector of dofs, t is the current time, - // and dt is the time step to advance. - ode_solver->Step(S, t, dt); - steps++; - - // Make sure that the mesh corresponds to the new solution state. - x_gf = d_x_gf; - pmesh->NewNodes(x_gf, false); - - // Adaptive time step control. - const double dt_est = oper.GetTimeStepEstimate(S); - if (dt_est < dt) - { - // Repeat (solve again) with a decreased time step - decrease of the - // time estimate suggests appearance of oscillations. - dt *= 0.85; - if (dt < numeric_limits::epsilon()) - { MFEM_ABORT("The time step crashed!"); } - t = t_old; - S = S_old; - oper.ResetQuadratureData(); - if (mpi.Root()) { cout << "Repeating step " << ti << endl; } - ti--; continue; - } - else if (dt_est > 1.25 * dt) { dt *= 1.02; } - - - if (last_step || (ti % vis_steps) == 0) - { - double loc_norm = d_e_gf * d_e_gf, tot_norm; - MPI_Allreduce(&loc_norm, &tot_norm, 1, MPI_DOUBLE, MPI_SUM, - pmesh->GetComm()); - if (mpi.Root()) - { - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(tot_norm) << endl; - } - - // Make sure all ranks have sent their 'v' solution before initiating - // another set of GLVis connections (one from each rank): - MPI_Barrier(pmesh->GetComm()); - - if (visualization || visit || gfprint) { oper.ComputeDensity(rho_gf); } - if (visualization) - { - int Wx = 0, Wy = 0; // window position - int Ww = 350, Wh = 350; // window size - int offx = Ww+10; // window offsets 
- - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); - Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, Wy, Ww,Wh); - Wx += offx; - } - - if (visit) - { - visit_dc.SetCycle(ti); - visit_dc.SetTime(t); - visit_dc.Save(); - } - - if (gfprint) - { - ostringstream mesh_name, rho_name, v_name, e_name; - mesh_name << basename << "_" << ti - << "_mesh." << setfill('0') << setw(6) << myid; - rho_name << basename << "_" << ti - << "_rho." << setfill('0') << setw(6) << myid; - v_name << basename << "_" << ti - << "_v." << setfill('0') << setw(6) << myid; - e_name << basename << "_" << ti - << "_e." << setfill('0') << setw(6) << myid; - - ofstream mesh_ofs(mesh_name.str().c_str()); - mesh_ofs.precision(8); - pmesh->Print(mesh_ofs); - mesh_ofs.close(); - - ofstream rho_ofs(rho_name.str().c_str()); - rho_ofs.precision(8); - rho_gf.Save(rho_ofs); - rho_ofs.close(); - - ofstream v_ofs(v_name.str().c_str()); - v_ofs.precision(8); - v_gf.Save(v_ofs); - v_ofs.close(); - - ofstream e_ofs(e_name.str().c_str()); - e_ofs.precision(8); - e_gf.Save(e_ofs); - e_ofs.close(); - } - } - } - - switch (ode_solver_type) - { - case 2: steps *= 2; break; - case 3: steps *= 3; break; - case 4: steps *= 4; break; - case 6: steps *= 6; - } - oper.PrintTimingData(mpi.Root(), steps); - - if (visualization) - { - vis_v.close(); - vis_e.close(); - } - - // Free the used memory. 
- delete ode_solver; - delete pmesh; - delete material_pcf; - return 0; -} - -namespace mfem -{ - -namespace hydrodynamics -{ - -double rho0(const Vector &x) -{ - switch (problem) - { - case 0: return 1.0; - case 1: return 1.0; - case 2: if (x(0) < 0.5) { return 1.0; } - else { return 0.1; } - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.0; } - else { return 0.125; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -double gamma(const Vector &x) -{ - switch (problem) - { - case 0: return 5./3.; - case 1: return 1.4; - case 2: return 1.4; - case 3: if (x(0) > 1.0 && x(1) <= 1.5) { return 1.4; } - else { return 1.5; } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -void v0(const Vector &x, Vector &v) -{ - switch (problem) - { - case 0: - v(0) = sin(M_PI*x(0)) * cos(M_PI*x(1)); - v(1) = -cos(M_PI*x(0)) * sin(M_PI*x(1)); - if (x.Size() == 3) - { - v(0) *= cos(M_PI*x(2)); - v(1) *= cos(M_PI*x(2)); - v(2) = 0.0; - } - break; - case 1: v = 0.0; break; - case 2: v = 0.0; break; - case 3: v = 0.0; break; - default: MFEM_ABORT("Bad number given for problem id!"); - } -} - -double e0(const Vector &x) -{ - switch (problem) - { - case 0: - { - const double denom = 2.0 / 3.0; // (5/3 - 1) * density. - double val; - if (x.Size() == 2) - { - val = 1.0 + (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) / 4.0; - } - else - { - val = 100.0 + ((cos(2*M_PI*x(2)) + 2) * - (cos(2*M_PI*x(0)) + cos(2*M_PI*x(1))) - 2) / 16.0; - } - return val/denom; - } - case 1: return 0.0; // This case in initialized in main(). 
- case 2: if (x(0) < 0.5) { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - else { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - case 3: if (x(0) > 1.0) { return 0.1 / rho0(x) / (gamma(x) - 1.0); } - else { return 1.0 / rho0(x) / (gamma(x) - 1.0); } - default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; - } -} - -} // namespace hydrodynamics - -} // namespace mfem - -void display_banner(ostream & os) -{ - os << endl - << " __ __ " << endl - << " / / ____ ____ / /_ ____ _____ " << endl - << " / / / __ `/ __ `/ __ \\/ __ \\/ ___/ " << endl - << " / /___/ /_/ / /_/ / / / / /_/ (__ ) " << endl - << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl - << " /____/ " << endl << endl; -} diff --git a/raja/laghos_assembly.cpp b/raja/laghos_assembly.cpp deleted file mode 100644 index 67bc6894..00000000 --- a/raja/laghos_assembly.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project (17-SC-20-SC) -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -QuadratureData::QuadratureData(int dim, - int nzones, - int nqp) -{ Setup(dim, nzones, nqp); } - - -void QuadratureData::Setup(int dim, - int nzones, - int nqp) -{ - rho0DetJ0w.SetSize(nqp * nzones); - stressJinvT.SetSize(dim * dim * nqp * nzones); - dtEst.SetSize(nqp * nzones); -} - -void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect) -{ - const int ip_cnt = integ_rule.GetNPoints(); - Vector shape(fe.GetDof()); - Vector rho0DetJ0w = quad_data.rho0DetJ0w; - elvect.SetSize(fe.GetDof()); - elvect = 0.0; - for (int q = 0; q < ip_cnt; q++) - { - fe.CalcShape(integ_rule.IntPoint(q), shape); - shape *= rho0DetJ0w(Tr.ElementNo*ip_cnt + q); - elvect += shape; - } -} - -// ***************************************************************************** -RajaMassOperator::RajaMassOperator(RajaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_) - : RajaOperator(fes_.GetTrueVSize()), - fes(fes_), - integ_rule(integ_rule_), - ess_tdofs_count(0), - bilinearForm(&fes), - quad_data(quad_data_), - x_gf(fes), - y_gf(fes) {} - -// ***************************************************************************** -RajaMassOperator::~RajaMassOperator() -{ -} - -// ***************************************************************************** -void RajaMassOperator::Setup() -{ - dim=fes.GetMesh()->Dimension(); - nzones=fes.GetMesh()->GetNE(); - RajaMassIntegrator &massInteg = *(new RajaMassIntegrator()); - massInteg.SetIntegrationRule(integ_rule); - massInteg.SetOperator(quad_data->rho0DetJ0w); - bilinearForm.AddDomainIntegrator(&massInteg); - bilinearForm.Assemble(); - bilinearForm.FormOperator(Array(), massOperator); -} - -// ************************************************************************* -void RajaMassOperator::SetEssentialTrueDofs(Array 
&dofs) -{ - ess_tdofs_count = dofs.Size(); - if (ess_tdofs.Size()==0) - { -#ifdef MFEM_USE_MPI - int global_ess_tdofs_count; - const MPI_Comm comm = fes.GetParMesh()->GetComm(); - MPI_Allreduce(&ess_tdofs_count,&global_ess_tdofs_count, - 1, MPI_INT, MPI_SUM, comm); - assert(global_ess_tdofs_count>0); - ess_tdofs.allocate(global_ess_tdofs_count); -#else - assert(ess_tdofs_count>0); - ess_tdofs.allocate(ess_tdofs_count); -#endif - } - else { assert(ess_tdofs_count<=ess_tdofs.Size()); } - assert(ess_tdofs.ptr()); - if (ess_tdofs_count == 0) { return; } - assert(ess_tdofs_count>0); - assert(dofs.GetData()); - rHtoD(ess_tdofs.ptr(),dofs.GetData(),ess_tdofs_count*sizeof(int)); -} - -// ***************************************************************************** -void RajaMassOperator::EliminateRHS(RajaVector &b) -{ - if (ess_tdofs_count > 0) - { - b.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - -// ************************************************************************* -void RajaMassOperator::Mult(const RajaVector &x, RajaVector &y) const -{ - distX = x; - if (ess_tdofs_count) - { - distX.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } - massOperator->Mult(distX, y); - if (ess_tdofs_count) - { - y.SetSubVector(ess_tdofs, 0.0, ess_tdofs_count); - } -} - - -// ***************************************************************************** -// * RajaForceOperator -// ***************************************************************************** -RajaForceOperator::RajaForceOperator(RajaFiniteElementSpace &h1fes_, - RajaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule_, - const QuadratureData *quad_data_) - : RajaOperator(l2fes_.GetTrueVSize(), h1fes_.GetTrueVSize()), - dim(h1fes_.GetMesh()->Dimension()), - nzones(h1fes_.GetMesh()->GetNE()), - h1fes(h1fes_), - l2fes(l2fes_), - integ_rule(integ_rule_), - quad_data(quad_data_), - gVecL2(l2fes.GetLocalDofs() * nzones), - gVecH1(h1fes.GetVDim() * h1fes.GetLocalDofs() * nzones) { } - -// 
***************************************************************************** -RajaForceOperator::~RajaForceOperator() {} - -// ************************************************************************* -void RajaForceOperator::Setup() -{ - h1D2Q = RajaDofQuadMaps::Get(h1fes, integ_rule); - l2D2Q = RajaDofQuadMaps::Get(l2fes, integ_rule); -} - -// ************************************************************************* -void RajaForceOperator::Mult(const RajaVector &vecL2, - RajaVector &vecH1) const -{ - l2fes.GlobalToLocal(vecL2, gVecL2); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - rForceMult(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->dofToQuad, - h1D2Q->quadToDof, - h1D2Q->quadToDofD, - quad_data->stressJinvT, - gVecL2, - gVecH1); - h1fes.LocalToGlobal(gVecH1, vecH1); -} - -// ************************************************************************* -void RajaForceOperator::MultTranspose(const RajaVector &vecH1, - RajaVector &vecL2) const -{ - h1fes.GlobalToLocal(vecH1, gVecH1); - const int NUM_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - const IntegrationRule &ir1D = IntRules.Get(Geometry::SEGMENT, - integ_rule.GetOrder()); - const int NUM_QUAD_1D = ir1D.GetNPoints(); - const int L2_DOFS_1D = l2fes.GetFE(0)->GetOrder()+1; - const int H1_DOFS_1D = h1fes.GetFE(0)->GetOrder()+1; - rForceMultTranspose(dim, - NUM_DOFS_1D, - NUM_QUAD_1D, - L2_DOFS_1D, - H1_DOFS_1D, - nzones, - l2D2Q->quadToDof, - h1D2Q->dofToQuad, - h1D2Q->dofToQuadD, - quad_data->stressJinvT, - gVecH1, - gVecL2); - l2fes.LocalToGlobal(gVecL2, vecL2); -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/raja/laghos_assembly.hpp b/raja/laghos_assembly.hpp deleted file 
mode 100644 index ad917a48..00000000 --- a/raja/laghos_assembly.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_ASSEMBLY -#define MFEM_LAGHOS_ASSEMBLY - -#include "mfem.hpp" -#include "raja/raja.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -// Container for all data needed at quadrature points. -struct QuadratureData -{ - // TODO: use QuadratureFunctions? - - // Reference to physical Jacobian for the initial mesh. These are computed - // only at time zero and stored here. - RajaVector Jac0inv; - - // Quadrature data used for full/partial assembly of the force operator. At - // each quadrature point, it combines the stress, inverse Jacobian, - // determinant of the Jacobian and the integration weight. It must be - // recomputed in every time step. - RajaVector stressJinvT; - RajaDofQuadMaps *dqMaps; - RajaGeometry *geom; - - // Quadrature data used for full/partial assembly of the mass matrices. 
At - // time zero, we compute and store (rho0 * det(J0) * qp_weight) at each - // quadrature point. Note the at any other time, we can compute - // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass - // conservation. - RajaVector rho0DetJ0w; - - - // Initial length scale. This represents a notion of local mesh size. We - // assume that all initial zones have similar size. - double h0; - - // Estimate of the minimum time step over all quadrature points. This is - // recomputed at every time step to achieve adaptive time stepping. - double dt_est; - RajaVector dtEst; - - QuadratureData(int dim, int nzones, int quads_per_zone); - - void Setup(int dim, int nzones, int quads_per_zone); -}; - -// This class is used only for visualization. It assembles (rho, phi) in each -// zone, which is used by LagrangianHydroOperator::ComputeDensity to do an L2 -// projection of the density. -class DensityIntegrator : public LinearFormIntegrator -{ -private: - const QuadratureData &quad_data; - const IntegrationRule &integ_rule; -public: - DensityIntegrator(const QuadratureData &qd, - const IntegrationRule &ir) : quad_data(qd), - integ_rule(ir) {} - - void AssembleRHSElementVect(const FiniteElement &fe, - ElementTransformation &Tr, - Vector &elvect); - - void AssembleRHSElementVect(const FiniteElement &el, - FaceElementTransformations &Tr, - Vector &elvect) {assert(false);} - -}; - -// ***************************************************************************** -// * RajaMassOperator -// ***************************************************************************** -class RajaMassOperator : public RajaOperator -{ -private: - int dim; - int nzones; - RajaFiniteElementSpace &fes; - const IntegrationRule &integ_rule; - unsigned int ess_tdofs_count; - RajaArray ess_tdofs; - RajaBilinearForm bilinearForm; - RajaOperator *massOperator; - QuadratureData *quad_data; - // For distributing X - mutable RajaVector distX; - mutable RajaGridFunction x_gf, y_gf; -public: - 
RajaMassOperator(RajaFiniteElementSpace &fes_, - const IntegrationRule &integ_rule_, - QuadratureData *quad_data_); - ~RajaMassOperator(); - void Setup(); - void SetEssentialTrueDofs(Array &dofs); - // Can be used for both velocity and specific internal energy. For the case - // of velocity, we only work with one component at a time. - void Mult(const RajaVector &x, RajaVector &y) const; - void EliminateRHS(RajaVector &b); - void ComputeDiagonal2D(Vector &diag) const; - void ComputeDiagonal3D(Vector &diag) const; -}; - -// Performs partial assembly, which corresponds to (and replaces) the use of the -// LagrangianHydroOperator::Force global matrix. -class RajaForceOperator : public RajaOperator -{ -private: - const int dim; - const int nzones; - const RajaFiniteElementSpace &h1fes, &l2fes; - const IntegrationRule &integ_rule; - const QuadratureData *quad_data; - const RajaDofQuadMaps *l2D2Q, *h1D2Q; - mutable RajaVector gVecL2, gVecH1; -public: - RajaForceOperator(RajaFiniteElementSpace &h1fes_, - RajaFiniteElementSpace &l2fes_, - const IntegrationRule &integ_rule, - const QuadratureData *quad_data_); - void Setup(); - void Mult(const RajaVector &vecL2, RajaVector &vecH1) const; - void MultTranspose(const RajaVector &vecH1, RajaVector &vecL2) const; - ~RajaForceOperator(); -}; - -// Scales by the inverse diagonal of the MassPAOperator. 
-class DiagonalSolver : public Solver -{ -private: - Vector diag; - FiniteElementSpace &FESpace; -public: - DiagonalSolver(FiniteElementSpace &fes): Solver(fes.GetVSize()), - diag(), - FESpace(fes) { } - - void SetDiagonal(Vector &d) - { - const Operator *P = FESpace.GetProlongationMatrix(); - diag.SetSize(P->Width()); - P->MultTranspose(d, diag); - } - - virtual void Mult(const Vector &x, Vector &y) const - { - for (int i = 0; i < x.Size(); i++) { y(i) = x(i) / diag(i); } - } - virtual void SetOperator(const Operator &op) { } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS_ASSEMBLY diff --git a/raja/laghos_solver.cpp b/raja/laghos_solver.cpp deleted file mode 100644 index be01a138..00000000 --- a/raja/laghos_solver.cpp +++ /dev/null @@ -1,405 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#include "laghos_solver.hpp" - -#ifdef MFEM_USE_MPI - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x, int y, int w, int h, bool vec) -{ - ParMesh &pmesh = *gf.ParFESpace()->GetParMesh(); - MPI_Comm comm = pmesh.GetComm(); - - int num_procs, myid; - MPI_Comm_size(comm, &num_procs); - MPI_Comm_rank(comm, &myid); - - bool newly_opened = false; - int connection_failed; - - do - { - if (myid == 0) - { - if (!sock.is_open() || !sock) - { - sock.open(vishost, visport); - sock.precision(8); - newly_opened = true; - } - sock << "solution\n"; - } - - pmesh.PrintAsOne(sock); - gf.SaveAsOne(sock); - - if (myid == 0 && newly_opened) - { - sock << "window_title '" << title << "'\n" - << "window_geometry " - << x << " " << y << " " << w << " " << h << "\n" - << "keys maaAcl"; - if ( vec ) { sock << "vvv"; } - sock << endl; - } - - if (myid == 0) - { - connection_failed = !sock && !newly_opened; - } - MPI_Bcast(&connection_failed, 1, MPI_INT, 0, comm); - } - while (connection_failed); -} - -// *************************************************************************** -// * LagrangianHydroOperator -// *************************************************************************** -LagrangianHydroOperator::LagrangianHydroOperator(int size, - RajaFiniteElementSpace &h1_fes, - RajaFiniteElementSpace &l2_fes, - Array &essential_tdofs, - RajaGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, - bool visc, bool pa, - double cgt, int cgiter) - : RajaTimeDependentOperator(size), - H1FESpace(h1_fes), L2FESpace(l2_fes), - H1compFESpace(h1_fes.GetParMesh(), h1_fes.FEColl(),1), - ess_tdofs(essential_tdofs), - dim(h1_fes.GetMesh()->Dimension()), - nzones(h1_fes.GetMesh()->GetNE()), - l2dofs_cnt(l2_fes.GetFE(0)->GetDof()), - h1dofs_cnt(h1_fes.GetFE(0)->GetDof()), - source_type(source_type_), cfl(cfl_), - 
use_viscosity(visc), p_assembly(pa), cg_rel_tol(cgt), cg_max_iter(cgiter), - material_pcf(material_), - integ_rule(IntRules.Get(h1_fes.GetMesh()->GetElementBaseGeometry(0), - 3*h1_fes.GetOrder(0) + l2_fes.GetOrder(0) - 1)), - quad_data(dim, nzones, integ_rule.GetNPoints()), - quad_data_is_current(false), - VMassPA(H1compFESpace, integ_rule, &quad_data), - EMassPA(L2FESpace, integ_rule, &quad_data), - VMassPA_prec(H1FESpace), - ForcePA(H1FESpace, L2FESpace, integ_rule, &quad_data), - //locCG(), - CG_VMass(H1FESpace.GetParMesh()->GetComm()), - CG_EMass(L2FESpace.GetParMesh()->GetComm()), - timer(), - v(),e(), - rhs(H1FESpace.GetVSize()), - B(H1compFESpace.GetTrueVSize()),X(H1compFESpace.GetTrueVSize()), - one(L2FESpace.GetVSize(),1.0), - e_rhs(L2FESpace.GetVSize()), - rhs_c(H1compFESpace.GetVSize()), - v_local(H1FESpace.GetVDim() * H1FESpace.GetLocalDofs()*nzones), - e_quad() -{ - // Initial local mesh size (assumes similar cells). - double loc_area = 0.0, glob_area; - int loc_z_cnt = nzones, glob_z_cnt; - ParMesh *pm = H1FESpace.GetParMesh(); - for (int i = 0; i < nzones; i++) { loc_area += pm->GetElementVolume(i); } - MPI_Allreduce(&loc_area, &glob_area, 1, MPI_DOUBLE, MPI_SUM, pm->GetComm()); - MPI_Allreduce(&loc_z_cnt, &glob_z_cnt, 1, MPI_INT, MPI_SUM, pm->GetComm()); - switch (pm->GetElementBaseGeometry(0)) - { - case Geometry::SEGMENT: - quad_data.h0 = glob_area / glob_z_cnt; break; - case Geometry::SQUARE: - quad_data.h0 = sqrt(glob_area / glob_z_cnt); break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * glob_area / glob_z_cnt); break; - case Geometry::CUBE: - quad_data.h0 = pow(glob_area / glob_z_cnt, 1.0/3.0); break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * glob_area / glob_z_cnt, 1.0/3.0); break; - default: MFEM_ABORT("Unknown zone type!"); - } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); - - quad_data.dqMaps = RajaDofQuadMaps::Get(H1FESpace,integ_rule); - quad_data.geom = RajaGeometry::Get(H1FESpace,integ_rule); - 
quad_data.Jac0inv = quad_data.geom->invJ; - - RajaVector rhoValues; // used in rInitQuadratureData - rho0.ToQuad(integ_rule, rhoValues); - - if (dim==1) { assert(false); } - const int NUM_QUAD = integ_rule.GetNPoints(); - - rInitQuadratureData(NUM_QUAD, - nzones, - rhoValues, - quad_data.geom->detJ, - quad_data.dqMaps->quadWeights, - quad_data.rho0DetJ0w); - - // Needs quad_data.rho0DetJ0w - ForcePA.Setup(); - VMassPA.Setup(); - EMassPA.Setup(); - - { - // Setup the preconditioner of the velocity mass operator. - //Vector d; - //#warning ComputeDiagonal - //(dim == 2) ? VMassPA.ComputeDiagonal2D(d) : VMassPA.ComputeDiagonal3D(d); - //VMassPA_prec.SetDiagonal(d); - } - - CG_VMass.SetOperator(VMassPA); - CG_VMass.SetRelTol(cg_rel_tol); - CG_VMass.SetAbsTol(0.0); - CG_VMass.SetMaxIter(cg_max_iter); - CG_VMass.SetPrintLevel(-1); - - CG_EMass.SetOperator(EMassPA); - CG_EMass.iterative_mode = false; - CG_EMass.SetRelTol(1e-8); - CG_EMass.SetAbsTol(1e-8 * numeric_limits::epsilon()); - CG_EMass.SetMaxIter(200); - CG_EMass.SetPrintLevel(-1); -} - -// ***************************************************************************** -LagrangianHydroOperator::~LagrangianHydroOperator() {} - -// ***************************************************************************** -void LagrangianHydroOperator::Mult(const RajaVector &S, RajaVector &dS_dt) const -{ - dS_dt = 0.0; - // Make sure that the mesh positions correspond to the ones in S. This is - // needed only because some mfem time integrators don't update the solution - // vector at every intermediate stage (hence they don't change the mesh). 
- Vector h_x = RajaVector(S.GetRange(0, H1FESpace.GetVSize())); - ParGridFunction x(&H1FESpace, h_x.GetData()); - H1FESpace.GetParMesh()->NewNodes(x, false); - UpdateQuadratureData(S); - // The monolithic BlockVector stores the unknown fields as follows: - // - Position - // - Velocity - // - Specific Internal Energy - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - v = S.GetRange(VsizeH1, VsizeH1); - e = S.GetRange(2*VsizeH1, VsizeL2); - RajaVector dx = dS_dt.GetRange(0, VsizeH1); - RajaVector dv = dS_dt.GetRange(VsizeH1, VsizeH1); - RajaVector de = dS_dt.GetRange(2*VsizeH1, VsizeL2); - // Set dx_dt = v (explicit) - dx = v; - // Solve for velocity. - timer.sw_force.Start(); - ForcePA.Mult(one, rhs); - timer.sw_force.Stop(); - rhs.Neg(); - // Partial assembly solve for each velocity component. - const int size = H1compFESpace.GetVSize(); - for (int c = 0; c < dim; c++) - { - rhs_c = rhs.GetRange(c*size, size); - RajaVector dv_c = dv.GetRange(c*size, size); - Array c_tdofs; - Array ess_bdr(H1FESpace.GetMesh()->bdr_attributes.Max()); - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., - // we must enforce v_x/y/z = 0 for the velocity components. - ess_bdr = 0; ess_bdr[c] = 1; - // Essential true dofs as if there's only one component. - H1compFESpace.GetEssentialTrueDofs(ess_bdr, c_tdofs); - dv_c = 0.0; - H1compFESpace.GetProlongationOperator()->MultTranspose(rhs_c, B); - H1compFESpace.GetRestrictionOperator()->Mult(dv_c, X); - VMassPA.SetEssentialTrueDofs(c_tdofs); - VMassPA.EliminateRHS(B); - timer.sw_cgH1.Start(); - CG_VMass.Mult(B, X); - timer.sw_cgH1.Stop(); - timer.H1cg_iter += CG_VMass.GetNumIterations(); - //printf("\n[H1cg_iter] %d",timer.H1cg_iter); - H1compFESpace.GetProlongationOperator()->Mult(X, dv_c); - } - // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; - if (source_type == 1) // 2D Taylor-Green. 
- { - e_source = new LinearForm(&L2FESpace); - assert(L2FESpace.FEColl()); - TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); - e_source->AddDomainIntegrator(d); - e_source->Assemble(); - } - Array l2dofs; - { - timer.sw_force.Start(); - ForcePA.MultTranspose(v, e_rhs); - timer.sw_force.Stop(); - } - if (e_source) { e_rhs += *e_source; } - { - timer.sw_cgL2.Start(); - CG_EMass.Mult(e_rhs, de); - timer.sw_cgL2.Stop(); - timer.L2cg_iter += CG_EMass.GetNumIterations(); - } - delete e_source; - quad_data_is_current = false; -} - -double LagrangianHydroOperator::GetTimeStepEstimate(const RajaVector &S) const -{ - UpdateQuadratureData(S); - double glob_dt_est; - MPI_Allreduce(&quad_data.dt_est, &glob_dt_est, 1, MPI_DOUBLE, MPI_MIN, - H1FESpace.GetParMesh()->GetComm()); - return glob_dt_est; -} - -void LagrangianHydroOperator::ResetTimeStepEstimate() const -{ - quad_data.dt_est = numeric_limits::infinity(); -} - -void LagrangianHydroOperator::ComputeDensity(ParGridFunction &rho) -{ - rho.SetSpace(&L2FESpace); - DenseMatrix Mrho(l2dofs_cnt); - Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); - Array dofs(l2dofs_cnt); - DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data,integ_rule); - for (int i = 0; i < nzones; i++) - { - di.AssembleRHSElementVect(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), Mrho); - inv.Factor(); - inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(i, dofs); - rho.SetSubVector(dofs, rho_z); - } -} - -void LagrangianHydroOperator::PrintTimingData(bool IamRoot, int steps) -{ - double my_rt[5], rt_max[5]; - my_rt[0] = timer.sw_cgH1.RealTime(); - my_rt[1] = timer.sw_cgL2.RealTime(); - my_rt[2] = timer.sw_force.RealTime(); - my_rt[3] = timer.sw_qdata.RealTime(); - my_rt[4] = my_rt[0] + my_rt[2] + my_rt[3]; - MPI_Reduce(my_rt, rt_max, 5, MPI_DOUBLE, MPI_MAX, 0, 
H1FESpace.GetComm()); - HYPRE_Int mydata[2], alldata[2]; - mydata[0] = timer.L2cg_iter; - mydata[1] = timer.quad_tstep; - MPI_Reduce(mydata, alldata, 2, HYPRE_MPI_INT, MPI_SUM, 0, H1FESpace.GetComm()); - if (IamRoot) - { - const HYPRE_Int H1gsize = H1FESpace.GlobalTrueVSize(), - L2gsize = L2FESpace.GlobalTrueVSize(); - using namespace std; - cout << endl; - cout << "CG (H1) total time: " << rt_max[0] << endl; - cout << "CG (H1) rate (megadofs="<GetOrder()+1; - ElementTransformation *T = H1FESpace.GetElementTransformation(0); - const IntegrationPoint &ip = integ_rule.IntPoint(0); - const double gamma = material_pcf->Eval(*T, ip); - rUpdateQuadratureData(gamma, - quad_data.h0, - cfl, - use_viscosity, - dim, - NUM_QUAD, - NUM_QUAD_1D, - NUM_DOFS_1D, - nzones, - quad_data.dqMaps->dofToQuad, - quad_data.dqMaps->dofToQuadD, - quad_data.dqMaps->quadWeights, - v_local, - e_quad, - quad_data.rho0DetJ0w, - quad_data.Jac0inv, - quad_data.geom->J, - quad_data.geom->invJ, - quad_data.geom->detJ, - quad_data.stressJinvT, - quad_data.dtEst); - quad_data.dt_est = quad_data.dtEst.Min(); - quad_data_is_current = true; - timer.sw_qdata.Stop(); - timer.quad_tstep += nzones; -} - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI diff --git a/raja/laghos_solver.hpp b/raja/laghos_solver.hpp deleted file mode 100644 index fbef5f8b..00000000 --- a/raja/laghos_solver.hpp +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#ifndef MFEM_LAGHOS_SOLVER -#define MFEM_LAGHOS_SOLVER - -#include "mfem.hpp" -#include "raja/raja.hpp" - -#include "laghos_assembly.hpp" - -#ifdef MFEM_USE_MPI - -#include -#include -#include - -namespace mfem -{ - -namespace hydrodynamics -{ - -/// Visualize the given parallel grid function, using a GLVis server on the -/// specified host and port. Set the visualization window title, and optionally, -/// its geometry. -void VisualizeField(socketstream &sock, const char *vishost, int visport, - ParGridFunction &gf, const char *title, - int x = 0, int y = 0, int w = 400, int h = 400, - bool vec = false); - - -// These are defined in laghos.cpp -double rho0(const Vector &); -void v0(const Vector &, Vector &); -double e0(const Vector &); -double gamma(const Vector &); - -struct TimingData -{ - // Total times for all major computations: - // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. - StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; - - // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solves. - // #dofs * #(CG iterations) for the L2 CG solve. - // #quads * #(RK sub steps) for the quadrature data computations. - int H1cg_iter, L2cg_iter, quad_tstep; - - TimingData() - : H1cg_iter(0), L2cg_iter(0), quad_tstep(0) {} -}; - -// Given a solutions state (x, v, e), this class performs all necessary -// computations to evaluate the new slopes (dx_dt, dv_dt, de_dt). 
-class LagrangianHydroOperator : public RajaTimeDependentOperator -{ -protected: - RajaFiniteElementSpace &H1FESpace; - RajaFiniteElementSpace &L2FESpace; - mutable RajaFiniteElementSpace H1compFESpace; - - Array &ess_tdofs; - - const int dim, nzones, l2dofs_cnt, h1dofs_cnt, source_type; - const double cfl; - const bool use_viscosity, p_assembly; - const double cg_rel_tol; - const int cg_max_iter; - Coefficient *material_pcf; - - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current; - - // Force matrix that combines the kinematic and thermodynamic spaces. It is - // assembled in each time step and then it's used to compute the final - // right-hand sides for momentum and specific internal energy. - mutable RajaMassOperator VMassPA, EMassPA; - mutable DiagonalSolver VMassPA_prec; - mutable RajaForceOperator ForcePA; - - // Linear solver for energy. 
- //RajaCGSolver locCG; - RajaCGSolver CG_VMass,CG_EMass; - - mutable TimingData timer; - - // Device vectors we want to keep - mutable RajaVector v,e,rhs,B,X; - const RajaVector one; - mutable RajaVector e_rhs; - mutable RajaVector rhs_c; - mutable RajaVector v_local,e_quad; - - virtual void ComputeMaterialProperties(int nvalues, const double gamma[], - const double rho[], const double e[], - double p[], double cs[]) const - { - for (int v = 0; v < nvalues; v++) - { - p[v] = (gamma[v] - 1.0) * rho[v] * e[v]; - cs[v] = sqrt(gamma[v] * (gamma[v]-1.0) * e[v]); - } - } - - void UpdateQuadratureData(const RajaVector &S) const; - -public: - LagrangianHydroOperator(int size, RajaFiniteElementSpace &h1_fes, - RajaFiniteElementSpace &l2_fes, - Array &essential_tdofs, RajaGridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, bool visc, bool pa, - double cgt, int cgiter); - - // Solve for dx_dt, dv_dt and de_dt. - virtual void Mult(const RajaVector &S, RajaVector &dS_dt) const; - - // Calls UpdateQuadratureData to compute the new quad_data.dt_est. - double GetTimeStepEstimate(const RajaVector &S) const; - void ResetTimeStepEstimate() const; - void ResetQuadratureData() const { quad_data_is_current = false; } - - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. 
- void ComputeDensity(ParGridFunction &rho); - - void PrintTimingData(bool IamRoot, int steps); - - ~LagrangianHydroOperator(); -}; - -class TaylorCoefficient : public Coefficient -{ - virtual double Eval(ElementTransformation &T, - const IntegrationPoint &ip) - { - Vector x(2); - T.Transform(ip, x); - return 3.0 / 8.0 * M_PI * ( cos(3.0*M_PI*x(0)) * cos(M_PI*x(1)) - - cos(M_PI*x(0)) * cos(3.0*M_PI*x(1)) ); - } -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_USE_MPI - -#endif // MFEM_LAGHOS diff --git a/raja/makefile b/raja/makefile deleted file mode 100644 index 0a71f01d..00000000 --- a/raja/makefile +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -# the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -# reserved. See files LICENSE and NOTICE for details. -# -# This file is part of CEED, a collection of benchmarks, miniapps, software -# libraries and APIs for efficient high-order finite element and spectral -# element discretizations for exascale applications. For more information and -# source code availability see http://github.com/ceed. -# -# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -# a collaborative effort of two U.S. Department of Energy organizations (Office -# of Science and the National Nuclear Security Administration) responsible for -# the planning and preparation of a capable exascale ecosystem, including -# software, applications, hardware, advanced system engineering and early -# testbed platforms, in support of the nation's exascale computing imperative. 
- -enableCuda= 0 -enableHip = 1 - -RAJA_DIR ?= /usr/local/raja -RAJA_INCFLAGS = -I$(RAJA_DIR)/include -I$(RAJA_DIR)/tpl/cub -I$(RAJA_DIR)/tpl/rocPRIM/rocprim/include -RAJA_LIBS = -fopenmp $(RAJA_DIR)/lib/libRAJA.a -RAJA_CXXFLAGS = - -NV_ARCH ?= -arch=sm_61 -CUDA_DIR ?= /usr/local/cuda - -HIP_ARCH ?= --amdgpu-target=gfx906 -ROCM_DIR ?= /opt/rocm - -MPI_HOME ?= /usr/local/mpi -MPI_INCFLAGS = -I$(MPI_HOME)/include -MPI_LIBS := $(shell mpicxx --showme:link) - - -NVCC_CXXFLAGS = --expt-extended-lambda -Xcompiler -fopenmp -x=cu -std=c++11 -m64 --restrict -Xcompiler -Wall $(NV_ARCH) -NVCC_LIBS = -Wl,-rpath -Wl,$(CUDA_DIR)/lib64 -L$(CUDA_DIR)/lib64 \ - -lcuda -lcudart -lcudadevrt -lnvToolsExt - -HIPCC_CXXFLAGS = -fopenmp -std=c++11 -Wall -I$(ROCM_DIR) $(shell $(ROCM_DIR)/bin/hipconfig --cpp_config) -HIPCC_LIBS = -L$(ROCM_DIR)/lib/ -lhip_hcc $(MPI_INCFLAGS) $(MPI_LIBS) - -define LAGHOS_HELP_MSG - -Laghos makefile targets: - - make - make status/info - make install - make clean - make distclean - make style - -Examples: - -make -j 4 - Build Laghos using the current configuration options from MFEM. - (Laghos requires the MFEM finite element library, and uses its compiler and - linker options in its build process.) -make status - Display information about the current configuration. -make install PREFIX= - Install the Laghos executable in . -make clean - Clean the Laghos executable, library and object files. -make distclean - In addition to "make clean", remove the local installation directory and some - run-time generated files. -make style - Format the Laghos C++ source files using the Artistic Style (astyle) settings - from MFEM. 
- -endef - -# Default installation location -PREFIX = ./bin -INSTALL = /usr/bin/install - -# Use the MFEM build directory -# MFEM_DIR = ../../mfem -MFEM_DIR = ./mfem -CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) - -# Use the compiler used by MFEM. Get the compiler and the options for compiling -# and linking from MFEM's config.mk. (Skip this if the target does not require -# building.) -MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) - -include $(CONFIG_MK) -endif - -ifeq (1,$(enableCuda)) -CXX = nvcc -else ifeq (1,$(enableHip)) -CXX = ${ROCM_DIR}/bin/hipcc -endif - -CPPFLAGS = $(MFEM_CPPFLAGS) -CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - - - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - -LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) \ - $(MPI_INCFLAGS) $(RAJA_INCFLAGS) $(RAJA_CXXFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) $(RAJA_LIBS) - -ifeq (1,$(enableCuda)) -LD = $(MFEM_CXX) -LAGHOS_FLAGS += $(NVCC_CXXFLAGS) $(NVCC_INCFLAGS) -LAGHOS_LIBS += $(NVCC_LIBS) -else ifeq (1,$(enableHip)) -LD = ${ROCM_DIR}/bin/hipcc -LAGHOS_FLAGS += $(HIPCC_CXXFLAGS) $(HIPCC_INCFLAGS) -LAGHOS_LIBS += $(HIPCC_LIBS) -endif - - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif - -LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) - -SOURCE_FILES = 
./laghos.cpp \ -./laghos_assembly.cpp \ -./raja/linalg/rsolvers.cpp \ -./raja/linalg/rvector.cpp \ -./raja/kernels/force/rForce.cpp \ -./raja/kernels/geom/rInitGeom.cpp \ -./raja/kernels/quad/rQDataInit.cpp \ -./raja/kernels/quad/rQDataUpdate.cpp \ -./raja/kernels/quad/rGridFuncToQuad.cpp \ -./raja/kernels/maps/rLocalToGlobal.cpp \ -./raja/kernels/maps/rMapping.cpp \ -./raja/kernels/maps/rGlobalToLocal.cpp \ -./raja/kernels/mass/rMassAssemble.cpp \ -./raja/kernels/mass/rMassMultAdd.cpp \ -./raja/kernels/blas/vector_map_dofs.cpp \ -./raja/kernels/blas/vector_vec_sub.cpp \ -./raja/kernels/blas/vector_dot.cpp \ -./raja/kernels/blas/vector_clear_dofs.cpp \ -./raja/kernels/blas/vector_xsy.cpp \ -./raja/kernels/blas/vector_xpay.cpp \ -./raja/kernels/blas/vector_axpy.cpp \ -./raja/kernels/blas/vector_op_eq.cpp \ -./raja/kernels/blas/vector_get_subvector.cpp \ -./raja/kernels/blas/vector_vec_add.cpp \ -./raja/kernels/blas/vector_min.cpp \ -./raja/kernels/blas/vector_set_subvector.cpp \ -./raja/kernels/blas/vector_set_subvector_const.cpp \ -./raja/kernels/blas/vector_vec_mul.cpp \ -./raja/kernels/blas/vector_neg.cpp \ -./raja/fem/rconform.cpp \ -./raja/fem/rrestrict.cpp \ -./raja/fem/rbilinearform.cpp \ -./raja/fem/rbilininteg.cpp \ -./raja/fem/rprolong.cpp \ -./raja/fem/rgridfunc.cpp \ -./raja/fem/rfespace.cpp \ -./raja/general/rmemcpy.cpp \ -./raja/general/rtable.cpp \ -./raja/general/rcommd.cpp \ -./raja/config/rconfig.cpp \ -./laghos_solver.cpp - -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = ./raja/raja.hpp \ -./raja/linalg/rode.hpp \ -./raja/linalg/rsolvers.hpp \ -./raja/linalg/rvector.hpp \ -./raja/linalg/roperator.hpp \ -./raja/kernels/raja.hpp \ -./raja/kernels/include/forall.hpp \ -./raja/kernels/include/kernels.hpp \ -./raja/kernels/include/offsets.hpp \ -./raja/fem/rbilinearform.hpp \ -./raja/fem/rrestrict.hpp \ -./raja/fem/rfespace.hpp \ -./raja/fem/rbilininteg.hpp \ -./raja/fem/rgridfunc.hpp \ 
-./raja/fem/rprolong.hpp \ -./raja/fem/rconform.hpp \ -./raja/general/rarray.hpp \ -./raja/general/rtable.hpp \ -./raja/general/rmalloc.hpp \ -./raja/general/rmemcpy.hpp \ -./raja/general/rcommd.hpp \ -./raja/config/rconfig.hpp \ -./laghos_solver.hpp \ -./laghos_assembly.hpp - -# Targets - -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec - -.SUFFIXES: .c .cpp .o -.cpp.o: - cd $( -#include - -namespace mfem -{ - - -// *************************************************************************** -bool isNvidiaCudaMpsDaemonRunning(void) -{ - const char *command="pidof -s nvidia-cuda-mps-control>/dev/null"; - return system(command)==0; -} - -// *************************************************************************** -void computeCapabilityOfTheDevice(const int mpi_rank, -#if defined(RAJA_ENABLE_CUDA) - const CUdevice cuDevice, -#elif defined(RAJA_ENABLE_HIP) - const hipDevice_t hipDevice, -#endif - const int device) -{ - char name[128]; - int major, minor; -#if defined(RAJA_ENABLE_CUDA) - cuDeviceGetName(name, 128, cuDevice); - cuDeviceComputeCapability(&major, &minor, device); -#elif defined(RAJA_ENABLE_HIP) - hipDeviceGetName(name, 128, hipDevice); - hipDeviceComputeCapability(&major, &minor, device); -#endif - printf("\033[32m[laghos] Rank_%d => Device_%d (%s:sm_%d.%d)\033[m\n", - mpi_rank, device, name, major, minor); -} - -// *************************************************************************** -// * Setup -// *************************************************************************** -void rconfig::Setup(const int _mpi_rank, - const int _mpi_size, - const bool _cuda, - const bool _hip, - const bool _uvm, - const bool _aware, - const bool _hcpo, - const bool _sync) -{ - mpi_rank=_mpi_rank; - mpi_size=_mpi_size; - - // Get the number of devices with compute capability greater or equal to 2.0 - // Can be changed wuth CUDA_VISIBLE_DEVICES -#if defined(RAJA_ENABLE_CUDA) - cudaGetDeviceCount(&gpu_count); -#elif 
defined(RAJA_ENABLE_HIP) - hipGetDeviceCount(&gpu_count); -#endif - cuda=_cuda; - hip=_hip; - uvm=_uvm; - aware=_aware; - hcpo=_hcpo; - sync=_sync; - - // Check for Enforced Kernel Synchronization - if (Sync() && Root()) - { - printf("\033[32m[laghos] \033[31;1mEnforced Kernel Synchronization!\033[m\n"); - } - - // Check if MPI is CUDA/HIP aware - if (Root()) - printf("\033[32m[laghos] MPI %s GPU aware\033[m\n", - aware?"\033[1mIS":"is \033[31;1mNOT\033[32m"); - - if (Root()) - { - printf("\033[32m[laghos] GPU device count: %i\033[m\n", gpu_count); - } - - // Initializes the driver API - // Must be called before any other function from the driver API - // Currently, the Flags parameter must be 0. - const unsigned int Flags = 0; // parameter must be 0 - - // Returns properties for the selected device - const int device = Mps()?0:(mpi_rank%gpu_count); - // Check if we have enough devices for all ranks - assert(deviceGetVSize(),fes->GetVSize()), - mesh(fes->GetMesh()), - trialFes(fes), - testFes(fes), - localX(mesh->GetNE() * trialFes->GetLocalDofs() * trialFes->GetVDim()), - localY(mesh->GetNE() * testFes->GetLocalDofs() * testFes->GetVDim()) {} - -// *************************************************************************** -RajaBilinearForm::~RajaBilinearForm() { } - -// *************************************************************************** -// Adds new Domain Integrator. -void RajaBilinearForm::AddDomainIntegrator(RajaIntegrator* i) -{ - AddIntegrator(i, DomainIntegrator); -} - -// Adds new Boundary Integrator. -void RajaBilinearForm::AddBoundaryIntegrator(RajaIntegrator* i) -{ - AddIntegrator(i, BoundaryIntegrator); -} - -// Adds new interior Face Integrator. -void RajaBilinearForm::AddInteriorFaceIntegrator(RajaIntegrator* i) -{ - AddIntegrator(i, InteriorFaceIntegrator); -} - -// Adds new boundary Face Integrator. 
-void RajaBilinearForm::AddBoundaryFaceIntegrator(RajaIntegrator* i) -{ - AddIntegrator(i, BoundaryFaceIntegrator); -} - -// Adds Integrator based on RajaIntegratorType -void RajaBilinearForm::AddIntegrator(RajaIntegrator* i, - const RajaIntegratorType itype) -{ - assert(i); - i->SetupIntegrator(*this, itype); - integrators.push_back(i); -} - -// *************************************************************************** -void RajaBilinearForm::Assemble() -{ - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->Assemble(); - } -} - -// *************************************************************************** -void RajaBilinearForm::FormLinearSystem(const Array& constraintList, - RajaVector& x, RajaVector& b, - RajaOperator*& Aout, - RajaVector& X, RajaVector& B, - int copy_interior) -{ - FormOperator(constraintList, Aout); - InitRHS(constraintList, x, b, Aout, X, B, copy_interior); -} - -// *************************************************************************** -void RajaBilinearForm::FormOperator(const Array& constraintList, - RajaOperator*& Aout) -{ - const RajaOperator* trialP = trialFes->GetProlongationOperator(); - const RajaOperator* testP = testFes->GetProlongationOperator(); - RajaOperator *rap = this; - if (trialP) { rap = new RajaRAPOperator(*testP, *this, *trialP); } - Aout = new RajaConstrainedOperator(rap, constraintList, rap!=this); -} - -// *************************************************************************** -void RajaBilinearForm::InitRHS(const Array& constraintList, - const RajaVector& x, const RajaVector& b, - RajaOperator* A, - RajaVector& X, RajaVector& B, - int copy_interior) -{ - const RajaOperator* P = trialFes->GetProlongationOperator(); - const RajaOperator* R = trialFes->GetRestrictionOperator(); - if (P) - { - // Variational restriction with P - B.SetSize(P->Width()); - P->MultTranspose(b, B); - X.SetSize(R->Height()); - R->Mult(x, X); - } - else - { - // 
rap, X and B point to the same data as this, x and b - X.SetSize(x.Size(),x); - B.SetSize(b.Size(),b); - } - RajaConstrainedOperator* cA = static_cast(A); - if (cA) - { - cA->EliminateRHS(X, B); - } - else - { - mfem_error("RajaBilinearForm::InitRHS expects an RajaConstrainedOperator"); - } -} - -// *************************************************************************** -void RajaBilinearForm::Mult(const RajaVector& x, RajaVector& y) const -{ - trialFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultAdd(localX, localY); - } - testFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void RajaBilinearForm::MultTranspose(const RajaVector& x, RajaVector& y) const -{ - testFes->GlobalToLocal(x, localX); - localY = 0; - const int integratorCount = (int) integrators.size(); - for (int i = 0; i < integratorCount; ++i) - { - integrators[i]->MultTransposeAdd(localX, localY); - } - trialFes->LocalToGlobal(localY, y); -} - -// *************************************************************************** -void RajaBilinearForm::RecoverFEMSolution(const RajaVector& X, - const RajaVector& b, - RajaVector& x) -{ - const RajaOperator *P = this->GetProlongation(); - if (P) - { - // Apply conforming prolongation - x.SetSize(P->Height()); - P->Mult(X, x); - } - // Otherwise X and x point to the same data -} - - -// *************************************************************************** -// * RajaConstrainedOperator -// *************************************************************************** -RajaConstrainedOperator::RajaConstrainedOperator(RajaOperator* A_, - const Array& constraintList_, - bool own_A_) : - RajaOperator(A_->Height(), A_->Width()) -{ - Setup(A_, constraintList_, own_A_); -} - -void RajaConstrainedOperator::Setup(RajaOperator* A_, - const Array& constraintList_, - bool own_A_) -{ 
- A = A_; - own_A = own_A_; - constraintIndices = constraintList_.Size(); - if (constraintIndices) - { - constraintList.allocate(constraintIndices); - } - z.SetSize(height); - w.SetSize(height); -} - -void RajaConstrainedOperator::EliminateRHS(const RajaVector& x, - RajaVector& b) const -{ - w = 0.0; - A->Mult(w, z); - b -= z; -} - -void RajaConstrainedOperator::Mult(const RajaVector& x, RajaVector& y) const -{ - if (constraintIndices == 0) - { - A->Mult(x, y); - return; - } - z = x; - A->Mult(z, y); -} - -} // mfem diff --git a/raja/raja/fem/rbilinearform.hpp b/raja/raja/fem/rbilinearform.hpp deleted file mode 100644 index 41211b3f..00000000 --- a/raja/raja/fem/rbilinearform.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_BILINEARFORM -#define LAGHOS_RAJA_BILINEARFORM - -namespace mfem -{ - -// *************************************************************************** -// * RajaIntegratorType -// *************************************************************************** -enum RajaIntegratorType -{ - DomainIntegrator = 0, - BoundaryIntegrator = 1, - InteriorFaceIntegrator = 2, - BoundaryFaceIntegrator = 3, -}; - -class RajaIntegrator; - -// *************************************************************************** -// * RajaBilinearForm -// *************************************************************************** -class RajaBilinearForm : public RajaOperator -{ - friend class RajaIntegrator; -protected: - typedef std::vector IntegratorVector; - mutable Mesh* mesh; - mutable RajaFiniteElementSpace* trialFes; - mutable RajaFiniteElementSpace* testFes; - IntegratorVector integrators; - mutable RajaVector localX, localY; -public: - RajaBilinearForm(RajaFiniteElementSpace*); - ~RajaBilinearForm(); - Mesh& GetMesh() const { return *mesh; } - RajaFiniteElementSpace& GetTrialFESpace() const { return *trialFes;} - RajaFiniteElementSpace& GetTestFESpace() const { return *testFes;} - // ************************************************************************* - void AddDomainIntegrator(RajaIntegrator*); - void AddBoundaryIntegrator(RajaIntegrator*); - void AddInteriorFaceIntegrator(RajaIntegrator*); - void AddBoundaryFaceIntegrator(RajaIntegrator*); - void AddIntegrator(RajaIntegrator*, const RajaIntegratorType); - // ************************************************************************* - virtual void Assemble(); - void FormLinearSystem(const Array& constraintList, - RajaVector& x, RajaVector& b, - RajaOperator*& Aout, - RajaVector& X, RajaVector& B, - int copy_interior = 0); - void FormOperator(const Array& constraintList, RajaOperator*& Aout); - void InitRHS(const Array& constraintList, - const RajaVector& x, const RajaVector& b, - RajaOperator* Aout, - 
RajaVector& X, RajaVector& B, - int copy_interior = 0); - virtual void Mult(const RajaVector& x, RajaVector& y) const; - virtual void MultTranspose(const RajaVector& x, RajaVector& y) const; - void RecoverFEMSolution(const RajaVector&, const RajaVector&, RajaVector&); -}; - - -// *************************************************************************** -// * Constrained Operator -// *************************************************************************** -class RajaConstrainedOperator : public RajaOperator -{ -protected: - RajaOperator *A; - bool own_A; - RajaArray constraintList; - int constraintIndices; - mutable RajaVector z, w; -public: - RajaConstrainedOperator(RajaOperator*, const Array&, bool = false); - void Setup(RajaOperator*, const Array&, bool = false); - void EliminateRHS(const RajaVector&, RajaVector&) const; - virtual void Mult(const RajaVector&, RajaVector&) const; - virtual ~RajaConstrainedOperator() {} -}; - -} // mfem - -#endif // LAGHOS_RAJA_BILINEARFORM diff --git a/raja/raja/fem/rbilininteg.cpp b/raja/raja/fem/rbilininteg.cpp deleted file mode 100644 index e652deaf..00000000 --- a/raja/raja/fem/rbilininteg.cpp +++ /dev/null @@ -1,515 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../raja.hpp" - -namespace mfem -{ - -// ***************************************************************************** -static RajaGeometry *geom=NULL; - -// *************************************************************************** -// * ~ RajaGeometry -// *************************************************************************** -RajaGeometry::~RajaGeometry() -{ - free(geom->meshNodes); - free(geom->J); - free(geom->invJ); - free(geom->detJ); - delete[] geom; -} - -// ***************************************************************************** -// * RajaGeometry Get: use this one to fetch nodes from vector Sx -// ***************************************************************************** -RajaGeometry* RajaGeometry::Get(RajaFiniteElementSpace& fes, - const IntegrationRule& ir, - const RajaVector& Sx) -{ - const Mesh *mesh = fes.GetMesh(); - const GridFunction *nodes = mesh->GetNodes(); - const FiniteElementSpace *fespace = nodes->FESpace(); - const FiniteElement *fe = fespace->GetFE(0); - const int dims = fe->GetDim(); - const int numDofs = fe->GetDof(); - const int numQuad = ir.GetNPoints(); - const int elements = fespace->GetNE(); - const int ndofs = fespace->GetNDofs(); - const RajaDofQuadMaps* maps = RajaDofQuadMaps::GetSimplexMaps(*fe, ir); - rNodeCopyByVDim(elements,numDofs,ndofs,dims,geom->eMap,Sx,geom->meshNodes); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - - -// ***************************************************************************** -RajaGeometry* RajaGeometry::Get(RajaFiniteElementSpace& fes, - const IntegrationRule& ir) -{ - Mesh& mesh = *(fes.GetMesh()); - const bool geom_to_allocate = - (!geom) || rconfig::Get().GeomNeedsUpdate(mesh.GetSequence()); - if (geom_to_allocate) { geom=new RajaGeometry(); } - if (!mesh.GetNodes()) { mesh.SetCurvature(1, false, -1, Ordering::byVDIM); } - GridFunction& nodes = *(mesh.GetNodes()); - const 
FiniteElementSpace& fespace = *(nodes.FESpace()); - const FiniteElement& fe = *(fespace.GetFE(0)); - const int dims = fe.GetDim(); - const int elements = fespace.GetNE(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - const bool orderedByNODES = (fespace.GetOrdering() == Ordering::byNODES); - - if (orderedByNODES) { ReorderByVDim(nodes); } - const int asize = dims*numDofs*elements; - Array meshNodes(asize); - const Table& e2dTable = fespace.GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - Array eMap(numDofs*elements); - { - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < numDofs; ++d) - { - const int lid = d+numDofs*e; - const int gid = elementMap[lid]; - eMap[lid]=gid; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int xoffset = v+dims*gid; - meshNodes[moffset] = nodes[xoffset]; - } - } - } - } - if (geom_to_allocate) - { - geom->meshNodes.allocate(dims, numDofs, elements); - geom->eMap.allocate(numDofs, elements); - } - { - geom->meshNodes = meshNodes; - geom->eMap = eMap; - } - // Reorder the original gf back - if (orderedByNODES) { ReorderByNodes(nodes); } - if (geom_to_allocate) - { - geom->J.allocate(dims, dims, numQuad, elements); - geom->invJ.allocate(dims, dims, numQuad, elements); - geom->detJ.allocate(numQuad, elements); - } - - const RajaDofQuadMaps* maps = RajaDofQuadMaps::GetSimplexMaps(fe, ir); - rIniGeom(dims,numDofs,numQuad,elements, - maps->dofToQuadD, - geom->meshNodes, - geom->J, - geom->invJ, - geom->detJ); - return geom; -} - -// *************************************************************************** -void RajaGeometry::ReorderByVDim(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k=0; - for (int d = 0; d < ndofs; d++) - for (int v = 0; v < vdim; 
v++) - { - temp[k++] = data[d+v*ndofs]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// *************************************************************************** -void RajaGeometry::ReorderByNodes(GridFunction& nodes) -{ - const FiniteElementSpace *fes=nodes.FESpace(); - const int size = nodes.Size(); - const int vdim = fes->GetVDim(); - const int ndofs = fes->GetNDofs(); - double *data = nodes.GetData(); - double *temp = new double[size]; - int k = 0; - for (int j = 0; j < ndofs; j++) - for (int i = 0; i < vdim; i++) - { - temp[j+i*ndofs] = data[k++]; - } - for (int i = 0; i < size; i++) - { - data[i] = temp[i]; - } - delete [] temp; -} - -// *************************************************************************** -// * RajaDofQuadMaps -// *************************************************************************** -static std::map AllDofQuadMaps; - -// *************************************************************************** -RajaDofQuadMaps::~RajaDofQuadMaps() {} - -// ***************************************************************************** -void RajaDofQuadMaps::delRajaDofQuadMaps() -{ - for (std::map::iterator itr = AllDofQuadMaps.begin(); - itr != AllDofQuadMaps.end(); - itr++) - { - delete itr->second; - } -} - -// ***************************************************************************** -RajaDofQuadMaps* RajaDofQuadMaps::Get(const RajaFiniteElementSpace& fespace, - const IntegrationRule& ir, - const bool transpose) -{ - return Get(*fespace.GetFE(0),*fespace.GetFE(0),ir,transpose); -} - -RajaDofQuadMaps* RajaDofQuadMaps::Get(const RajaFiniteElementSpace& - trialFESpace, - const RajaFiniteElementSpace& testFESpace, - const IntegrationRule& ir, - const bool transpose) -{ - return Get(*trialFESpace.GetFE(0),*testFESpace.GetFE(0),ir,transpose); -} - -RajaDofQuadMaps* RajaDofQuadMaps::Get(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ 
- return GetTensorMaps(trialFE, testFE, ir, transpose); -} - -// *************************************************************************** -RajaDofQuadMaps* RajaDofQuadMaps::GetTensorMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& trialTFE = - dynamic_cast(trialFE); - const TensorBasisElement& testTFE = - dynamic_cast(testFE); - std::stringstream ss; - ss << "TensorMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " BT1:" << trialTFE.GetBasisType() - << " BT2:" << testTFE.GetBasisType() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - // Otherwise, build them - RajaDofQuadMaps *maps = new RajaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const RajaDofQuadMaps* trialMaps = GetD2QTensorMaps(trialFE, ir); - const RajaDofQuadMaps* testMaps = GetD2QTensorMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// *************************************************************************** -RajaDofQuadMaps* RajaDofQuadMaps::GetD2QTensorMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const TensorBasisElement& tfe = dynamic_cast(fe); - const Poly_1D::Basis& basis = tfe.GetBasis1D(); - const int order = fe.GetOrder(); - const int dofs = order + 1; - const int dims = fe.GetDim(); - const IntegrationRule& ir1D = IntRules.Get(Geometry::SEGMENT, ir.GetOrder()); - const int quadPoints = ir1D.GetNPoints(); - const int quadPoints2D = quadPoints*quadPoints; - const int quadPoints3D = quadPoints2D*quadPoints; - const int 
quadPointsND = ((dims == 1) ? quadPoints : - ((dims == 2) ? quadPoints2D : quadPoints3D)); - std::stringstream ss ; - ss << "D2QTensorMap:" - << " order:" << order - << " dofs:" << dofs - << " dims:" << dims - << " quadPoints:"<hash = hash; - - maps->dofToQuad.allocate(quadPoints, dofs,1,1,transpose); - maps->dofToQuadD.allocate(quadPoints, dofs,1,1,transpose); - double* quadWeights1DData = NULL; - if (transpose) - { - // Initialize quad weights only for transpose - maps->quadWeights.allocate(quadPointsND); - quadWeights1DData = ::new double[quadPoints]; - } - mfem::Vector d2q(dofs); - mfem::Vector d2qD(dofs); - Array dofToQuad(quadPoints*dofs); - Array dofToQuadD(quadPoints*dofs); - for (int q = 0; q < quadPoints; ++q) - { - const IntegrationPoint& ip = ir1D.IntPoint(q); - basis.Eval(ip.x, d2q, d2qD); - if (transpose) - { - quadWeights1DData[q] = ip.weight; - } - for (int d = 0; d < dofs; ++d) - { - dofToQuad[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2q[d]; - dofToQuadD[maps->dofToQuad.dim()[0]*q + maps->dofToQuad.dim()[1]*d] = d2qD[d]; - } - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - if (transpose) - { - Array quadWeights(quadPointsND); - for (int q = 0; q < quadPointsND; ++q) - { - const int qx = q % quadPoints; - const int qz = q / quadPoints2D; - const int qy = (q - qz*quadPoints2D) / quadPoints; - double w = quadWeights1DData[qx]; - if (dims > 1) - { - w *= quadWeights1DData[qy]; - } - if (dims > 2) - { - w *= quadWeights1DData[qz]; - } - quadWeights[q] = w; - } - maps->quadWeights = quadWeights; - ::delete [] quadWeights1DData; - } - assert(maps); - return maps; -} - -// *************************************************************************** -RajaDofQuadMaps* RajaDofQuadMaps::GetSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - return GetSimplexMaps(fe, fe, ir, transpose); -} - -// ***************************************************************************** 
-RajaDofQuadMaps* RajaDofQuadMaps::GetSimplexMaps(const FiniteElement& trialFE, - const FiniteElement& testFE, - const IntegrationRule& ir, - const bool transpose) -{ - std::stringstream ss; - ss << "SimplexMap:" - << " O1:" << trialFE.GetOrder() - << " O2:" << testFE.GetOrder() - << " Q:" << ir.GetNPoints(); - std::string hash = ss.str(); - // If we've already made the dof-quad maps, reuse them - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - RajaDofQuadMaps *maps = new RajaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - const RajaDofQuadMaps* trialMaps = GetD2QSimplexMaps(trialFE, ir); - const RajaDofQuadMaps* testMaps = GetD2QSimplexMaps(testFE, ir, true); - maps->dofToQuad = trialMaps->dofToQuad; - maps->dofToQuadD = trialMaps->dofToQuadD; - maps->quadToDof = testMaps->dofToQuad; - maps->quadToDofD = testMaps->dofToQuadD; - maps->quadWeights = testMaps->quadWeights; - return maps; -} - -// *************************************************************************** -RajaDofQuadMaps* RajaDofQuadMaps::GetD2QSimplexMaps(const FiniteElement& fe, - const IntegrationRule& ir, - const bool transpose) -{ - const int dims = fe.GetDim(); - const int numDofs = fe.GetDof(); - const int numQuad = ir.GetNPoints(); - std::stringstream ss ; - ss << "D2QSimplexMap:" - << " Dim:" << dims - << " numDofs:" << numDofs - << " numQuad:" << numQuad - << " transpose:" << (transpose?"T":"F"); - std::string hash = ss.str(); - if (AllDofQuadMaps.find(hash)!=AllDofQuadMaps.end()) - { - return AllDofQuadMaps[hash]; - } - RajaDofQuadMaps* maps = new RajaDofQuadMaps(); - AllDofQuadMaps[hash]=maps; - maps->hash = hash; - // Initialize the dof -> quad mapping - maps->dofToQuad.allocate(numQuad, numDofs,1,1,transpose); - maps->dofToQuadD.allocate(dims, numQuad, numDofs,1,transpose); - if (transpose) // Initialize quad weights only for transpose - { - maps->quadWeights.allocate(numQuad); - } - Vector d2q(numDofs); - DenseMatrix 
d2qD(numDofs, dims); - Array quadWeights(numQuad); - Array dofToQuad(numQuad*numDofs); - Array dofToQuadD(dims*numQuad*numDofs); - for (int q = 0; q < numQuad; ++q) - { - const IntegrationPoint& ip = ir.IntPoint(q); - if (transpose) - { - quadWeights[q] = ip.weight; - } - fe.CalcShape(ip, d2q); - fe.CalcDShape(ip, d2qD); - for (int d = 0; d < numDofs; ++d) - { - const double w = d2q[d]; - dofToQuad[maps->dofToQuad.dim()[0]*q + - maps->dofToQuad.dim()[1]*d] = w; - for (int dim = 0; dim < dims; ++dim) - { - const double wD = d2qD(d, dim); - dofToQuadD[maps->dofToQuadD.dim()[0]*dim + - maps->dofToQuadD.dim()[1]*q + - maps->dofToQuadD.dim()[2]*d] = wD; - } - } - } - if (transpose) - { - maps->quadWeights = quadWeights; - } - maps->dofToQuad = dofToQuad; - maps->dofToQuadD = dofToQuadD; - return maps; -} - - -// *************************************************************************** -// * Base Integrator -// *************************************************************************** -void RajaIntegrator::SetIntegrationRule(const IntegrationRule& ir_) -{ - ir = &ir_; -} - -const IntegrationRule& RajaIntegrator::GetIntegrationRule() const -{ - assert(ir); - return *ir; -} - -void RajaIntegrator::SetupIntegrator(RajaBilinearForm& bform_, - const RajaIntegratorType itype_) -{ - mesh = &(bform_.GetMesh()); - trialFESpace = &(bform_.GetTrialFESpace()); - testFESpace = &(bform_.GetTestFESpace()); - itype = itype_; - if (ir == NULL) { assert(false); } - maps = RajaDofQuadMaps::Get(*trialFESpace,*testFESpace,*ir); - mapsTranspose = RajaDofQuadMaps::Get(*testFESpace,*trialFESpace,*ir); - Setup(); -} - -RajaGeometry* RajaIntegrator::GetGeometry() -{ - return RajaGeometry::Get(*trialFESpace, *ir); -} - - -// *************************************************************************** -// * Mass Integrator -// *************************************************************************** -void RajaMassIntegrator::SetupIntegrationRule() -{ - assert(false); -} - -// 
*************************************************************************** -void RajaMassIntegrator::Assemble() -{ - if (op.Size()) { return; } - assert(false); -} - -// *************************************************************************** -void RajaMassIntegrator::SetOperator(RajaVector& v) { op = v; } - -// *************************************************************************** -void RajaMassIntegrator::MultAdd(RajaVector& x, RajaVector& y) -{ - const int dim = mesh->Dimension(); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir->GetOrder()).GetNPoints(); - const int dofs1D = trialFESpace->GetFE(0)->GetOrder() + 1; - rMassMultAdd(dim, - dofs1D, - quad1D, - mesh->GetNE(), - maps->dofToQuad, - maps->dofToQuadD, - maps->quadToDof, - maps->quadToDofD, - op,x,y); -} -} - diff --git a/raja/raja/fem/rbilininteg.hpp b/raja/raja/fem/rbilininteg.hpp deleted file mode 100644 index 8aac8f3f..00000000 --- a/raja/raja/fem/rbilininteg.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_BILININTEG -#define LAGHOS_RAJA_BILININTEG - -namespace mfem -{ - -// *************************************************************************** -// * RajaGeometry -// *************************************************************************** -class RajaGeometry -{ -public: - ~RajaGeometry(); - RajaArray eMap; - RajaArray meshNodes; - RajaArray J, invJ, detJ; - static RajaGeometry* Get(RajaFiniteElementSpace&, - const IntegrationRule&); - static RajaGeometry* Get(RajaFiniteElementSpace&, - const IntegrationRule&, - const RajaVector&); - static void ReorderByVDim(GridFunction& nodes); - static void ReorderByNodes(GridFunction& nodes); -}; - -// *************************************************************************** -// * RajaDofQuadMaps -// *************************************************************************** -class RajaDofQuadMaps -{ -private: - std::string hash; -public: - RajaArray dofToQuad, dofToQuadD; // B - RajaArray quadToDof, quadToDofD; // B^T - RajaArray quadWeights; -public: - ~RajaDofQuadMaps(); - static void delRajaDofQuadMaps(); - static RajaDofQuadMaps* Get(const RajaFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* Get(const RajaFiniteElementSpace&, - const RajaFiniteElementSpace&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* Get(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* GetTensorMaps(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* GetD2QTensorMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* GetSimplexMaps(const FiniteElement&, - const FiniteElement&, - const IntegrationRule&, - const bool = false); - static RajaDofQuadMaps* 
GetD2QSimplexMaps(const FiniteElement&, - const IntegrationRule&, - const bool = false); -}; - -// *************************************************************************** -// * Base Integrator -// *************************************************************************** -class RajaIntegrator -{ -protected: - Mesh* mesh = NULL; - RajaFiniteElementSpace* trialFESpace = NULL; - RajaFiniteElementSpace* testFESpace = NULL; - RajaIntegratorType itype; - const IntegrationRule* ir = NULL; - RajaDofQuadMaps* maps; - RajaDofQuadMaps* mapsTranspose; -private: -public: - virtual std::string GetName() = 0; - void SetIntegrationRule(const IntegrationRule& ir_); - const IntegrationRule& GetIntegrationRule() const; - virtual void SetupIntegrationRule() = 0; - virtual void SetupIntegrator(RajaBilinearForm& bform_, - const RajaIntegratorType itype_); - virtual void Setup() = 0; - virtual void Assemble() = 0; - virtual void MultAdd(RajaVector& x, RajaVector& y) = 0; - virtual void MultTransposeAdd(RajaVector&, RajaVector&) {assert(false);} - RajaGeometry* GetGeometry(); -}; - -// *************************************************************************** -// * Mass Integrator -// *************************************************************************** -class RajaMassIntegrator : public RajaIntegrator -{ -private: - RajaVector op; -public: - RajaMassIntegrator() {} - virtual ~RajaMassIntegrator() {} - virtual std::string GetName() {return "MassIntegrator";} - virtual void SetupIntegrationRule(); - virtual void Setup() {} - virtual void Assemble(); - void SetOperator(RajaVector& v); - virtual void MultAdd(RajaVector& x, RajaVector& y); -}; - -} // mfem - -#endif // LAGHOS_RAJA_BILININTEG diff --git a/raja/raja/fem/rconform.cpp b/raja/raja/fem/rconform.cpp deleted file mode 100644 index 9c53e2e5..00000000 --- a/raja/raja/fem/rconform.cpp +++ /dev/null @@ -1,297 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * RajaConformingProlongationOperator -// *************************************************************************** -RajaConformingProlongationOperator::RajaConformingProlongationOperator -(ParFiniteElementSpace &pfes): RajaOperator(pfes.GetVSize(), - pfes.GetTrueVSize()), - external_ldofs(), - d_external_ldofs(Height()-Width()), // size can be 0 here - gc(new RajaCommD(pfes)), - kMaxTh(0) -{ - Array ldofs; - Table &group_ldof = gc->GroupLDofTable(); - external_ldofs.Reserve(Height()-Width()); - for (int gr = 1; gr < group_ldof.Size(); gr++) - { - if (!gc->GetGroupTopology().IAmMaster(gr)) - { - ldofs.MakeRef(group_ldof.GetRow(gr), group_ldof.RowSize(gr)); - external_ldofs.Append(ldofs); - } - } - external_ldofs.Sort(); - const int HmW=Height()-Width(); - if (HmW>0) - { - d_external_ldofs=external_ldofs; - } - assert(external_ldofs.Size() == Height()-Width()); - // ************************************************************************* - const int m = external_ldofs.Size(); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - const int size = end-j; - if (size>kMaxTh) { kMaxTh=size; } - j = end+1; - } -} - -// *************************************************************************** -// * ~RajaConformingProlongationOperator -// *************************************************************************** 
-RajaConformingProlongationOperator::~RajaConformingProlongationOperator() -{ - delete gc; -} - -// *************************************************************************** -// * CUDA/HIP Error Status Check -// *************************************************************************** -void LastCheck() -{ -#if defined(RAJA_ENABLE_CUDA) - cudaError_t cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) - exit(fprintf(stderr, "\n\t\033[31;1m[cudaLastCheck] failed: %s\033[m\n", - cudaGetErrorString(cudaStatus))); -#elif defined(RAJA_ENABLE_HIP) - hipError_t hipStatus = hipGetLastError(); - if (hipStatus != hipSuccess) - exit(fprintf(stderr, "\n\t\033[31;1m[hipLastCheck] failed: %s\033[m\n", - hipGetErrorString(hipStatus))); -#endif -} - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_Mult(double *y,const double *x,const int *external_ldofs,const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j+k]=x[j-i+k]; - } -} -static __global__ -void k_Mult2(double *y,const double *x,const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j+k]=x[j-i+k]; -} - -// *************************************************************************** -// * Device Mult -// *************************************************************************** -void RajaConformingProlongationOperator::d_Mult(const RajaVector &x, - RajaVector &y) const -{ - const double *d_xdata = x.GetData(); - const int in_layout = 2; // 2 - input is ltdofs array - gc->d_BcastBegin(const_cast(d_xdata), 
in_layout); - double *d_ydata = y.GetData(); - int j = 0; - const int m = external_ldofs.Size(); - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; -#if defined(RAJA_ENABLE_CUDA) - k_Mult<<<(m+kTpB-1)/kTpB,kTpB>>>(d_ydata,d_xdata,d_external_ldofs,m); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_Mult), dim3((m+kTpB-1)/kTpB), dim3(kTpB), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m); -#endif - LastCheck(); - } - else - { - assert((m/maxXThDim)==0); - assert(kMaxTh>>(d_ydata,d_xdata,d_external_ldofs,m,base); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_Mult2),dim3(kMaxTh),dim3(maxXThDim), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m,base); -#endif - LastCheck(); - } -#if defined(RAJA_ENABLE_CUDA) - k_Mult2<<>>(d_ydata,d_xdata,d_external_ldofs,m,0); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_Mult2),dim3(kMaxTh),dim3(m%maxXThDim), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m,0); -#endif - LastCheck(); - } - j = external_ldofs[m-1]+1; - } - rmemcpy::rDtoD(d_ydata+j,d_xdata+j-m,(Width()+m-j)*sizeof(double)); - const int out_layout = 0; // 0 - output is ldofs array - gc->d_BcastEnd(d_ydata, out_layout); -} - - -// *************************************************************************** -// * k_Mult -// *************************************************************************** -static __global__ -void k_MultTranspose(double *y,const double *x,const int *external_ldofs, - const int m) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i>=m) { return; } - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - for (int k=0; k<(end-j); k+=1) - { - y[j-i+k]=x[j+k]; - } -} - -static __global__ -void k_MultTranspose2(double *y,const double *x,const int *external_ldofs, - const int m, const int base) -{ - const int i = base+threadIdx.x; - const int j = (i>0)?external_ldofs[i-1]+1:0; - const int end = external_ldofs[i]; - const 
int k = blockIdx.x; - if (k>=(end-j)) { return; } - y[j-i+k]=x[j+k]; -} - -// *************************************************************************** -// * Device MultTranspose -// *************************************************************************** -void RajaConformingProlongationOperator::d_MultTranspose(const RajaVector &x, - RajaVector &y) const -{ - const double *d_xdata = x.GetData(); - gc->d_ReduceBegin(d_xdata); - double *d_ydata = y.GetData(); - int j = 0; - const int m = external_ldofs.Size(); - - if (m>0) - { - const int maxXThDim = rconfig::Get().MaxXThreadsDim(); - if (m>maxXThDim) - { - const int kTpB=64; -#if defined(RAJA_ENABLE_CUDA) - k_MultTranspose<<<(m+kTpB-1)/kTpB,kTpB>>>(d_ydata,d_xdata,d_external_ldofs,m); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_MultTranspose), dim3((m+kTpB-1)/kTpB),dim3(kTpB), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m); -#endif - LastCheck(); - } - else - { - // const int TpB = rconfig::Get().MaxXThreadsDim(); - assert(kMaxTh>>(d_ydata,d_xdata,d_external_ldofs,m,base); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_MultTranspose2), dim3(kMaxTh),dim3(maxXThDim), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m,base); -#endif - LastCheck(); - } -#if defined(RAJA_ENABLE_CUDA) - k_MultTranspose2<<>>(d_ydata,d_xdata,d_external_ldofs,m,0); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_MultTranspose2), dim3(kMaxTh),dim3(m%maxXThDim), 0, 0, - d_ydata,d_xdata,d_external_ldofs.ptr(),m,0); -#endif - LastCheck(); - } - j = external_ldofs[m-1]+1; - } - rmemcpy::rDtoD(d_ydata+j-m,d_xdata+j,(Height()-j)*sizeof(double)); - const int out_layout = 2; // 2 - output is an array on all ltdofs - gc->d_ReduceEnd(d_ydata, out_layout, GroupCommunicator::Sum); -} - -// *************************************************************************** -// * Host Mult -// *************************************************************************** -void RajaConformingProlongationOperator::h_Mult(const Vector 
&x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - const int in_layout = 2; // 2 - input is ltdofs array - gc->BcastBegin(const_cast(xdata), in_layout); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - std::copy(xdata+j-i, xdata+end-i, ydata+j); - j = end+1; - } - std::copy(xdata+j-m, xdata+Width(), ydata+j); - const int out_layout = 0; // 0 - output is ldofs array - gc->BcastEnd(ydata, out_layout); -} - -// *************************************************************************** -// * Host MultTranspose -// *************************************************************************** -void RajaConformingProlongationOperator::h_MultTranspose(const Vector &x, - Vector &y) const -{ - const double *xdata = x.GetData(); - double *ydata = y.GetData(); - const int m = external_ldofs.Size(); - gc->ReduceBegin(xdata); - int j = 0; - for (int i = 0; i < m; i++) - { - const int end = external_ldofs[i]; - std::copy(xdata+j, xdata+end, ydata+j-i); - j = end+1; - } - std::copy(xdata+j, xdata+Height(), ydata+j-m); - const int out_layout = 2; // 2 - output is an array on all ltdofs - gc->ReduceEnd(ydata, out_layout, GroupCommunicator::Sum); -} - -} // namespace mfem diff --git a/raja/raja/fem/rconform.hpp b/raja/raja/fem/rconform.hpp deleted file mode 100644 index bb8d5048..00000000 --- a/raja/raja/fem/rconform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_CONFORM_PROLONGATION_OP -#define LAGHOS_RAJA_CONFORM_PROLONGATION_OP - -namespace mfem -{ - -// *************************************************************************** -// * RajaConformingProlongationOperator -// ************************************************************************** -class RajaConformingProlongationOperator : public RajaOperator -{ -protected: - Array external_ldofs; - RajaArray d_external_ldofs; - RajaCommD *gc; - int kMaxTh; -public: - RajaConformingProlongationOperator(ParFiniteElementSpace &); - ~RajaConformingProlongationOperator(); - void d_Mult(const RajaVector &x, RajaVector &y) const; - void d_MultTranspose(const RajaVector &x, RajaVector &y) const; - void h_Mult(const Vector &x, Vector &y) const; - void h_MultTranspose(const Vector &x, Vector &y) const; -}; - -} // mfem - -#endif // LAGHOS_RAJA_CONFORM_PROLONGATION_OP diff --git a/raja/raja/fem/rfespace.cpp b/raja/raja/fem/rfespace.cpp deleted file mode 100644 index 1db9f11e..00000000 --- a/raja/raja/fem/rfespace.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * RajaFiniteElementSpace -// *************************************************************************** -RajaFiniteElementSpace::RajaFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_, - Ordering::Type ordering_) - :ParFiniteElementSpace(static_cast(mesh),fec,vdim_,ordering_), - globalDofs(GetNDofs()), - localDofs(GetFE(0)->GetDof()), - offsets(globalDofs+1), - indices(localDofs, GetNE()), - map(localDofs, GetNE()) -{ - const FiniteElement *fe = GetFE(0); - const TensorBasisElement* el = dynamic_cast(fe); - const Array &dof_map = el->GetDofMap(); - const bool dof_map_is_identity = (dof_map.Size()==0); - - const Table& e2dTable = GetElementToDofTable(); - const int* elementMap = e2dTable.GetJ(); - const int elements = GetNE(); - Array h_offsets(globalDofs+1); - // We'll be keeping a count of how many local nodes point to its global dof - for (int i = 0; i <= globalDofs; ++i) - { - h_offsets[i] = 0; - } - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int gid = elementMap[localDofs*e + d]; - ++h_offsets[gid + 1]; - } - } - // Aggregate to find offsets for each global dof - for (int i = 1; i <= globalDofs; ++i) - { - h_offsets[i] += h_offsets[i - 1]; - } - - Array h_indices(localDofs*elements); - Array h_map(localDofs*elements); - // For each global dof, fill in all local nodes that point to it - for (int e = 0; e < elements; ++e) - { - for (int d = 0; d < localDofs; ++d) - { - const int did = dof_map_is_identity?d:dof_map[d]; - const int gid = elementMap[localDofs*e + did]; - const int lid = localDofs*e + d; - h_indices[h_offsets[gid]++] = lid; - 
h_map[lid] = gid; - } - } - - // We shifted the offsets vector by 1 by using it as a counter - // Now we shift it back. - for (int i = globalDofs; i > 0; --i) - { - h_offsets[i] = h_offsets[i - 1]; - } - h_offsets[0] = 0; - - offsets = h_offsets; - indices = h_indices; - map = h_map; - - const SparseMatrix* R = GetRestrictionMatrix(); assert(R); - //const Operator* P = GetProlongationMatrix(); assert(P); - const RajaConformingProlongationOperator *P = new - RajaConformingProlongationOperator(*this); - - const int mHeight = R->Height(); - const int* I = R->GetI(); - const int* J = R->GetJ(); - int trueCount = 0; - for (int i = 0; i < mHeight; ++i) - { - trueCount += ((I[i + 1] - I[i]) == 1); - } - - Array h_reorderIndices(2*trueCount); - for (int i = 0, trueIdx=0; i < mHeight; ++i) - { - if ((I[i + 1] - I[i]) == 1) - { - h_reorderIndices[trueIdx++] = J[I[i]]; - h_reorderIndices[trueIdx++] = i; - } - } - - reorderIndices = ::new RajaArray(2*trueCount); - *reorderIndices = h_reorderIndices; - - restrictionOp = new RajaRestrictionOperator(R->Height(), - R->Width(), - reorderIndices); - prolongationOp = new RajaProlongationOperator(P); -} - -// *************************************************************************** -RajaFiniteElementSpace::~RajaFiniteElementSpace() -{ - ::delete reorderIndices; -} - -// *************************************************************************** -bool RajaFiniteElementSpace::hasTensorBasis() const -{ - assert(dynamic_cast(GetFE(0))); - return true; -} - -// *************************************************************************** -void RajaFiniteElementSpace::GlobalToLocal(const RajaVector& globalVec, - RajaVector& localVec) const -{ - const int vdim = GetVDim(); - const int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rGlobalToLocal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - globalVec, - localVec); -} - -// 
*************************************************************************** -// Aggregate local node values to their respective global dofs -void RajaFiniteElementSpace::LocalToGlobal(const RajaVector& localVec, - RajaVector& globalVec) const -{ - const int vdim = GetVDim(); - const int localEntries = localDofs * GetNE(); - const bool vdim_ordering = ordering == Ordering::byVDIM; - rLocalToGlobal(vdim, - vdim_ordering, - globalDofs, - localEntries, - offsets, - indices, - localVec, - globalVec); -} - -} // namespace mfem diff --git a/raja/raja/fem/rfespace.hpp b/raja/raja/fem/rfespace.hpp deleted file mode 100644 index 6a970614..00000000 --- a/raja/raja/fem/rfespace.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_FESPACE -#define LAGHOS_RAJA_FESPACE - -namespace mfem -{ - -// *************************************************************************** -// * RajaFiniteElementSpace -// ************************************************************************** -class RajaFiniteElementSpace : public ParFiniteElementSpace -{ -private: - int globalDofs, localDofs; - RajaArray offsets; - RajaArray indices, *reorderIndices; - RajaArray map; - RajaOperator *restrictionOp, *prolongationOp; -public: - RajaFiniteElementSpace(Mesh* mesh, - const FiniteElementCollection* fec, - const int vdim_ = 1, - Ordering::Type ordering_ = Ordering::byNODES); - ~RajaFiniteElementSpace(); - // ************************************************************************* - bool hasTensorBasis() const; - int GetLocalDofs() const { return localDofs; } - const RajaOperator* GetRestrictionOperator() { return restrictionOp; } - const RajaOperator* GetProlongationOperator() { return prolongationOp; } - const RajaArray& GetLocalToGlobalMap() const { return map; } - // ************************************************************************* - void GlobalToLocal(const RajaVector&, RajaVector&) const; - void LocalToGlobal(const RajaVector&, RajaVector&) const; -}; - -} // mfem - -#endif // LAGHOS_RAJA_FESPACE diff --git a/raja/raja/fem/rgridfunc.cpp b/raja/raja/fem/rgridfunc.cpp deleted file mode 100644 index 83c934b7..00000000 --- a/raja/raja/fem/rgridfunc.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -void RajaGridFunction::ToQuad(const IntegrationRule& ir, - RajaVector& quadValues) -{ - const FiniteElement& fe = *(fes.GetFE(0)); - const int dim = fe.GetDim(); - const int vdim = fes.GetVDim(); - const int elements = fes.GetNE(); - const int numQuad = ir.GetNPoints(); - const RajaDofQuadMaps* maps = RajaDofQuadMaps::Get(fes, ir); - const int quad1D = IntRules.Get(Geometry::SEGMENT,ir.GetOrder()).GetNPoints(); - const int dofs1D =fes.GetFE(0)->GetOrder() + 1; - quadValues.SetSize(numQuad * elements); - rGridFuncToQuad(dim,vdim,dofs1D,quad1D,elements, - maps->dofToQuad, - fes.GetLocalToGlobalMap(), - ptr(), - quadValues); -} - -} // mfem diff --git a/raja/raja/fem/rgridfunc.hpp b/raja/raja/fem/rgridfunc.hpp deleted file mode 100644 index 85fcbabf..00000000 --- a/raja/raja/fem/rgridfunc.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_GRIDFUNC -#define LAGHOS_RAJA_GRIDFUNC - -namespace mfem -{ - -class RajaGridFunction : public RajaVector -{ -public: - const RajaFiniteElementSpace& fes; -public: - - RajaGridFunction(const RajaFiniteElementSpace& f): - RajaVector(f.GetVSize()),fes(f) {} - - RajaGridFunction(const RajaFiniteElementSpace& f,const RajaVector* v): - RajaVector(v), fes(f) {} - - void ToQuad(const IntegrationRule&,RajaVector&); - - RajaGridFunction& operator=(const RajaVector& v) - { - RajaVector::operator=(v); - return *this; - } - RajaGridFunction& operator=(const Vector& v) - { - RajaVector::operator=(v); - return *this; - } -}; - -} // mfem - -#endif // LAGHOS_RAJA_GRIDFUNC diff --git a/raja/raja/fem/rprolong.cpp b/raja/raja/fem/rprolong.cpp deleted file mode 100644 index 17bd7d88..00000000 --- a/raja/raja/fem/rprolong.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * RajaProlongationOperator -// *************************************************************************** -RajaProlongationOperator::RajaProlongationOperator -(const RajaConformingProlongationOperator* Op): - RajaOperator(Op->Height(), Op->Width()),pmat(Op) {} - -// *************************************************************************** -void RajaProlongationOperator::Mult(const RajaVector& x, - RajaVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_Mult(x, y); - return; - } - const Vector hostX=x;//D2H - Vector hostY(y.Size()); - pmat->h_Mult(hostX, hostY); - y=hostY;//H2D -} - -// *************************************************************************** -void RajaProlongationOperator::MultTranspose(const RajaVector& x, - RajaVector& y) const -{ - if (rconfig::Get().IAmAlone()) - { - y=x; - return; - } - if (!rconfig::Get().DoHostConformingProlongationOperator()) - { - pmat->d_MultTranspose(x, y); - return; - } - const Vector hostX=x; - Vector hostY(y.Size()); - pmat->h_MultTranspose(hostX, hostY); - y=hostY;//H2D -} - -} // namespace mfem diff --git a/raja/raja/fem/rprolong.hpp b/raja/raja/fem/rprolong.hpp deleted file mode 100644 index aff3e29f..00000000 --- a/raja/raja/fem/rprolong.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_PROLONG_OP -#define LAGHOS_RAJA_PROLONG_OP - -namespace mfem -{ - -// *************************************************************************** -// * RajaProlongationOperator -// *************************************************************************** -class RajaProlongationOperator : public RajaOperator -{ -protected: - const RajaConformingProlongationOperator* pmat = NULL; -public: - RajaProlongationOperator(const RajaConformingProlongationOperator*); - void Mult(const RajaVector& x, RajaVector& y) const; - void MultTranspose(const RajaVector& x, RajaVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_RAJA_PROLONG_OP diff --git a/raja/raja/fem/rrestrict.cpp b/raja/raja/fem/rrestrict.cpp deleted file mode 100644 index 86d9e8d5..00000000 --- a/raja/raja/fem/rrestrict.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -// * RajaRestrictionOperator -// *************************************************************************** -void RajaRestrictionOperator::Mult(const RajaVector& x, - RajaVector& y) const -{ - rExtractSubVector(entries, indices->ptr(), x, y); -} - -} // namespace mfem diff --git a/raja/raja/fem/rrestrict.hpp b/raja/raja/fem/rrestrict.hpp deleted file mode 100644 index f423033c..00000000 --- a/raja/raja/fem/rrestrict.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_RESTRICT_OP -#define LAGHOS_RAJA_RESTRICT_OP - -namespace mfem -{ - -// *************************************************************************** -// * RajaRestrictionOperator -// *************************************************************************** -class RajaRestrictionOperator : public RajaOperator -{ -protected: - int entries; - const RajaArray *indices; -public: - RajaRestrictionOperator(const int h, const int w, - const RajaArray *idx): - RajaOperator(h,w), - entries(idx->size()>>1), - indices(idx) {} - void Mult(const RajaVector& x, RajaVector& y) const ; -}; - -} // mfem - -#endif // LAGHOS_RAJA_RESTRICT_OP diff --git a/raja/raja/general/rarray.hpp b/raja/raja/general/rarray.hpp deleted file mode 100644 index 0386fff6..00000000 --- a/raja/raja/general/rarray.hpp +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_ARRAY -#define LAGHOS_RAJA_ARRAY - -namespace mfem -{ - -template class RajaArray; - -// Partial Specializations for xyz==TRUE ************************************* -template class RajaArray : public rmalloc -{ -private: - T* data = NULL; - size_t sz,d[4]; -public: - RajaArray():data(NULL),sz(0),d{0,0,0,0} {} - RajaArray(const size_t x) {allocate(x);} - RajaArray(const size_t x,const size_t y) {allocate(x,y);} - RajaArray(const RajaArray &r) {assert(false);} - RajaArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - ~RajaArray() {rmalloc::operator delete (data);} - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline const T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const RajaArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - data=(T*) rmalloc::operator new (sz); - } - inline T& operator[](const size_t x) { return data[x]; } - inline T& operator()(const size_t x, const size_t y) - { - return data[x + d[0]*y]; - } - inline T& operator()(const size_t x, const size_t y, const size_t z) - { - return data[x + d[0]*(y + d[1]*z)]; - } - void Print(std::ostream& out= std::cout, int width = 8) const - { - T *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i class RajaArray : public rmalloc -{ -private: - static const int DIM = 4; - T* data = NULL; - size_t sz,d[DIM]; -public: - RajaArray():data(NULL),sz(0),d{0,0,0,0} {} - 
RajaArray(const size_t d0) {allocate(d0);} - RajaArray(const RajaArray &r) {assert(false);} - ~RajaArray() {rmalloc::operator delete (data);} - RajaArray& operator=(Array &a) - { - rmemcpy::rHtoD(data,a.GetData(),a.Size()*sizeof(T)); - return *this; - } - inline size_t* dim() { return &d[0]; } - inline T* ptr() { return data; } - inline T* GetData() const { return data; } - inline const T* ptr() const { return data; } - inline operator T* () { return data; } - inline operator const T* () const { return data; } - double operator* (const RajaArray& a) const { return vector_dot(sz, data, a.data); } - inline size_t size() const { return sz; } - inline size_t Size() const { return sz; } - inline size_t bytes() const { return size()*sizeof(T); } - void allocate(const size_t X, const size_t Y =1, - const size_t Z =1, const size_t D =1, - const bool transposed = false) - { - d[0]=X; d[1]=Y; d[2]=Z; d[3]=D; - sz=d[0]*d[1]*d[2]*d[3]; - assert(sz>0); - data=(T*) rmalloc::operator new (sz); -#define xsw(a,b) a^=b^=a^=b - if (transposed) { xsw(d[0],d[1]); } - for (size_t i=1,b=d[0]; i static __global__ -void k_CopyGroupToBuffer(T *buf,const T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - buf[j]=data[idx]; -} - -// *************************************************************************** -// *************************************************************************** -template static -T *d_CopyGroupToBuffer_k(const T *d_ldata,T *d_buf, - const RajaTable &d_dofs, - const int group) -{ - const int ndofs = d_dofs.RowSize(group); - const int *dofs = d_dofs.GetRow(group); -#if defined(RAJA_ENABLE_CUDA) - k_CopyGroupToBuffer<<>>(d_buf,d_ldata,dofs); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_CopyGroupToBuffer), dim3(ndofs), dim3(1), 0, 0, d_buf,d_ldata,dofs); -#endif - return d_buf + ndofs; -} - -// *************************************************************************** -// * d_CopyGroupToBuffer -// 
*************************************************************************** -template -T *RajaCommD::d_CopyGroupToBuffer(const T *d_ldata, T *d_buf, - int group, int layout) const -{ - if (layout==2) // master - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ltdof,group); - } - if (layout==0) // slave - { - return d_CopyGroupToBuffer_k(d_ldata,d_buf,d_group_ldof,group); - } - assert(false); - return 0; -} - -// *************************************************************************** -// * k_CopyGroupFromBuffer -// *************************************************************************** -template static __global__ -void k_CopyGroupFromBuffer(const T *buf,T *data,const int *dofs) -{ - const int j = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[j]; - data[idx]=buf[j]; -} - -// *************************************************************************** -// * d_CopyGroupFromBuffer -// *************************************************************************** -template -const T *RajaCommD::d_CopyGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout) const -{ - assert(layout==0); - const int ndofs = d_group_ldof.RowSize(group); - const int *dofs = d_group_ldof.GetRow(group); -#if defined(RAJA_ENABLE_CUDA) - k_CopyGroupFromBuffer<<>>(d_buf,d_ldata,dofs); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((k_CopyGroupFromBuffer), dim3(ndofs), dim3(1), 0, 0, - d_buf,d_ldata,dofs); -#endif - return d_buf + ndofs; -} - -// *************************************************************************** -// * kAtomicAdd -// *************************************************************************** -template -static __global__ void kAtomicAdd(T* adrs, const int* dofs,T *value) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - const int idx = dofs[i]; - adrs[idx] += value[i]; -} -template __global__ void kAtomicAdd(int*, const int*, int*); -template __global__ void kAtomicAdd(double*, const int*, double*); - -// 
*************************************************************************** -// * ReduceGroupFromBuffer -// *************************************************************************** -template -const T *RajaCommD::d_ReduceGroupFromBuffer(const T *d_buf, T *d_ldata, - int group, int layout, - void (*Op)(OpData)) const -{ - OpData opd; - opd.ldata = d_ldata; - opd.nldofs = group_ldof.RowSize(group); - opd.nb = 1; - opd.buf = const_cast(d_buf); - opd.ldofs = const_cast(d_group_ltdof.GetRow(group)); - assert(opd.nb == 1); - // this is the operation to perform: opd.ldata[opd.ldofs[i]] += opd.buf[i]; - // mfem/general/communication.cpp, line 1008 -#if defined(RAJA_ENABLE_CUDA) - kAtomicAdd<<>>(opd.ldata,opd.ldofs,opd.buf); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((kAtomicAdd), dim3(opd.nldofs), dim3(1), 0, 0, - opd.ldata,opd.ldofs,opd.buf); -#endif - - return d_buf + opd.nldofs; -} - - -// *************************************************************************** -// * d_BcastBegin -// *************************************************************************** -template -void RajaCommD::d_BcastBegin(T *d_ldata, int layout) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - - assert(layout==2); - // const int rnk = rconfig::Get().Rank(); - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_send_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_send_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - assert(layout==2); - d_buf = d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 2); - buf += d_buf - d_buf_ini; - } - if 
(!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - - // make sure the device has finished - if (rconfig::Get().Aware()) - { -#if defined(RAJA_ENABLE_CUDA) - cudaStreamSynchronize(0);//*rconfig::Get().Stream()); -#elif defined(RAJA_ENABLE_HIP) - hipStreamSynchronize(0);//*rconfig::Get().Stream()); -#endif - } - - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - request_counter++; - } - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 40822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 1; // 1 - locked for Bcast - num_requests = request_counter; -} - -// *************************************************************************** -// * d_BcastEnd -// *************************************************************************** -template -void RajaCommD::d_BcastEnd(T *d_ldata, int layout) -{ - if (comm_lock == 0) { return; } - // const int rnk = 
rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). - assert(comm_lock == 1); - // copy the received data from the buffer to d_ldata, as it arrives - int idx; - while (MPI_Waitany(num_requests, requests, &idx, MPI_STATUS_IGNORE), - idx != MPI_UNDEFINED) - { - int nbr = request_marker[idx]; - if (nbr == -1) { continue; } // skip send requests - - const int num_recv_groups = nbr_recv_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_recv_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_CopyGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * d_ReduceBegin -// *************************************************************************** -template -void RajaCommD::d_ReduceBegin(const T *d_ldata) -{ - MFEM_VERIFY(comm_lock == 0, "object is already in use"); - if (group_buf_size == 0) { return; } - // const int rnk = rconfig::Get().Rank(); - - int request_counter = 0; - group_buf.SetSize(group_buf_size*sizeof(T)); - T *buf = (T *)group_buf.GetData(); - if (!d_group_buf) - { - d_group_buf = rmalloc::operator new (group_buf_size); - } - T *d_buf = (T*)d_group_buf; - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - const int num_send_groups = nbr_recv_groups.RowSize(nbr); - if (num_send_groups > 0) - { - T *buf_start = buf; - T *d_buf_start = d_buf; - const int *grp_list = nbr_recv_groups.GetRow(nbr); - for (int i = 0; i < num_send_groups; i++) - { - T *d_buf_ini = d_buf; - d_buf = 
d_CopyGroupToBuffer(d_ldata, d_buf, grp_list[i], 0); - buf += d_buf - d_buf_ini; - } - if (!rconfig::Get().Aware()) - { - rmemcpy::rDtoH(buf_start,d_buf_start,(buf-buf_start)*sizeof(T)); - } - // make sure the device has finished - if (rconfig::Get().Aware()) - { -#if defined(RAJA_ENABLE_CUDA) - cudaStreamSynchronize(0);//*rconfig::Get().Stream()); -#elif defined(RAJA_ENABLE_HIP) - hipStreamSynchronize(0);//*rconfig::Get().Stream()); -#endif - } - if (rconfig::Get().Aware()) - MPI_Isend(d_buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Isend(buf_start, - buf - buf_start, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = -1; // mark as send request - request_counter++; - } - - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - if (rconfig::Get().Aware()) - MPI_Irecv(d_buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - else - MPI_Irecv(buf, - recv_size, - MPITypeMap::mpi_type, - gtopo.GetNeighborRank(nbr), - 43822, - gtopo.GetComm(), - &requests[request_counter]); - request_marker[request_counter] = nbr; - request_counter++; - buf_offsets[nbr] = buf - (T*)group_buf.GetData(); - buf += recv_size; - d_buf += recv_size; - } - } - assert(buf - (T*)group_buf.GetData() == group_buf_size); - comm_lock = 2; - num_requests = request_counter; -} - -// *************************************************************************** -// * d_ReduceEnd -// *************************************************************************** 
-template -void RajaCommD::d_ReduceEnd(T *d_ldata, int layout, - void (*Op)(OpData)) -{ - if (comm_lock == 0) { return; } - // const int rnk = rconfig::Get().Rank(); - // The above also handles the case (group_buf_size == 0). - assert(comm_lock == 2); - - MPI_Waitall(num_requests, requests, MPI_STATUSES_IGNORE); - for (int nbr = 1; nbr < nbr_send_groups.Size(); nbr++) - { - // In Reduce operation: send_groups <--> recv_groups - const int num_recv_groups = nbr_send_groups.RowSize(nbr); - if (num_recv_groups > 0) - { - const int *grp_list = nbr_send_groups.GetRow(nbr); - int recv_size = 0; - for (int i = 0; i < num_recv_groups; i++) - { - recv_size += group_ldof.RowSize(grp_list[i]); - } - const T *buf = (T*)group_buf.GetData() + buf_offsets[nbr]; - assert(d_group_buf); - const T *d_buf = (T*)d_group_buf + buf_offsets[nbr]; - if (!rconfig::Get().Aware()) - { - rmemcpy::rHtoD((void*)d_buf,buf,recv_size*sizeof(T)); - } - for (int i = 0; i < num_recv_groups; i++) - { - d_buf = d_ReduceGroupFromBuffer(d_buf, d_ldata, grp_list[i], layout, Op); - } - } - } - comm_lock = 0; // 0 - no lock - num_requests = 0; -} - -// *************************************************************************** -// * instantiate RajaCommD::Bcast and Reduce for doubles -// *************************************************************************** -template void RajaCommD::d_BcastBegin(double*, int); -template void RajaCommD::d_BcastEnd(double*, int); -template void RajaCommD::d_ReduceBegin(const double *); -template void RajaCommD::d_ReduceEnd(double*,int, - void (*)(OpData)); - -} // namespace mfem diff --git a/raja/raja/general/rcommd.hpp b/raja/raja/general/rcommd.hpp deleted file mode 100644 index 24ed7a9a..00000000 --- a/raja/raja/general/rcommd.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_COMM_D -#define LAGHOS_RAJA_COMM_D - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -// * First communicator, buf goes on the device -// *************************************************************************** -class RajaCommD : public GroupCommunicator, public rmemcpy -{ -private: - RajaTable d_group_ldof; - RajaTable d_group_ltdof; - void *d_group_buf; - int comm_lock; // 0 - no lock, 1 - locked for Bcast, 2 - locked for Reduce - int num_requests; -public: - RajaCommD(ParFiniteElementSpace&); - ~RajaCommD(); - - template T *d_CopyGroupToBuffer(const T*,T*,int,int) const; - template const T *d_CopyGroupFromBuffer(const T*, T*,int, int) const; - template const T *d_ReduceGroupFromBuffer(const T*,T*,int,int, - void (*)(OpData)) const; - - template void d_BcastBegin(T*,int); - template void d_BcastEnd(T*, int); - - template void d_ReduceBegin(const T*); - template void d_ReduceEnd(T*,int,void (*)(OpData)); -}; - - -} // mfem - -#endif // LAGHOS_RAJA_COMM_D diff --git a/raja/raja/general/rmalloc.hpp b/raja/raja/general/rmalloc.hpp deleted file mode 100644 index b43e9221..00000000 --- 
a/raja/raja/general/rmalloc.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_MALLOC -#define LAGHOS_RAJA_MALLOC - -namespace mfem -{ - -// *************************************************************************** -template struct rmalloc: public rmemcpy -{ - - // ************************************************************************* - inline void* operator new (size_t n, bool lock_page = false) - { -#if defined(RAJA_ENABLE_CUDA) - if (!rconfig::Get().Cuda()) { return ::new T[n]; } - void *ptr; - if (!rconfig::Get().Uvm()) - { - if (lock_page) { cuMemHostAlloc(&ptr, n*sizeof(T), CU_MEMHOSTALLOC_PORTABLE); } - else { cuMemAlloc((CUdeviceptr*)&ptr, n*sizeof(T)); } - } - else - { - cuMemAllocManaged((CUdeviceptr*)&ptr, n*sizeof(T),CU_MEM_ATTACH_GLOBAL); - } -#elif defined(RAJA_ENABLE_HIP) - if (!rconfig::Get().Hip()) { return ::new T[n]; } - void *ptr; - - if (lock_page) { hipHostMalloc(&ptr, n*sizeof(T), hipHostMallocMapped); } - else { hipMalloc((void**)&ptr, n*sizeof(T)); } -#endif - return ptr; - } - - // *************************************************************************** - inline void operator delete (void *ptr) - { -#if defined(RAJA_ENABLE_CUDA) - if (!rconfig::Get().Cuda()) - { - if (ptr) - { - ::delete[] static_cast(ptr); - } - } - else - { - cuMemFree((CUdeviceptr)ptr); // or cuMemFreeHost if page_locked was used - } -#elif defined(RAJA_ENABLE_HIP) - if (!rconfig::Get().Hip()) - { - if (ptr) - { - ::delete[] static_cast(ptr); - } - } - else - { - hipFree(ptr); // or hipHostFree if page_locked was used - } -#endif - ptr = nullptr; - } -}; - -} // mfem - -#endif // LAGHOS_RAJA_MALLOC diff --git a/raja/raja/general/rmemcpy.cpp b/raja/raja/general/rmemcpy.cpp deleted file mode 100644 index f419d05e..00000000 --- a/raja/raja/general/rmemcpy.cpp +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -namespace mfem -{ - -// ************************************************************************* -void* rmemcpy::rHtoH(void *dest, const void *src, std::size_t bytes, - const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); - std::memcpy(dest,src,bytes); - return dest; -} - -// ************************************************************************* -void* rmemcpy::rHtoD(void *dest, const void *src, std::size_t bytes, - const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); -#if defined(RAJA_ENABLE_CUDA) - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - cuMemcpyHtoD((CUdeviceptr)dest,src,bytes); - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } -#elif defined(RAJA_ENABLE_HIP) - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - hipMemcpy(dest,src,bytes,hipMemcpyHostToDevice); -#endif - return dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoH(void *dest, const void *src, std::size_t bytes, - const bool async) -{ - if (bytes==0) { return dest; } - assert(src); 
assert(dest); -#if defined(RAJA_ENABLE_CUDA) - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - cuMemcpyDtoH(dest,(CUdeviceptr)src,bytes); - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } -#elif defined(RAJA_ENABLE_HIP) - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - hipMemcpy(dest,src,bytes,hipMemcpyDeviceToHost); -#endif - return dest; -} - -// *************************************************************************** -void* rmemcpy::rDtoD(void *dest, const void *src, std::size_t bytes, - const bool async) -{ - if (bytes==0) { return dest; } - assert(src); assert(dest); -#if defined(RAJA_ENABLE_CUDA) - if (!rconfig::Get().Cuda()) { return std::memcpy(dest,src,bytes); } - if (!rconfig::Get().Uvm()) - { - if (!async) - { - cuMemcpyDtoD((CUdeviceptr)dest,(CUdeviceptr)src,bytes); - } - else - { - const CUstream s = *rconfig::Get().Stream(); - cuMemcpyDtoDAsync((CUdeviceptr)dest,(CUdeviceptr)src,bytes,s); - } - } - else { cuMemcpy((CUdeviceptr)dest,(CUdeviceptr)src,bytes); } -#elif defined(RAJA_ENABLE_HIP) - if (!rconfig::Get().Hip()) { return std::memcpy(dest,src,bytes); } - - if (!async) - { - hipMemcpy(dest,src,bytes,hipMemcpyDeviceToDevice); - } - else - { - const hipStream_t s = *rconfig::Get().Stream(); - hipMemcpyAsync(dest,src,bytes,hipMemcpyDeviceToDevice,s); - } -#endif - return dest; -} - -} // mfem diff --git a/raja/raja/general/rmemcpy.hpp b/raja/raja/general/rmemcpy.hpp deleted file mode 100644 index cde02a92..00000000 --- a/raja/raja/general/rmemcpy.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_MEMCPY -#define LAGHOS_RAJA_MEMCPY - -namespace mfem -{ - -// *************************************************************************** -struct rmemcpy -{ - static void* rHtoH(void*, const void*, std::size_t, const bool =false); - static void* rHtoD(void*, const void*, std::size_t, const bool =false); - static void* rDtoH(void*, const void*, std::size_t, const bool =false); - static void* rDtoD(void*, const void*, std::size_t, const bool =false); -}; - -} // mfem - -#endif // LAGHOS_RAJA_MEMCPY diff --git a/raja/raja/general/rtable.cpp b/raja/raja/general/rtable.cpp deleted file mode 100644 index 63111f8c..00000000 --- a/raja/raja/general/rtable.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. 
-// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../raja.hpp" - -namespace mfem -{ - -// *************************************************************************** -RajaTable::RajaTable(const Table &table) -{ - size = table.Size(); - assert(size > 0); - const int nnz = table.GetI()[size]; - I = new int[size+1]; - J = (int*) operator new (nnz); - rHtoH(I,table.GetI(),sizeof(int)*(size+1)); - if (nnz>0) - { - assert(table.GetJ()); - rHtoD(J,table.GetJ(),sizeof(int)*nnz); - } -} - -} // mfem diff --git a/raja/raja/general/rtable.hpp b/raja/raja/general/rtable.hpp deleted file mode 100644 index 6196c182..00000000 --- a/raja/raja/general/rtable.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_TABLE -#define LAGHOS_RAJA_TABLE - -namespace mfem -{ - -class RajaTable : public rmalloc -{ -private: - int size = 0; - int *I = NULL; - int *J = NULL; -public: - RajaTable(const Table&); - inline int Size() {return size;} - int RowSize(int i) const { return I[i+1]-I[i]; } - const int *GetRow(int i) const { return J+I[i]; } - int *GetRow(int i) { return J+I[i]; } -}; - -} // mfem - -#endif // LAGHOS_RAJA_TABLE diff --git a/raja/raja/kernels/blas/vector_axpy.cpp b/raja/raja/kernels/blas/vector_axpy.cpp deleted file mode 100644 index 653d4481..00000000 --- a/raja/raja/kernels/blas/vector_axpy.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -void vector_axpy(const int N, - const double alpha, - double* __restrict v0, - const double* __restrict v1) -{ - forall(i,N,v0[i] += alpha * v1[i];); -} diff --git a/raja/raja/kernels/blas/vector_clear_dofs.cpp b/raja/raja/kernels/blas/vector_clear_dofs.cpp deleted file mode 100644 index b736354a..00000000 --- a/raja/raja/kernels/blas/vector_clear_dofs.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -void vector_clear_dofs(const int N, - double* __restrict v0, - const int* __restrict v1) -{ - forall(i,N,v0[v1[i]] = 0.0;); -} diff --git a/raja/raja/kernels/blas/vector_dot.cpp b/raja/raja/kernels/blas/vector_dot.cpp deleted file mode 100644 index 4d2baab3..00000000 --- a/raja/raja/kernels/blas/vector_dot.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -#define BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void KernelDot(const size_t N, double *gdsr, - const double *x, const double *y) -{ - __shared__ double s_dot[BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_dot[tid] = x[n] * y[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_dot[tid] += s_dot[dualTid]; - } - if (tid==0) { gdsr[bid] = s_dot[0]; } -} - -// ***************************************************************************** -double gpuVectorDot(const size_t N, const double *x, const double *y) -{ - const size_t tpb = BLOCKSIZE; - const size_t blockSize = BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t dot_sz = (N%tpb)==0? 
(N/tpb) : (1+N/tpb); - const size_t bytes = dot_sz*sizeof(double); - static double *h_dot = NULL; - if (!h_dot) { h_dot = (double*)calloc(dot_sz,sizeof(double)); } - -#if defined(RAJA_ENABLE_CUDA) - static CUdeviceptr gdsr = (CUdeviceptr) NULL; - if (!gdsr) { cuMemAlloc(&gdsr,bytes); } - KernelDot<<>>(N, (double*)gdsr, x, y); - cuMemcpy((CUdeviceptr)h_dot,(CUdeviceptr)gdsr,bytes); -#elif defined(RAJA_ENABLE_HIP) - static void* gdsr = (void*) NULL; - if (!gdsr) { hipMalloc(&gdsr,bytes); } - hipLaunchKernelGGL((KernelDot),dim3(gridSize),dim3(blockSize), 0, 0, - N, (double*)gdsr, x, y); - hipMemcpy((void*)h_dot,(void*)gdsr,bytes, hipMemcpyDeviceToHost); -#endif - - double dot = 0.0; - for (size_t i=0; i= 0 ? v1[dof_i] : -v1[-dof_i-1]; - }); -} - diff --git a/raja/raja/kernels/blas/vector_map_dofs.cpp b/raja/raja/kernels/blas/vector_map_dofs.cpp deleted file mode 100644 index 37d5e798..00000000 --- a/raja/raja/kernels/blas/vector_map_dofs.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -void vector_map_dofs(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* v2) -{ - forall(i, N, - { - const int idx = v2[i]; - v0[idx] = v1[idx]; - }); -} diff --git a/raja/raja/kernels/blas/vector_min.cpp b/raja/raja/kernels/blas/vector_min.cpp deleted file mode 100644 index 09b95b51..00000000 --- a/raja/raja/kernels/blas/vector_min.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -#define BLOCKSIZE 256 - -// ***************************************************************************** -__global__ void KernelMin(const size_t N, double *gdsr, const double *x) -{ - __shared__ double s_min[BLOCKSIZE]; - const size_t n = blockDim.x*blockIdx.x + threadIdx.x; - if (n>=N) { return; } - const size_t bid = blockIdx.x; - const size_t tid = threadIdx.x; - const size_t bbd = bid*blockDim.x; - const size_t rid = bbd+tid; - s_min[tid] = x[n]; - for (size_t workers=blockDim.x>>1; workers>0; workers>>=1) - { - __syncthreads(); - if (tid >= workers) { continue; } - if (rid >= N) { continue; } - const size_t dualTid = tid + workers; - if (dualTid >= N) { continue; } - const size_t rdd = bbd+dualTid; - if (rdd >= N) { continue; } - if (dualTid >= blockDim.x) { continue; } - s_min[tid] = fmin(s_min[tid],s_min[dualTid]); - } - if (tid==0) { gdsr[bid] = s_min[0]; } -} - -// ***************************************************************************** -double gpuVectorMin(const size_t N, const double *x) -{ - const size_t tpb = BLOCKSIZE; - const size_t blockSize = BLOCKSIZE; - const size_t gridSize = (N+blockSize-1)/blockSize; - const size_t min_sz = (N%tpb)==0? 
(N/tpb) : (1+N/tpb); - const size_t bytes = min_sz*sizeof(double); - static double *h_min = NULL; - if (!h_min) { h_min = (double*)calloc(min_sz,sizeof(double)); } - -#if defined(RAJA_ENABLE_CUDA) - static CUdeviceptr gdsr = (CUdeviceptr) NULL; - if (!gdsr) { cuMemAlloc(&gdsr,bytes); } - KernelMin<<>>(N, (double*)gdsr, x); - cuMemcpy((CUdeviceptr)h_min,(CUdeviceptr)gdsr,bytes); -#elif defined(RAJA_ENABLE_HIP) - static void* gdsr = (void*) NULL; - if (!gdsr) { hipMalloc(&gdsr,bytes); } - hipLaunchKernelGGL((KernelMin),dim3(gridSize),dim3(blockSize), 0, 0, - N, (double*)gdsr, x); - hipMemcpy((void*)h_min,(void*)gdsr,bytes, hipMemcpyDeviceToHost); -#endif - - double min = HUGE_VAL; - for (size_t i=0; i>>(N,c0,v0); -#elif defined(RAJA_ENABLE_HIP) - hipLaunchKernelGGL((d_vector_op_eq0),dim3(gridSize),dim3(blockSize), 0, 0, - N,c0,v0); -#endif -} diff --git a/raja/raja/kernels/blas/vector_set_subvector.cpp b/raja/raja/kernels/blas/vector_set_subvector.cpp deleted file mode 100644 index 56891988..00000000 --- a/raja/raja/kernels/blas/vector_set_subvector.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void vector_set_subvector(const int N, - double* __restrict v0, - const double* __restrict v1, - const int* __restrict v2) -{ - forall(i, N, - { - const int dof_i = v2[i]; - const bool tst = dof_i >= 0; - const int idx = tst?dof_i:-dof_i-1; - const double value = tst?v1[i]:-v1[i]; - v0[idx]=value; - }); -} diff --git a/raja/raja/kernels/blas/vector_set_subvector_const.cpp b/raja/raja/kernels/blas/vector_set_subvector_const.cpp deleted file mode 100644 index 499e581d..00000000 --- a/raja/raja/kernels/blas/vector_set_subvector_const.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void vector_set_subvector_const(const int N, - const double value, - double* __restrict data, - const int* __restrict tdofs) -{ - forall(i, N, - { - const int dof_i = tdofs[i]; - data[dof_i] = value; - if (dof_i >= 0) - { - data[dof_i] = value; - } - else - { - data[-dof_i-1] = -value; - } - }); -} diff --git a/raja/raja/kernels/blas/vector_vec_add.cpp b/raja/raja/kernels/blas/vector_vec_add.cpp deleted file mode 100644 index 20b86a95..00000000 --- a/raja/raja/kernels/blas/vector_vec_add.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -void vector_vec_add(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - forall(i, N, v0[i] += v1[i];); -} diff --git a/raja/raja/kernels/blas/vector_vec_mul.cpp b/raja/raja/kernels/blas/vector_vec_mul.cpp deleted file mode 100644 index a1a60a7c..00000000 --- a/raja/raja/kernels/blas/vector_vec_mul.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void vector_vec_mul(const int N, - double* __restrict v0, - const double d) -{ - forall(i, N, v0[i]*=d;); -} diff --git a/raja/raja/kernels/blas/vector_vec_sub.cpp b/raja/raja/kernels/blas/vector_vec_sub.cpp deleted file mode 100644 index 5a70fc4a..00000000 --- a/raja/raja/kernels/blas/vector_vec_sub.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void vector_vec_sub(const int N, - double* __restrict v0, - const double* __restrict v1) -{ - forall(i,N, v0[i] -= v1[i];); -} diff --git a/raja/raja/kernels/blas/vector_xpay.cpp b/raja/raja/kernels/blas/vector_xpay.cpp deleted file mode 100644 index 98d050e6..00000000 --- a/raja/raja/kernels/blas/vector_xpay.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void vector_xpay(const int N, - const double c0, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - forall(i,N, v0[i] = v1[i] + (c0 * v2[i]);); -} diff --git a/raja/raja/kernels/blas/vector_xsy.cpp b/raja/raja/kernels/blas/vector_xsy.cpp deleted file mode 100644 index b9d66302..00000000 --- a/raja/raja/kernels/blas/vector_xsy.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -void vector_xsy(const int N, - double* __restrict v0, - const double* __restrict v1, - const double* __restrict v2) -{ - forall(i,N, v0[i] = v1[i]-v2[i];); -} diff --git a/raja/raja/kernels/force/rForce.cpp b/raja/raja/kernels/force/rForce.cpp deleted file mode 100644 index edf6836e..00000000 --- a/raja/raja/kernels/force/rForce.cpp +++ /dev/null @@ -1,653 +0,0 @@ - -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -template -static void rForceMult2D( - const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - forall(el,numElements, - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijkN(dx,dy,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int c = 0; c < 2; ++c) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] = 0.0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dxy[H1_DOFS_1D]; - double xy[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] = 0.0; - xy[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double esx = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * - stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - const double esy = e_xy[ijN(qx,qy,NUM_QUAD_1D)] * - stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - xy[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = 
H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)] += wy* Dxy[dx] + wDy*xy[dx]; - } - } - } - } - } - ); -} - -// ***************************************************************************** -template -static void rForceMultTranspose2D( - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - forall(el,numElements, - { - double vStress[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - double v_Dxy[NUM_QUAD_2D]; - double v_xDy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - v_Dxy[i] = v_xDy[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double v_x[NUM_QUAD_1D]; - double v_Dx[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] = v_Dx[qx] = 0; - } - - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_x[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - v_Dx[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] += v_Dx[qx] * wy; - v_xDy[ijN(qx,qy,NUM_QUAD_1D)] += v_x[qx] * wDy; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijN(qx,qy,NUM_QUAD_1D)] += - ((v_Dxy[ijN(qx,qy,NUM_QUAD_1D)] * - stressJinvT[ijklmNM(0,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)]) + - 
(v_xDy[ijN(qx,qy,NUM_QUAD_1D)] * - stressJinvT[ijklmNM(1,c,qx,qy,el,NUM_DIM,NUM_QUAD_1D)])); - } - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijN(qx,qy,NUM_QUAD_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijkN(dx,dy,el,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - } - ); -} - -// ***************************************************************************** -template -void rForceMult3D( - const int numElements, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - forall(el,numElements, - { - double e_xyz[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - e_xyz[i] = 0; - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - double e_xy[NUM_QUAD_2D]; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - e_xy[i] = 0; - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - double e_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - e_x[qy] = 0; - } - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - const double r_e = e[ijklN(dx,dy,dz,el,L2_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_x[qx] += L2DofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * r_e; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = L2DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; 
qx < NUM_QUAD_1D; ++qx) - { - e_xy[ijN(qx,qy,NUM_QUAD_1D)] += wy * e_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = L2DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)] += wz * e_xy[ijN(qx,qy,NUM_QUAD_1D)]; - } - } - } - } - for (int c = 0; c < 3; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double Dxy_x[H1_DOFS_1D * H1_DOFS_1D]; - double xDy_y[H1_DOFS_1D * H1_DOFS_1D]; - double xy_z[H1_DOFS_1D * H1_DOFS_1D] ; - for (int d = 0; d < (H1_DOFS_1D * H1_DOFS_1D); ++d) - { - Dxy_x[d] = xDy_y[d] = xy_z[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double Dx_x[H1_DOFS_1D]; - double x_y[H1_DOFS_1D]; - double x_z[H1_DOFS_1D]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dx_x[dx] = x_y[dx] = x_z[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_e = e_xyz[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - const double esx = r_e * - stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)]; - const double esy = r_e * - stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)]; - const double esz = r_e * - stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dx_x[dx] += esx * H1QuadToDofD[ijN(dx,qx,H1_DOFS_1D)]; - x_y[dx] += esy * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - x_z[dx] += esz * H1QuadToDof[ijN(dx,qx,H1_DOFS_1D)]; - } - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - const double wy = H1QuadToDof[ijN(dy,qy,H1_DOFS_1D)]; - const double wDy = H1QuadToDofD[ijN(dy,qy,H1_DOFS_1D)]; - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - Dxy_x[ijN(dx,dy,H1_DOFS_1D)] += Dx_x[dx] * wy; - xDy_y[ijN(dx,dy,H1_DOFS_1D)] += x_y[dx] * wDy; - 
xy_z[ijN(dx,dy,H1_DOFS_1D)] += x_z[dx] * wy; - } - } - } - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - const double wz = H1QuadToDof[ijN(dz,qz,H1_DOFS_1D)]; - const double wDz = H1QuadToDofD[ijN(dz,qz,H1_DOFS_1D)]; - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)] += - ((Dxy_x[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xDy_y[ijN(dx,dy,H1_DOFS_1D)] * wz) + - (xy_z[ijN(dx,dy,H1_DOFS_1D)] * wDz)); - } - } - } - } - } - } - ); -} - -// ***************************************************************************** -template -static void rForceMultTranspose3D( - const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - forall(el,numElements, - { - double vStress[NUM_QUAD_3D]; - for (int i = 0; i < NUM_QUAD_3D; ++i) - { - vStress[i] = 0; - } - for (int c = 0; c < NUM_DIM; ++c) - { - for (int dz = 0; dz < H1_DOFS_1D; ++dz) - { - double Dxy_x[NUM_QUAD_2D]; - double xDy_y[NUM_QUAD_2D]; - double xy_z[NUM_QUAD_2D] ; - for (int i = 0; i < NUM_QUAD_2D; ++i) - { - Dxy_x[i] = xDy_y[i] = xy_z[i] = 0; - } - for (int dy = 0; dy < H1_DOFS_1D; ++dy) - { - double Dx_x[NUM_QUAD_1D]; - double x_y[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] = x_y[qx] = 0; - } - for (int dx = 0; dx < H1_DOFS_1D; ++dx) - { - const double r_v = - v[_ijklmNM(c,dx,dy,dz,el,NUM_DOFS_1D,numElements)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dx_x[qx] += r_v * H1DofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - x_y[qx] += r_v * H1DofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = H1DofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = 
H1DofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - Dxy_x[ijN(qx,qy,NUM_QUAD_1D)] += Dx_x[qx] * wy; - xDy_y[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wDy; - xy_z[ijN(qx,qy,NUM_QUAD_1D)] += x_y[qx] * wy; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = H1DofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = H1DofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)] += - ((Dxy_x[ijN(qx,qy,NUM_QUAD_1D)]*wz * - stressJinvT[ijklmnNM(0,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)]) + - (xDy_y[ijN(qx,qy,NUM_QUAD_1D)]*wz * - stressJinvT[ijklmnNM(1,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)]) + - (xy_z[ijN(qx,qy,NUM_QUAD_1D)] *wDz* - stressJinvT[ijklmnNM(2,c,qx,qy,qz,el,NUM_DIM,NUM_QUAD_1D)])); - } - } - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] = 0; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double e_xy[L2_DOFS_1D * L2_DOFS_1D]; - for (int d = 0; d < (L2_DOFS_1D * L2_DOFS_1D); ++d) - { - e_xy[d] = 0; - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double e_x[L2_DOFS_1D]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double r_v = vStress[ijkN(qx,qy,qz,NUM_QUAD_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_x[dx] += r_v * L2QuadToDof[ijN(dx,qx,L2_DOFS_1D)]; - } - } - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - const double w = L2QuadToDof[ijN(dy,qy,L2_DOFS_1D)]; - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - e_xy[ijN(dx,dy,L2_DOFS_1D)] += e_x[dx] * w; - } - } - } - for (int dz = 0; dz < L2_DOFS_1D; ++dz) - { - const double w = L2QuadToDof[ijN(dz,qz,L2_DOFS_1D)]; - for (int dy = 0; dy < L2_DOFS_1D; ++dy) - { - for (int dx = 0; dx < L2_DOFS_1D; ++dx) - { - 
e[ijklN(dx,dy,dz,el,L2_DOFS_1D)] += w * e_xy[ijN(dx,dy,L2_DOFS_1D)]; - } - } - } - } - } - ); -} - -// ***************************************************************************** -typedef void (*fForceMult)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - -// ***************************************************************************** -void rForceMult(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v) -{ - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - static std::unordered_map call = - { - {0x20,&rForceMult2D<2,2,2,1,2>}, - {0x21,&rForceMult2D<2,3,4,2,3>}, - {0x22,&rForceMult2D<2,4,6,3,4>}, - {0x23,&rForceMult2D<2,5,8,4,5>}, - {0x24,&rForceMult2D<2,6,10,5,6>}, - {0x25,&rForceMult2D<2,7,12,6,7>}, - {0x26,&rForceMult2D<2,8,14,7,8>}, - {0x27,&rForceMult2D<2,9,16,8,9>}, - {0x28,&rForceMult2D<2,10,18,9,10>}, - {0x29,&rForceMult2D<2,11,20,10,11>}, - {0x2A,&rForceMult2D<2,12,22,11,12>}, - {0x2B,&rForceMult2D<2,13,24,12,13>}, - {0x2C,&rForceMult2D<2,14,26,13,14>}, - {0x2D,&rForceMult2D<2,15,28,14,15>}, - {0x2E,&rForceMult2D<2,16,30,15,16>}, - {0x2F,&rForceMult2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMult3D<3,2,2,1,2>}, - {0x31,&rForceMult3D<3,3,4,2,3>}, - {0x32,&rForceMult3D<3,4,6,3,4>}, - {0x33,&rForceMult3D<3,5,8,4,5>}, - {0x34,&rForceMult3D<3,6,10,5,6>}, - {0x35,&rForceMult3D<3,7,12,6,7>}, - 
{0x36,&rForceMult3D<3,8,14,7,8>}, - {0x37,&rForceMult3D<3,9,16,8,9>}, - {0x38,&rForceMult3D<3,10,18,9,10>}, - {0x39,&rForceMult3D<3,11,20,10,11>}, - {0x3A,&rForceMult3D<3,12,22,11,12>}, - {0x3B,&rForceMult3D<3,13,24,12,13>}, - {0x3C,&rForceMult3D<3,14,26,13,14>}, - {0x3D,&rForceMult3D<3,15,28,14,15>}, - {0x3E,&rForceMult3D<3,16,30,15,16>}, - {0x3F,&rForceMult3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMult] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,e,v); -} - -// ***************************************************************************** -typedef void (*fForceMultTranspose)(const int numElements, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rForceMultTranspose(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e) -{ - assert(NUM_DOFS_1D==H1_DOFS_1D); - assert(L2_DOFS_1D==NUM_DOFS_1D-1); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - const unsigned int id = ((NUM_DIM)<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rForceMultTranspose2D<2,2,2,1,2>}, - {0x21,&rForceMultTranspose2D<2,3,4,2,3>}, - {0x22,&rForceMultTranspose2D<2,4,6,3,4>}, - {0x23,&rForceMultTranspose2D<2,5,8,4,5>}, - {0x24,&rForceMultTranspose2D<2,6,10,5,6>}, - {0x25,&rForceMultTranspose2D<2,7,12,6,7>}, - {0x26,&rForceMultTranspose2D<2,8,14,7,8>}, - {0x27,&rForceMultTranspose2D<2,9,16,8,9>}, - 
{0x28,&rForceMultTranspose2D<2,10,18,9,10>}, - {0x29,&rForceMultTranspose2D<2,11,20,10,11>}, - {0x2A,&rForceMultTranspose2D<2,12,22,11,12>}, - {0x2B,&rForceMultTranspose2D<2,13,24,12,13>}, - {0x2C,&rForceMultTranspose2D<2,14,26,13,14>}, - {0x2D,&rForceMultTranspose2D<2,15,28,14,15>}, - {0x2E,&rForceMultTranspose2D<2,16,30,15,16>}, - {0x2F,&rForceMultTranspose2D<2,17,32,16,17>}, - // 3D - {0x30,&rForceMultTranspose3D<3,2,2,1,2>}, - {0x31,&rForceMultTranspose3D<3,3,4,2,3>}, - {0x32,&rForceMultTranspose3D<3,4,6,3,4>}, - {0x33,&rForceMultTranspose3D<3,5,8,4,5>}, - {0x34,&rForceMultTranspose3D<3,6,10,5,6>}, - {0x35,&rForceMultTranspose3D<3,7,12,6,7>}, - {0x36,&rForceMultTranspose3D<3,8,14,7,8>}, - {0x37,&rForceMultTranspose3D<3,9,16,8,9>}, - {0x38,&rForceMultTranspose3D<3,10,18,9,10>}, - {0x39,&rForceMultTranspose3D<3,11,20,10,11>}, - {0x3A,&rForceMultTranspose3D<3,12,22,11,12>}, - {0x3B,&rForceMultTranspose3D<3,13,24,12,13>}, - {0x3C,&rForceMultTranspose3D<3,14,26,13,14>}, - {0x3D,&rForceMultTranspose3D<3,15,28,14,15>}, - {0x3E,&rForceMultTranspose3D<3,16,30,15,16>}, - {0x3F,&rForceMultTranspose3D<3,17,32,16,17>}, - }; - if (!call[id]) - { - printf("\n[rForceMultTranspose] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](nzones,L2QuadToDof,H1DofToQuad,H1DofToQuadD,stressJinvT,v,e); -} - diff --git a/raja/raja/kernels/geom/rInitGeom.cpp b/raja/raja/kernels/geom/rInitGeom.cpp deleted file mode 100644 index e6685a8e..00000000 --- a/raja/raja/kernels/geom/rInitGeom.cpp +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void rNodeCopyByVDim(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes) -{ - forall(e, elements, - { - for (int dof = 0; dof < numDofs; ++dof) - { - const int lid = dof+numDofs*e; - const int gid = eMap[lid]; - for (int v = 0; v < dims; ++v) - { - const int moffset = v+dims*lid; - const int voffset = gid+v*ndofs; - nodes[moffset] = Sx[voffset]; - } - } - } - ); -} - - -// ***************************************************************************** -template -void rIniGeom1D( - const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - forall(e,numElements, - { - double s_nodes[NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[d] = nodes[ijkN(0,d,e,NUM_QUAD)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijN(q,d,NUM_DOFS)]; - J11 += wx * s_nodes[d]; - } - J[ijN(q,e,NUM_QUAD)] = J11; - invJ[ijN(q, e,NUM_QUAD)] = 1.0 / J11; - detJ[ijN(q, e,NUM_QUAD)] = J11; - } - } - ); -} - -// ***************************************************************************** 
-template -void rIniGeom2D( - const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - forall(el,numElements, - { - double s_nodes[2 * NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d +=NUM_QUAD) - { - s_nodes[ijN(0,d,2)] = nodes[ijkNM(0,d,el,2,NUM_DOFS)]; - s_nodes[ijN(1,d,2)] = nodes[ijkNM(1,d,el,2,NUM_DOFS)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; - double J21 = 0; double J22 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0,q,d,2,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1,q,d,2,NUM_QUAD)]; - const double x = s_nodes[ijN(0,d,2)]; - const double y = s_nodes[ijN(1,d,2)]; - J11 += (wx * x); J12 += (wx * y); - J21 += (wy * x); J22 += (wy * y); - } - const double r_detJ = (J11 * J22)-(J12 * J21); - J[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, el,2,NUM_QUAD)] = J12; - J[ijklNM(0, 1, q, el,2,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J22; - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, el,2,NUM_QUAD)] = J22 * r_idetJ; - invJ[ijklNM(1, 0, q, el,2,NUM_QUAD)] = -J12 * r_idetJ; - invJ[ijklNM(0, 1, q, el,2,NUM_QUAD)] = -J21 * r_idetJ; - invJ[ijklNM(1, 1, q, el,2,NUM_QUAD)] = J11 * r_idetJ; - detJ[ijN(q, el,NUM_QUAD)] = r_detJ; - } - } - ); -} - -// ***************************************************************************** -template -void rIniGeom3D( - const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - forall(e,numElements, - { - double s_nodes[3*NUM_DOFS]; - for (int q = 0; q < NUM_QUAD; ++q) - { - for (int d = q; d < NUM_DOFS; d += NUM_QUAD) - { - s_nodes[ijN(0,d,3)] = nodes[ijkNM(0, d, e,3,NUM_DOFS)]; - s_nodes[ijN(1,d,3)] = nodes[ijkNM(1, d, e,3,NUM_DOFS)]; - s_nodes[ijN(2,d,3)] 
= nodes[ijkNM(2, d, e,3,NUM_DOFS)]; - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double J11 = 0; double J12 = 0; double J13 = 0; - double J21 = 0; double J22 = 0; double J23 = 0; - double J31 = 0; double J32 = 0; double J33 = 0; - for (int d = 0; d < NUM_DOFS; ++d) - { - const double wx = dofToQuadD[ijkNM(0, q, d,3,NUM_QUAD)]; - const double wy = dofToQuadD[ijkNM(1, q, d,3,NUM_QUAD)]; - const double wz = dofToQuadD[ijkNM(2, q, d,3,NUM_QUAD)]; - const double x = s_nodes[ijN(0, d,3)]; - const double y = s_nodes[ijN(1, d,3)]; - const double z = s_nodes[ijN(2, d,3)]; - J11 += (wx * x); J12 += (wx * y); J13 += (wx * z); - J21 += (wy * x); J22 += (wy * y); J23 += (wy * z); - J31 += (wz * x); J32 += (wz * y); J33 += (wz * z); - } - const double r_detJ = ((J11 * J22 * J33) + (J12 * J23 * J31) + - (J13 * J21 * J32) - - (J13 * J22 * J31)-(J12 * J21 * J33)-(J11 * J23 * J32)); - J[ijklNM(0, 0, q, e,3,NUM_QUAD)] = J11; - J[ijklNM(1, 0, q, e,3,NUM_QUAD)] = J12; - J[ijklNM(2, 0, q, e,3,NUM_QUAD)] = J13; - J[ijklNM(0, 1, q, e,3,NUM_QUAD)] = J21; - J[ijklNM(1, 1, q, e,3,NUM_QUAD)] = J22; - J[ijklNM(2, 1, q, e,3,NUM_QUAD)] = J23; - J[ijklNM(0, 2, q, e,3,NUM_QUAD)] = J31; - J[ijklNM(1, 2, q, e,3,NUM_QUAD)] = J32; - J[ijklNM(2, 2, q, e,3,NUM_QUAD)] = J33; - - const double r_idetJ = 1.0 / r_detJ; - invJ[ijklNM(0, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J22 * J33)-(J23 * J32)); - invJ[ijklNM(1, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J32 * J13)-(J33 * J12)); - invJ[ijklNM(2, 0, q, e,3,NUM_QUAD)] = r_idetJ * ((J12 * J23)-(J13 * J22)); - - invJ[ijklNM(0, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J23 * J31)-(J21 * J33)); - invJ[ijklNM(1, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J33 * J11)-(J31 * J13)); - invJ[ijklNM(2, 1, q, e,3,NUM_QUAD)] = r_idetJ * ((J13 * J21)-(J11 * J23)); - - invJ[ijklNM(0, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J21 * J32)-(J22 * J31)); - invJ[ijklNM(1, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J31 * J12)-(J32 * J11)); - invJ[ijklNM(2, 2, q, e,3,NUM_QUAD)] = r_idetJ * ((J11 * J22)-(J12 * J21)); 
- detJ[ijN(q, e,NUM_QUAD)] = r_detJ; - } - } - ); -} - -// ***************************************************************************** -typedef void (*fIniGeom)(const int numElements, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ); - - -// ***************************************************************************** -void rIniGeom(const int DIM, - const int NUM_DOFS, - const int NUM_QUAD, - const int numElements, - const double* dofToQuadD, - const double* nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ) -{ - const unsigned int dofs1D = IROOT(DIM,NUM_DOFS); - const unsigned int quad1D = IROOT(DIM,NUM_QUAD); - const unsigned int id = (DIM<<4)|(dofs1D-2); - assert(LOG2(DIM)<=4); - assert(LOG2(dofs1D-2)<=4); - if (quad1D!=2*(dofs1D-1)) - { - printf("\033[31;1m[rIniGeom] order ERROR: -ok=p -ot=p-1, p in [1,16] (%d,%d)\033[m\n", - quad1D,dofs1D); - return exit(1); - } - assert(quad1D==2*(dofs1D-1)); - static std::unordered_map call = - { - // 2D - {0x20,&rIniGeom2D<2*2,(2*2-2)*(2*2-2)>}, - {0x21,&rIniGeom2D<3*3,(3*2-2)*(3*2-2)>}, - {0x22,&rIniGeom2D<4*4,(4*2-2)*(4*2-2)>}, - {0x23,&rIniGeom2D<5*5,(5*2-2)*(5*2-2)>}, - {0x24,&rIniGeom2D<6*6,(6*2-2)*(6*2-2)>}, - {0x25,&rIniGeom2D<7*7,(7*2-2)*(7*2-2)>}, - {0x26,&rIniGeom2D<8*8,(8*2-2)*(8*2-2)>}, - {0x27,&rIniGeom2D<9*9,(9*2-2)*(9*2-2)>}, - {0x28,&rIniGeom2D<10*10,(10*2-2)*(10*2-2)>}, - {0x29,&rIniGeom2D<11*11,(11*2-2)*(11*2-2)>}, - {0x2A,&rIniGeom2D<12*12,(12*2-2)*(12*2-2)>}, - {0x2B,&rIniGeom2D<13*13,(13*2-2)*(13*2-2)>}, - {0x2C,&rIniGeom2D<14*14,(14*2-2)*(14*2-2)>}, - {0x2D,&rIniGeom2D<15*15,(15*2-2)*(15*2-2)>}, - {0x2E,&rIniGeom2D<16*16,(16*2-2)*(16*2-2)>}, - {0x2F,&rIniGeom2D<17*17,(17*2-2)*(17*2-2)>}, - // 3D - {0x30,&rIniGeom3D<2*2*2,2*2*2>}, - {0x31,&rIniGeom3D<3*3*3,4*4*4>}, - {0x32,&rIniGeom3D<4*4*4,6*6*6>}, - {0x33,&rIniGeom3D<5*5*5,8*8*8>}, - {0x34,&rIniGeom3D<6*6*6,10*10*10>}, - 
{0x35,&rIniGeom3D<7*7*7,12*12*12>}, - {0x36,&rIniGeom3D<8*8*8,14*14*14>}, - {0x37,&rIniGeom3D<9*9*9,16*16*16>}, - {0x38,&rIniGeom3D<10*10*10,18*18*18>}, - {0x39,&rIniGeom3D<11*11*11,20*20*20>}, - {0x3A,&rIniGeom3D<12*12*12,22*22*22>}, - {0x3B,&rIniGeom3D<13*13*13,24*24*24>}, - {0x3C,&rIniGeom3D<14*14*14,26*26*26>}, - {0x3D,&rIniGeom3D<15*15*15,28*28*28>}, - {0x3E,&rIniGeom3D<16*16*16,30*30*30>}, - {0x3F,&rIniGeom3D<17*17*17,32*32*32>}, - }; - if (!call[id]) - { - printf("\n[rIniGeom] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](numElements,dofToQuadD,nodes,J,invJ,detJ); -} diff --git a/raja/raja/kernels/include/forall.hpp b/raja/raja/kernels/include/forall.hpp deleted file mode 100644 index e40e3486..00000000 --- a/raja/raja/kernels/include/forall.hpp +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_KERNELS_FORALL -#define LAGHOS_RAJA_KERNELS_FORALL - -// ***************************************************************************** -#if defined(RAJA_ENABLE_CUDA) -#define CUDA_BLOCK_SIZE 256 - -#define cu_device __device__ -#define cu_exec RAJA::cuda_exec -#define cu_reduce RAJA::cuda_reduce - -#define sq_device __host__ -#define sq_exec RAJA::seq_exec -#define sq_reduce RAJA::seq_reduce - -#define ReduceDecl(type,var,ini) \ - RAJA::Reduce ## type var(ini); -#define ReduceForall(i,max,body) \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]sq_device(RAJA::Index_type i) {body}); - -#define forall(i,max,body) \ - if (mfem::rconfig::Get().Cuda()) \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]cu_device(RAJA::Index_type i) {body}); \ - else \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]sq_device(RAJA::Index_type i) {body}); - -#elif defined(RAJA_ENABLE_HIP) -#define HIP_BLOCK_SIZE 256 - -#define hip_device __device__ -#define hip_exec RAJA::hip_exec -#define hip_reduce RAJA::hip_reduce - -#define sq_device __host__ -#define sq_exec RAJA::seq_exec -#define sq_reduce RAJA::seq_reduce - -#define ReduceDecl(type,var,ini) \ - RAJA::Reduce ## type var(ini); -#define ReduceForall(i,max,body) \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]sq_device(RAJA::Index_type i) {body}); - -#define forall(i,max,body) \ - if (mfem::rconfig::Get().Hip()) \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]hip_device(RAJA::Index_type i) {body}); \ - else \ - RAJA::forall(RAJA::RangeSegment(0, max),[=]sq_device(RAJA::Index_type i) {body}); - -#endif - -#endif // LAGHOS_RAJA_KERNELS_FORALL diff --git a/raja/raja/kernels/include/kernels.hpp b/raja/raja/kernels/include/kernels.hpp deleted file mode 100644 index 4f2d0fd2..00000000 --- a/raja/raja/kernels/include/kernels.hpp +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. 
See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_KERNELS -#define LAGHOS_RAJA_KERNELS - -#define restrict __restrict__ - -// **** BLAS1 ****************************************************************** -void vector_neg(const int, double* restrict); -//extern "C" kernel void d_vector_op_eq(const int, const double, double* restrict); -void vector_op_eq(const int, const double, double* restrict); -void vector_xpay(const int, const double, double* restrict, - const double* restrict, - const double* restrict); -void vector_xsy(const int, double* restrict, const double* restrict, - const double* restrict); -void vector_axpy(const int, const double, double* restrict, - const double* restrict); -void vector_map_dofs(const int, double* restrict, const double* restrict, - const int* restrict); -template -void vector_map_add_dofs(const int, T* restrict, const T* restrict, - const int* restrict); -void vector_clear_dofs(const int, double* restrict, const int* restrict); -void vector_vec_sub(const int, double* restrict, const double* restrict); -void vector_vec_add(const int, double* restrict, const double* restrict); -void vector_vec_mul(const int, double* restrict, const double); -void 
vector_set_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_get_subvector(const int, double* restrict, const double* restrict, - const int* restrict); -void vector_set_subvector_const(const int, const double, double* restrict, - const int* restrict); -double vector_dot(const int, const double* restrict, const double* restrict); -double vector_min(const int, const double* restrict); - -// ***************************************************************************** -void reduceMin(int, const double*, double*); -void reduceSum(int, const double*, const double*, double*); - -// ***************************************************************************** -void rGridFuncToQuad(const int dim, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* restrict dofToQuad, - const int* l2gMap, - const double* restrict gf, - double* restrict out); - -void rGridFuncToQuadS(const int dim, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* restrict dofToQuad, - const int* l2gMap, - const double* restrict gf, - double* restrict out); - -// mapping ********************************************************************* -void rSetSubVector(const int entries, - const int* restrict indices, - const double* restrict in, - double* restrict out); - -void rMapSubVector(const int entries, - const int* restrict indices, - const double* restrict in, - double* restrict out); - -void rExtractSubVector(const int entries, - const int* restrict indices, - const double* restrict in, - double* restrict out); - -// kQuadratureData ************************************************************* -void rInitQuadratureData(const int NUM_QUAD, - const int numElements, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w); - -void rUpdateQuadratureData(const double 
GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); -void rUpdateQuadratureDataS(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// kForce ********************************************************************** -void rForceMult(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); -void rForceMultS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2DofToQuad, - const double* restrict H1QuadToDof, - const double* restrict H1QuadToDofD, - const double* restrict stressJinvT, - const double* restrict e, - double* restrict v); - 
-void rForceMultTranspose(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); -void rForceMultTransposeS(const int NUM_DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int L2_DOFS_1D, - const int H1_DOFS_1D, - const int nzones, - const double* restrict L2QuadToDof, - const double* restrict H1DofToQuad, - const double* restrict H1DofToQuadD, - const double* restrict stressJinvT, - const double* restrict v, - double* restrict e); - -// ***************************************************************************** -void rNodeCopyByVDim(const int elements, - const int numDofs, - const int ndofs, - const int dims, - const int* eMap, - const double* Sx, - double* nodes); - -// ***************************************************************************** -void rIniGeom(const int dim, - const int nDofs, - const int nQuads, - const int nzones, - const double* restrict dofToQuadD, - const double* restrict nodes, - double* restrict J, - double* restrict invJ, - double* restrict detJ); - -// ***************************************************************************** -void rGlobalToLocal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* restrict offsets, - const int* restrict indices, - const double* restrict globalX, - double* restrict localX); - -void rLocalToGlobal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* restrict offsets, - const int* restrict indices, - const double* restrict localX, - double* restrict globalX); - -// ***************************************************************************** -void rMassMultAdd(const int dim, - const int 
NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict op, - const double* restrict x, - double* restrict y); -void rMassMultAddS(const int dim, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict op, - const double* restrict x, - double* restrict y); - -#endif // LAGHOS_RAJA_KERNELS diff --git a/raja/raja/kernels/include/offsets.hpp b/raja/raja/kernels/include/offsets.hpp deleted file mode 100644 index 0daa9c76..00000000 --- a/raja/raja/kernels/include/offsets.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_KERNEL_OFFSETS -#define LAGHOS_RAJA_KERNEL_OFFSETS - -// Offsets ********************************************************************* -#define ijN(i,j,N) (i)+(N)*(j) -#define ijkN(i,j,k,N) (i)+(N)*((j)+(N)*(k)) -#define ijklN(i,j,k,l,N) (i)+(N)*((j)+(N)*((k)+(N)*(l))) - -#define ijNMt(i,j,N,M,t) (t)?((i)+(N)*(j)):((j)+(M)*(i)) -#define ijkNM(i,j,k,N,M) (i)+(N)*((j)+(M)*(k)) -#define _ijkNM(i,j,k,N,M) (j)+(N)*((k)+(M)*(i)) -#define ijklNM(i,j,k,l,N,M) (i)+(N)*((j)+(N)*((k)+(M)*(l))) -#define _ijklNM(i,j,k,l,N,M) (j)+(N)*((k)+(N)*((l)+(M)*(i))) -#define ijklmNM(i,j,k,l,m,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*(m)))) -#define _ijklmNM(i,j,k,l,m,N,M) (j)+(N)*((k)+(N)*((l)+(N)*((m)+(M)*(i)))) -#define ijklmnNM(i,j,k,l,m,n,N,M) (i)+(N)*((j)+(N)*((k)+(M)*((l)+(M)*((m)+(M)*(n))))) - -#endif // LAGHOS_RAJA_KERNEL_OFFSETS diff --git a/raja/raja/kernels/maps/rGlobalToLocal.cpp b/raja/raja/kernels/maps/rGlobalToLocal.cpp deleted file mode 100644 index 46add593..00000000 --- a/raja/raja/kernels/maps/rGlobalToLocal.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void rGlobalToLocal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* __restrict offsets, - const int* __restrict indices, - const double* __restrict globalX, - double* __restrict localX) -{ - forall(i,globalEntries, - { - const int offset = offsets[i]; - const int nextOffset = offsets[i+1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - const double dofValue = globalX[g_offset]; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - localX[l_offset] = dofValue; - } - } - }); -} diff --git a/raja/raja/kernels/maps/rLocalToGlobal.cpp b/raja/raja/kernels/maps/rLocalToGlobal.cpp deleted file mode 100644 index 8b2ae136..00000000 --- a/raja/raja/kernels/maps/rLocalToGlobal.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void rLocalToGlobal(const int NUM_VDIM, - const bool VDIM_ORDERING, - const int globalEntries, - const int localEntries, - const int* offsets, - const int* indices, - const double* localX, - double* __restrict globalX) -{ - forall(i,globalEntries, - { - const int offset = offsets[i]; - const int nextOffset = offsets[i + 1]; - for (int v = 0; v < NUM_VDIM; ++v) - { - double dofValue = 0; - for (int j = offset; j < nextOffset; ++j) - { - const int l_offset = ijNMt(v,indices[j],NUM_VDIM,localEntries,VDIM_ORDERING); - dofValue += localX[l_offset]; - } - const int g_offset = ijNMt(v,i,NUM_VDIM,globalEntries,VDIM_ORDERING); - globalX[g_offset] = dofValue; - } - }); -} diff --git a/raja/raja/kernels/maps/rMapping.cpp b/raja/raja/kernels/maps/rMapping.cpp deleted file mode 100644 index c82dda33..00000000 --- a/raja/raja/kernels/maps/rMapping.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. 
-// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -void rSetSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - forall(i,N,out[indices[i]] = in[i];); -} - -// ***************************************************************************** -void rMapSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - forall(i,N, - { - const int fromIdx = indices[2*i + 0]; - const int toIdx = indices[2*i + 1]; - out[toIdx] = in[fromIdx]; - }); -} - -// ***************************************************************************** -void rExtractSubVector(const int N, - const int* indices, - const double* in, - double* __restrict out) -{ - forall(i,N,out[i] = in[indices[i]];); -} diff --git a/raja/raja/kernels/mass/rMassAssemble.cpp b/raja/raja/kernels/mass/rMassAssemble.cpp deleted file mode 100644 index 3a300c31..00000000 --- a/raja/raja/kernels/mass/rMassAssemble.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. 
For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - -// ***************************************************************************** -static void rMassAssemble2D(const int numElements, - const int NUM_QUAD_2D, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - forall(e, numElements, - { - for (int q = 0; q < NUM_QUAD_2D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,2,NUM_QUAD_2D)]; - const double J12 = J[ijklNM(1,0,q,e,2,NUM_QUAD_2D)]; - const double J21 = J[ijklNM(0,1,q,e,2,NUM_QUAD_2D)]; - const double J22 = J[ijklNM(1,1,q,e,2,NUM_QUAD_2D)]; - const double detJ = ((J11 * J22)-(J21 * J12)); - oper[ijN(q,e,NUM_QUAD_2D)] = quadWeights[q] * COEFF * detJ; - } - }); -} - -// ***************************************************************************** -static void rMassAssemble3D(const int NUM_QUAD_3D, - const int numElements, - const double COEFF, - const double* quadWeights, - const double* J, - double* __restrict oper) -{ - forall(e,numElements, - { - for (int q = 0; q < NUM_QUAD_3D; ++q) - { - const double J11 = J[ijklNM(0,0,q,e,3,NUM_QUAD_3D)]; - const double J12 = J[ijklNM(1,0,q,e,3,NUM_QUAD_3D)]; - const double J13 = J[ijklNM(2,0,q,e,3,NUM_QUAD_3D)]; - const double J21 = J[ijklNM(0,1,q,e,3,NUM_QUAD_3D)]; - const double J22 = J[ijklNM(1,1,q,e,3,NUM_QUAD_3D)]; - const double J23 = J[ijklNM(2,1,q,e,3,NUM_QUAD_3D)]; - const double J31 = J[ijklNM(0,2,q,e,3,NUM_QUAD_3D)]; - const double 
J32 = J[ijklNM(1,2,q,e,3,NUM_QUAD_3D)]; - const double J33 = J[ijklNM(2,2,q,e,3,NUM_QUAD_3D)]; - const double detJ = ((J11*J22*J33)+(J12*J23*J31)+ - (J13*J21*J32)-(J13*J22*J31)- - (J12*J21*J33)-(J11*J23*J32)); - oper[ijN(q,e,NUM_QUAD_3D)] = quadWeights[q]*COEFF*detJ; - } - } ); -} - -// ***************************************************************************** -void rMassAssemble(const int dim, - const int NUM_QUAD, - const int numElements, - const double* quadWeights, - const double* J, - const double COEFF, - double* __restrict oper) -{ - assert(false); - if (dim==1) { assert(false); } - if (dim==2) { rMassAssemble2D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } - if (dim==3) { rMassAssemble3D(numElements,NUM_QUAD,COEFF,quadWeights,J,oper); } -} diff --git a/raja/raja/kernels/mass/rMassMultAdd.cpp b/raja/raja/kernels/mass/rMassMultAdd.cpp deleted file mode 100644 index f91ae598..00000000 --- a/raja/raja/kernels/mass/rMassMultAdd.cpp +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -template -void rMassMultAdd2D( - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadToDof, - const double* restrict quadToDofD, - const double* restrict oper, - const double* restrict solIn, - double* restrict solOut) -{ - - forall(e,numElements, - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0.0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - sol_x[qy] = 0.0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijkN(dx,dy,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]* s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += d2q * sol_x[qx]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] *= oper[ijkN(qx,qy,e,NUM_QUAD_1D)]; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0.0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xy[qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double q2d = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijkN(dx,dy,e,NUM_DOFS_1D)] += q2d * sol_x[dx]; - } - } - } - } - ); -} - -// ***************************************************************************** -template -void rMassMultAdd3D( - 
const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut) -{ - forall(e,numElements, - { - double sol_xyz[NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] = 0; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double sol_xy[NUM_QUAD_1D][NUM_QUAD_1D]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] = 0; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double sol_x[NUM_QUAD_1D]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] = 0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const double s = solIn[ijklN(dx,dy,dz,e,NUM_DOFS_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_x[qx] += dofToQuad[ijN(qx,dx,NUM_QUAD_1D)] * s; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xy[qy][qx] += wy * sol_x[qx]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - sol_xyz[qz][qy][qx] *= oper[ijklN(qx,qy,qz,e,NUM_QUAD_1D)]; - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - double sol_xy[NUM_DOFS_1D][NUM_DOFS_1D]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] = 0; - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - double 
sol_x[NUM_DOFS_1D]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] = 0; - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double s = sol_xyz[qz][qy][qx]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_x[dx] += quadToDof[ijN(dx,qx,NUM_DOFS_1D)] * s; - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - const double wy = quadToDof[ijN(dy,qy,NUM_DOFS_1D)]; - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - sol_xy[dy][dx] += wy * sol_x[dx]; - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - const double wz = quadToDof[ijN(dz,qz,NUM_DOFS_1D)]; - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - solOut[ijklN(dx,dy,dz,e,NUM_DOFS_1D)] += wz * sol_xy[dy][dx]; - } - } - } - } - } - ); -} - -// ***************************************************************************** -typedef void (*fMassMultAdd)(const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* oper, - const double* solIn, - double* __restrict solOut); - -// ***************************************************************************** -void rMassMultAdd(const int DIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const double* dofToQuadD, - const double* quadToDof, - const double* quadToDofD, - const double* op, - const double* x, - double* __restrict y) -{ - assert(LOG2(DIM)<=4); - assert((NUM_QUAD_1D&1)==0); - assert(LOG2(NUM_DOFS_1D-1)<=8); - assert(LOG2(NUM_QUAD_1D>>1)<=8); - const unsigned int id = (DIM<<16)|((NUM_DOFS_1D-1)<<8)|(NUM_QUAD_1D>>1); - static std::unordered_map call = - { - // 2D - {0x20001,&rMassMultAdd2D<1,2>}, {0x20101,&rMassMultAdd2D<2,2>}, - {0x20102,&rMassMultAdd2D<2,4>}, {0x20202,&rMassMultAdd2D<3,4>}, - {0x20203,&rMassMultAdd2D<3,6>}, {0x20303,&rMassMultAdd2D<4,6>}, - {0x20304,&rMassMultAdd2D<4,8>}, {0x20404,&rMassMultAdd2D<5,8>}, - 
{0x20405,&rMassMultAdd2D<5,10>}, {0x20505,&rMassMultAdd2D<6,10>}, - {0x20506,&rMassMultAdd2D<6,12>}, {0x20606,&rMassMultAdd2D<7,12>}, - {0x20607,&rMassMultAdd2D<7,14>}, {0x20707,&rMassMultAdd2D<8,14>}, - {0x20708,&rMassMultAdd2D<8,16>}, {0x20808,&rMassMultAdd2D<9,16>}, - {0x20809,&rMassMultAdd2D<9,18>}, {0x20909,&rMassMultAdd2D<10,18>}, - {0x2090A,&rMassMultAdd2D<10,20>}, {0x20A0A,&rMassMultAdd2D<11,20>}, - {0x20A0B,&rMassMultAdd2D<11,22>}, {0x20B0B,&rMassMultAdd2D<12,22>}, - {0x20B0C,&rMassMultAdd2D<12,24>}, {0x20C0C,&rMassMultAdd2D<13,24>}, - {0x20C0D,&rMassMultAdd2D<13,26>}, {0x20D0D,&rMassMultAdd2D<14,26>}, - {0x20D0E,&rMassMultAdd2D<14,28>}, {0x20E0E,&rMassMultAdd2D<15,28>}, - {0x20E0F,&rMassMultAdd2D<15,30>}, {0x20F0F,&rMassMultAdd2D<16,30>}, - {0x20F10,&rMassMultAdd2D<16,32>}, {0x21010,&rMassMultAdd2D<17,32>}, - // 3D - {0x30001,&rMassMultAdd3D<1,2>}, {0x30101,&rMassMultAdd3D<2,2>}, - {0x30102,&rMassMultAdd3D<2,4>}, {0x30202,&rMassMultAdd3D<3,4>}, - {0x30203,&rMassMultAdd3D<3,6>}, {0x30303,&rMassMultAdd3D<4,6>}, - {0x30304,&rMassMultAdd3D<4,8>}, {0x30404,&rMassMultAdd3D<5,8>}, - {0x30405,&rMassMultAdd3D<5,10>}, {0x30505,&rMassMultAdd3D<6,10>}, - {0x30506,&rMassMultAdd3D<6,12>}, {0x30606,&rMassMultAdd3D<7,12>}, - {0x30607,&rMassMultAdd3D<7,14>}, {0x30707,&rMassMultAdd3D<8,14>}, - {0x30708,&rMassMultAdd3D<8,16>}, {0x30808,&rMassMultAdd3D<9,16>}, - {0x30809,&rMassMultAdd3D<9,18>}, {0x30909,&rMassMultAdd3D<10,18>}, - {0x3090A,&rMassMultAdd3D<10,20>}, {0x30A0A,&rMassMultAdd3D<11,20>}, - {0x30A0B,&rMassMultAdd3D<11,22>}, {0x30B0B,&rMassMultAdd3D<12,22>}, - {0x30B0C,&rMassMultAdd3D<12,24>}, {0x30C0C,&rMassMultAdd3D<13,24>}, - {0x30C0D,&rMassMultAdd3D<13,26>}, {0x30D0D,&rMassMultAdd3D<14,26>}, - {0x30D0E,&rMassMultAdd3D<14,28>}, {0x30E0E,&rMassMultAdd3D<15,28>}, - {0x30E0F,&rMassMultAdd3D<15,30>}, {0x30F0F,&rMassMultAdd3D<16,30>}, - {0x30F10,&rMassMultAdd3D<16,32>}, {0x31010,&rMassMultAdd3D<17,32>}, - }; - if (!call[id]) - { - printf("\n[rMassMultAdd] id 
\033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](numElements,dofToQuad,dofToQuadD,quadToDof,quadToDofD,op,x,y); -} diff --git a/raja/raja/kernels/quad/rGridFuncToQuad.cpp b/raja/raja/kernels/quad/rGridFuncToQuad.cpp deleted file mode 100644 index 6c267a17..00000000 --- a/raja/raja/kernels/quad/rGridFuncToQuad.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -template -void rGridFuncToQuad1D( - const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - forall(e,numElements, - { - double r_out[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[(dx) + (NUM_DOFS_1D) * (e)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid * NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - r_out[v][qx] += r_gf * dofToQuad[(qx) + (NUM_QUAD_1D) * (dx)]; - } - } - } - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[(qx) + (NUM_QUAD_1D) * ((e) + (numElements) * (v))] = r_out[v][qx]; - } - } - } - ); -} - -// ***************************************************************************** -template -void rGridFuncToQuad2D( - const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - forall(e,numElements, - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijkN(dx, dy, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - out_x[v][qy] += r_gf * dofToQuad[ijN(qy, dx,NUM_QUAD_1D)]; - } - } - } - 
for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double d2q = dofToQuad[ijN(qy, dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += d2q * out_x[v][qx]; - } - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklNM(v, qx, qy, e,NUM_QUAD_1D,numElements)] = out_xy[v][qy][qx]; - } - } - } - } - ); -} - -// ***************************************************************************** -template -void rGridFuncToQuad3D( - const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* restrict gf, - double* restrict out) -{ - forall(e,numElements, - { - double out_xyz[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] = 0; - } - } - } - } - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double out_xy[NUM_VDIM][NUM_QUAD_1D][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] = 0; - } - } - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double out_x[NUM_VDIM][NUM_QUAD_1D]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] = 0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - const int gid = l2gMap[ijklN(dx, dy, dz, e,NUM_DOFS_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - const double r_gf = gf[v + gid*NUM_VDIM]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_x[v][qx] += r_gf * dofToQuad[ijN(qx, dx, NUM_QUAD_1D)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy, dy, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) 
- { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xy[v][qy][qx] += wy * out_x[v][qx]; - } - } - } - } - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz, dz, NUM_QUAD_1D)]; - for (int v = 0; v < NUM_VDIM; ++v) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - out_xyz[v][qz][qy][qx] += wz * out_xy[v][qy][qx]; - } - } - } - } - } - - for (int qz = 0; qz < NUM_QUAD_1D; ++qz) - { - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int v = 0; v < NUM_VDIM; ++v) - { - out[_ijklmNM(v, qx, qy, qz, e,NUM_QUAD_1D, - numElements)] = out_xyz[v][qz][qy][qx]; - } - } - } - } - } - ); -} - -// ***************************************************************************** -typedef void (*fGridFuncToQuad)(const int numElements, - const double* restrict dofToQuad, - const int* restrict l2gMap, - const double* gf, - double* restrict out); - -// ***************************************************************************** -void rGridFuncToQuad(const int DIM, - const int NUM_VDIM, - const int NUM_DOFS_1D, - const int NUM_QUAD_1D, - const int numElements, - const double* dofToQuad, - const int* l2gMap, - const double* gf, - double* __restrict out) -{ - const unsigned int id = (DIM<<8)|(NUM_VDIM<<4)|(NUM_DOFS_1D-1); - assert(LOG2(DIM)<=4); - assert(LOG2(NUM_VDIM)<=4); - assert(LOG2(NUM_DOFS_1D-1)<=4); - assert(NUM_QUAD_1D==2*NUM_DOFS_1D); - if (NUM_QUAD_1D!=2*NUM_DOFS_1D) - { - printf("\033[31;1m[rGridFuncToQuad] order ERROR: -ok=p -ot=p-1, p in [1,16]\033[m\n"); - return exit(1); - } - static std::unordered_map call = - { - // 2D - {0x210,&rGridFuncToQuad2D<1,1,2>}, - {0x211,&rGridFuncToQuad2D<1,2,4>}, - {0x212,&rGridFuncToQuad2D<1,3,6>}, - {0x213,&rGridFuncToQuad2D<1,4,8>}, - {0x214,&rGridFuncToQuad2D<1,5,10>}, - {0x215,&rGridFuncToQuad2D<1,6,12>}, - {0x216,&rGridFuncToQuad2D<1,7,14>}, - {0x217,&rGridFuncToQuad2D<1,8,16>}, - 
{0x218,&rGridFuncToQuad2D<1,9,18>}, - {0x219,&rGridFuncToQuad2D<1,10,20>}, - {0x21A,&rGridFuncToQuad2D<1,11,22>}, - {0x21B,&rGridFuncToQuad2D<1,12,24>}, - {0x21C,&rGridFuncToQuad2D<1,13,26>}, - {0x21D,&rGridFuncToQuad2D<1,14,28>}, - {0x21E,&rGridFuncToQuad2D<1,15,30>}, - {0x21F,&rGridFuncToQuad2D<1,16,32>}, - - // 3D - {0x310,&rGridFuncToQuad3D<1,1,2>}, - {0x311,&rGridFuncToQuad3D<1,2,4>}, - {0x312,&rGridFuncToQuad3D<1,3,6>}, - {0x313,&rGridFuncToQuad3D<1,4,8>}, - {0x314,&rGridFuncToQuad3D<1,5,10>}, - {0x315,&rGridFuncToQuad3D<1,6,12>}, - {0x316,&rGridFuncToQuad3D<1,7,14>}, - {0x317,&rGridFuncToQuad3D<1,8,16>}, - {0x318,&rGridFuncToQuad3D<1,9,18>}, - {0x319,&rGridFuncToQuad3D<1,10,20>}, - {0x31A,&rGridFuncToQuad3D<1,11,22>}, - {0x31B,&rGridFuncToQuad3D<1,12,24>}, - {0x31C,&rGridFuncToQuad3D<1,13,26>}, - {0x31D,&rGridFuncToQuad3D<1,14,28>}, - {0x31E,&rGridFuncToQuad3D<1,15,30>}, - {0x31F,&rGridFuncToQuad3D<1,16,32>}, - }; - if (!call[id]) - { - printf("\n[rGridFuncToQuad] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](numElements,dofToQuad,l2gMap,gf,out); -} diff --git a/raja/raja/kernels/quad/rQDataInit.cpp b/raja/raja/kernels/quad/rQDataInit.cpp deleted file mode 100644 index ec7579c6..00000000 --- a/raja/raja/kernels/quad/rQDataInit.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#include "../raja.hpp" - - -// ***************************************************************************** -template -void rInitQuadData( - const int nzones, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - forall(el,nzones, - { - for (int q = 0; q < NUM_QUAD; ++q) - { - rho0DetJ0w[ijN(q,el,NUM_QUAD)] = - rho0[ijN(q,el,NUM_QUAD)]*detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - } - }); -} -typedef void (*fInitQuadratureData)(const int,const double*,const double*, - const double*,double*); -void rInitQuadratureData(const int NUM_QUAD, - const int numElements, - const double* restrict rho0, - const double* restrict detJ, - const double* restrict quadWeights, - double* restrict rho0DetJ0w) -{ - const unsigned int id = NUM_QUAD; - static std::unordered_map call = - { - {2,&rInitQuadData<2>}, - {4,&rInitQuadData<4>}, - {8,&rInitQuadData<8>}, - {16,&rInitQuadData<16>}, - {25,&rInitQuadData<25>}, - {36,&rInitQuadData<36>}, - {49,&rInitQuadData<49>}, - {64,&rInitQuadData<64>}, - {81,&rInitQuadData<81>}, - {100,&rInitQuadData<100>}, - {121,&rInitQuadData<121>}, - {125,&rInitQuadData<125>}, - {144,&rInitQuadData<144>}, - {196,&rInitQuadData<196>}, - {216,&rInitQuadData<216>}, - {256,&rInitQuadData<256>}, - {324,&rInitQuadData<324>}, - {400,&rInitQuadData<400>}, - {484,&rInitQuadData<484>}, - {512,&rInitQuadData<512>}, - {576,&rInitQuadData<576>}, - {676,&rInitQuadData<676>}, - {900,&rInitQuadData<900>}, - {1000,&rInitQuadData<1000>}, - {1024,&rInitQuadData<1024>}, - {1728,&rInitQuadData<1728>}, - {2744,&rInitQuadData<2744>}, - 
{4096,&rInitQuadData<4096>}, - {5832,&rInitQuadData<5832>}, - {8000,&rInitQuadData<8000>}, - {10648,&rInitQuadData<10648>}, - {13824,&rInitQuadData<13824>}, - {17576,&rInitQuadData<17576>}, - {21952,&rInitQuadData<21952>}, - {27000,&rInitQuadData<27000>}, - {32768,&rInitQuadData<32768>}, - }; - if (!call[id]) - { - printf("\n[rInitQuadratureData] id \033[33m0x%X (%d)\033[m ",id,id); - fflush(stdout); - } - assert(call[id]); - call[id](numElements,rho0,detJ,quadWeights,rho0DetJ0w); -} diff --git a/raja/raja/kernels/quad/rQDataUpdate.cpp b/raja/raja/kernels/quad/rQDataUpdate.cpp deleted file mode 100644 index bd0fc1f4..00000000 --- a/raja/raja/kernels/quad/rQDataUpdate.cpp +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#include "../raja.hpp" - -// ***************************************************************************** -template -void rUpdateQuadratureData2D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int VDIMQ = NUM_DIM*NUM_DIM * NUM_QUAD_2D; - forall(el,numElements, - { - double s_gradv[VDIMQ]; - for (int i = 0; i < VDIMQ; ++i) s_gradv[i] = 0.0; - - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[NUM_DIM*NUM_QUAD_1D]; - double vx[NUM_DIM*NUM_QUAD_1D]; - - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < NUM_DIM; ++c) - { - vDx[ijN(c,qx,NUM_DIM)] = 0.0; - vx[ijN(c,qx,NUM_DIM)] = 0.0; - } - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const double wx = dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - const double wDx = dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - for (int c = 0; c < NUM_DIM; ++c) - { - vDx[ijN(c,qx,NUM_DIM)] += wDx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - vx[ijN(c,qx,NUM_DIM)] += wx * v[_ijklNM(c,dx,dy,el,NUM_DOFS_1D,numElements)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int c = 0; c < NUM_DIM; ++c) - { - s_gradv[ijkN(c,0,qx+qy*NUM_QUAD_1D,NUM_DIM)] += wy *vDx[ijN(c,qx,NUM_DIM)]; - s_gradv[ijkN(c,1,qx+qy*NUM_QUAD_1D,NUM_DIM)] += wDy*vx[ijN(c,qx,NUM_DIM)]; - } - } - } - } - - for (int q = 0; q < 
NUM_QUAD; ++q) - { - double q_gradv[NUM_DIM*NUM_DIM]; - double q_stress[NUM_DIM*NUM_DIM]; - - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - - q_gradv[ijN(0,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_00)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_01)); - q_gradv[ijN(1,0,2)] = ((s_gradv[ijkN(0,0,q,2)]*invJ_10)+(s_gradv[ijkN(1,0,q, - 2)]*invJ_11)); - q_gradv[ijN(0,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_00)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_01)); - q_gradv[ijN(1,1,2)] = ((s_gradv[ijkN(0,1,q,2)]*invJ_10)+(s_gradv[ijkN(1,1,q, - 2)]*invJ_11)); - - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - - // TODO: Input OccaVector eos(q,e) -> (stress,soundSpeed) - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,2)] = s; q_stress[ijN(1,0,2)] = 0; - q_stress[ijN(0,1,2)] = 0; q_stress[ijN(1,1,2)] = s; - - const double gradv00 = q_gradv[ijN(0,0,2)]; - const double gradv11 = q_gradv[ijN(1,1,2)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,2)]+q_gradv[ijN(0,1,2)]); - q_gradv[ijN(1,0,2)] = gradv10; - q_gradv[ijN(0,1,2)] = gradv10; - - double comprDirX = 1; - double comprDirY = 0; - double minEig = 0; - // linalg/densemat.cpp: Eigensystem2S() - if (gradv10 == 0) - { - minEig = (gradv00 < gradv11) ? 
gradv00 : gradv11; - } - else - { - const double zeta = (gradv11-gradv00) / (2.0*gradv10); - const double azeta = fabs(zeta); - double t = 1.0 / (azeta+sqrt(1.0+zeta*zeta)); - if ((t < 0) != (zeta < 0)) - { - t = -t; - } - const double c = sqrt(1.0 / (1.0+t*t)); - const double s = c*t; - t *= gradv10; - if ((gradv00-t) <= (gradv11+t)) - { - minEig = gradv00-t; - comprDirX = c; - comprDirY = -s; - } - else - { - minEig = gradv11+t; - comprDirX = s; - comprDirY = c; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)); - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)); - const double physDirX = (Jpi_00*comprDirX)+(Jpi_10*comprDirY); - const double physDirY = (Jpi_01*comprDirX)+(Jpi_11*comprDirY); - const double q_h = H0*sqrt((physDirX*physDirX)+(physDirY*physDirY)); - // TODO: soundSpeed will be an input as well (function call or values per q) - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - //printf("\ndt_est=%.15e",q_h); - //printf("\ndt_est=%.15e",dtEst[ijN(q,el)]); - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < NUM_DIM; 
++y) - { - for (int x = 0; x < NUM_DIM; ++x) - { - q_stress[ijN(x,y,2)] += coeff*q_gradv[ijN(x,y,2)]; - } - } - } - const double S00 = q_stress[ijN(0,0,2)]; - const double S10 = q_stress[ijN(1,0,2)]; - const double S01 = q_stress[ijN(0,1,2)]; - const double S11 = q_stress[ijN(1,1,2)]; - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)); - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)); - } - } - ); -} - -// ***************************************************************************** -template -void rUpdateQuadratureData3D(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - const int NUM_QUAD_2D = NUM_QUAD_1D*NUM_QUAD_1D; - const int NUM_QUAD_3D = NUM_QUAD_1D*NUM_QUAD_1D*NUM_QUAD_1D; - forall(el,numElements, - { - double s_gradv[9*NUM_QUAD_3D]; - for (int i = 0; i < (9*NUM_QUAD_3D); ++i) - { - s_gradv[i] = 0; - } - - for (int dz = 0; dz < NUM_DOFS_1D; ++dz) - { - double vDxy[3*NUM_QUAD_2D] ; - double vxDy[3*NUM_QUAD_2D] ; - double vxy[3*NUM_QUAD_2D] ; - for (int i = 0; i < (3*NUM_QUAD_2D); ++i) - { - vDxy[i] = 0; - vxDy[i] = 0; - vxy[i] = 0; - } - for (int dy = 0; dy < NUM_DOFS_1D; ++dy) - { - double vDx[3*NUM_QUAD_1D] ; - double vx[3*NUM_QUAD_1D] ; - for (int i = 0; i < (3*NUM_QUAD_1D); ++i) - { - vDx[i] = 0; - vx[i] = 0; - } - for (int dx = 0; dx < NUM_DOFS_1D; ++dx) - { 
- for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuadD[ijN(qx,dx,NUM_QUAD_1D)]; - vx[ijN(vi,qx,3)] += v[_ijklmNM(vi,dx,dy,dz,el,NUM_DOFS_1D, - numElements)]*dofToQuad[ijN(qx,dx,NUM_QUAD_1D)]; - } - } - } - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - const double wy = dofToQuad[ijN(qy,dy,NUM_QUAD_1D)]; - const double wDy = dofToQuadD[ijN(qy,dy,NUM_QUAD_1D)]; - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - for (int vi = 0; vi < 3; ++vi) - { - vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vDx[ijN(vi,qx,3)]; - vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wDy*vx[ijN(vi,qx,3)]; - vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)] += wy *vx[ijN(vi,qx,3)]; - } - } - } - } - for (int qz = 0; qz < NUM_DOFS_1D; ++qz) - { - const double wz = dofToQuad[ijN(qz,dz,NUM_QUAD_1D)]; - const double wDz = dofToQuadD[ijN(qz,dz,NUM_QUAD_1D)]; - for (int qy = 0; qy < NUM_QUAD_1D; ++qy) - { - for (int qx = 0; qx < NUM_QUAD_1D; ++qx) - { - const int q = qx+qy*NUM_QUAD_1D+qz*NUM_QUAD_2D; - for (int vi = 0; vi < 3; ++vi) - { - s_gradv[ijkN(vi,0,q,3)] += wz *vDxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,1,q,3)] += wz *vxDy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - s_gradv[ijkN(vi,2,q,3)] += wDz*vxy[ijkNM(vi,qx,qy,3,NUM_QUAD_1D)]; - } - } - } - } - } - for (int q = 0; q < NUM_QUAD; ++q) - { - double q_gradv[9]; - double q_stress[9]; - const double invJ_00 = invJ[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_10 = invJ[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_20 = invJ[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_01 = invJ[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_11 = invJ[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_21 = invJ[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_02 = invJ[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_12 = invJ[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ_22 = 
invJ[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - q_gradv[ijN(0,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_00) + - (s_gradv[ijkN(1,0,q,3)]*invJ_01) + - (s_gradv[ijkN(2,0,q,3)]*invJ_02)); - q_gradv[ijN(1,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_10) + - (s_gradv[ijkN(1,0,q,3)]*invJ_11) + - (s_gradv[ijkN(2,0,q,3)]*invJ_12)); - q_gradv[ijN(2,0,3)] = ((s_gradv[ijkN(0,0,q,3)]*invJ_20) + - (s_gradv[ijkN(1,0,q,3)]*invJ_21) + - (s_gradv[ijkN(2,0,q,3)]*invJ_22)); - q_gradv[ijN(0,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_00) + - (s_gradv[ijkN(1,1,q,3)]*invJ_01) + - (s_gradv[ijkN(2,1,q,3)]*invJ_02)); - q_gradv[ijN(1,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_10) + - (s_gradv[ijkN(1,1,q,3)]*invJ_11) + - (s_gradv[ijkN(2,1,q,3)]*invJ_12)); - q_gradv[ijN(2,1,3)] = ((s_gradv[ijkN(0,1,q,3)]*invJ_20) + - (s_gradv[ijkN(1,1,q,3)]*invJ_21) + - (s_gradv[ijkN(2,1,q,3)]*invJ_22)); - q_gradv[ijN(0,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_00) + - (s_gradv[ijkN(1,2,q,3)]*invJ_01) + - (s_gradv[ijkN(2,2,q,3)]*invJ_02)); - q_gradv[ijN(1,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_10) + - (s_gradv[ijkN(1,2,q,3)]*invJ_11) + - (s_gradv[ijkN(2,2,q,3)]*invJ_12)); - q_gradv[ijN(2,2,3)] = ((s_gradv[ijkN(0,2,q,3)]*invJ_20) + - (s_gradv[ijkN(1,2,q,3)]*invJ_21) + - (s_gradv[ijkN(2,2,q,3)]*invJ_22)); - const double q_Jw = detJ[ijN(q,el,NUM_QUAD)]*quadWeights[q]; - const double q_rho = rho0DetJ0w[ijN(q,el,NUM_QUAD)] / q_Jw; - const double q_e = fmax(0.0,e[ijN(q,el,NUM_QUAD)]); - const double s = -(GAMMA-1.0)*q_rho*q_e; - q_stress[ijN(0,0,3)] = s; q_stress[ijN(1,0,3)] = 0; q_stress[ijN(2,0,3)] = 0; - q_stress[ijN(0,1,3)] = 0; q_stress[ijN(1,1,3)] = s; q_stress[ijN(2,1,3)] = 0; - q_stress[ijN(0,2,3)] = 0; q_stress[ijN(1,2,3)] = 0; q_stress[ijN(2,2,3)] = s; - const double gradv00 = q_gradv[ijN(0,0,3)]; - const double gradv11 = q_gradv[ijN(1,1,3)]; - const double gradv22 = q_gradv[ijN(2,2,3)]; - const double gradv10 = 0.5*(q_gradv[ijN(1,0,3)]+q_gradv[ijN(0,1,3)]); - const double gradv20 = 0.5*(q_gradv[ijN(2,0,3)]+q_gradv[ijN(0,2,3)]); - const 
double gradv21 = 0.5*(q_gradv[ijN(2,1,3)]+q_gradv[ijN(1,2,3)]); - q_gradv[ijN(1,0,3)] = gradv10; q_gradv[ijN(2,0,3)] = gradv20; - q_gradv[ijN(0,1,3)] = gradv10; q_gradv[ijN(2,1,3)] = gradv21; - q_gradv[ijN(0,2,3)] = gradv20; q_gradv[ijN(1,2,3)] = gradv21; - double minEig = 0; - double comprDirX = 1; - double comprDirY = 0; - double comprDirZ = 0; - { - // Compute eigenvalues using quadrature formula - const double q_ = (gradv00+gradv11+gradv22) / 3.0; - const double gradv_q00 = (gradv00-q_); - const double gradv_q11 = (gradv11-q_); - const double gradv_q22 = (gradv22-q_); - - const double p1 = ((gradv10*gradv10) + - (gradv20*gradv20) + - (gradv21*gradv21)); - const double p2 = ((gradv_q00*gradv_q00) + - (gradv_q11*gradv_q11) + - (gradv_q22*gradv_q22) + - (2.0*p1)); - const double p = sqrt(p2 / 6.0); - const double pinv = 1.0 / p; - // det(pinv*(gradv-q*I)) - const double r = (0.5*pinv*pinv*pinv * - ((gradv_q00*gradv_q11*gradv_q22) + - (2.0*gradv10*gradv21*gradv20) - - (gradv_q11*gradv20*gradv20) - - (gradv_q22*gradv10*gradv10) - - (gradv_q00*gradv21*gradv21))); - - double phi = 0; - if (r <= -1.0) - { - phi = M_PI / 3.0; - } - else if (r < 1.0) - { - phi = acos(r) / 3.0; - } - - minEig = q_+(2.0*p*cos(phi+(2.0*M_PI / 3.0))); - const double eig3 = q_+(2.0*p*cos(phi)); - const double eig2 = 3.0*q_-minEig-eig3; - double maxNorm = 0; - - for (int i = 0; i < 3; ++i) - { - const double x = q_gradv[i+3*0]-(i == 0)*eig3; - const double y = q_gradv[i+3*1]-(i == 1)*eig3; - const double z = q_gradv[i+3*2]-(i == 2)*eig3; - const double cx = ((x*(gradv00-eig2)) + - (y*gradv10) + - (z*gradv20)); - const double cy = ((x*gradv10) + - (y*(gradv11-eig2)) + - (z*gradv21)); - const double cz = ((x*gradv20) + - (y*gradv21) + - (z*(gradv22-eig2))); - const double cNorm = (cx*cx+cy*cy+cz*cz); - //#warning 1e-16 to 1 - if ((cNorm > 1.e-16) && (maxNorm < cNorm)) - { - comprDirX = cx; - comprDirY = cy; - comprDirZ = cz; - maxNorm = cNorm; - } - } - //#warning 1e-16 to 1 - if (maxNorm > 
1.e-16) - { - const double maxNormInv = 1.0 / sqrt(maxNorm); - comprDirX *= maxNormInv; - comprDirY *= maxNormInv; - comprDirZ *= maxNormInv; - } - } - - // Computes the initial->physical transformation Jacobian. - const double J_00 = J[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_10 = J[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_20 = J[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double J_01 = J[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_11 = J[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_21 = J[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double J_02 = J[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_12 = J[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double J_22 = J[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double invJ0_00 = invJ0[ijklNM(0,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_10 = invJ0[ijklNM(1,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_20 = invJ0[ijklNM(2,0,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_01 = invJ0[ijklNM(0,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_11 = invJ0[ijklNM(1,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_21 = invJ0[ijklNM(2,1,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_02 = invJ0[ijklNM(0,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_12 = invJ0[ijklNM(1,2,q,el,NUM_DIM,NUM_QUAD)]; - const double invJ0_22 = invJ0[ijklNM(2,2,q,el,NUM_DIM,NUM_QUAD)]; - - const double Jpi_00 = ((J_00*invJ0_00)+(J_10*invJ0_01)+(J_20*invJ0_02)); - const double Jpi_10 = ((J_00*invJ0_10)+(J_10*invJ0_11)+(J_20*invJ0_12)); - const double Jpi_20 = ((J_00*invJ0_20)+(J_10*invJ0_21)+(J_20*invJ0_22)); - - const double Jpi_01 = ((J_01*invJ0_00)+(J_11*invJ0_01)+(J_21*invJ0_02)); - const double Jpi_11 = ((J_01*invJ0_10)+(J_11*invJ0_11)+(J_21*invJ0_12)); - const double Jpi_21 = ((J_01*invJ0_20)+(J_11*invJ0_21)+(J_21*invJ0_22)); - - const double Jpi_02 = ((J_02*invJ0_00)+(J_12*invJ0_01)+(J_22*invJ0_02)); - const double Jpi_12 = ((J_02*invJ0_10)+(J_12*invJ0_11)+(J_22*invJ0_12)); - const double Jpi_22 = 
((J_02*invJ0_20)+(J_12*invJ0_21)+(J_22*invJ0_22)); - - const double physDirX = ((Jpi_00*comprDirX)+(Jpi_10*comprDirY)+ - (Jpi_20*comprDirZ)); - const double physDirY = ((Jpi_01*comprDirX)+(Jpi_11*comprDirY)+ - (Jpi_21*comprDirZ)); - const double physDirZ = ((Jpi_02*comprDirX)+(Jpi_12*comprDirY)+ - (Jpi_22*comprDirZ)); - - const double q_h = H0*sqrt((physDirX*physDirX)+ - (physDirY*physDirY)+ - (physDirZ*physDirZ)); - - const double soundSpeed = sqrt(GAMMA*(GAMMA-1.0)*q_e); - dtEst[ijN(q,el,NUM_QUAD)] = CFL*q_h / soundSpeed; - - if (USE_VISCOSITY) - { - // TODO: Check how we can extract outside of kernel - const double mu = minEig; - double coeff = 2.0*q_rho*q_h*q_h*fabs(mu); - if (mu < 0) - { - coeff += 0.5*q_rho*q_h*soundSpeed; - } - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - q_stress[ijN(x,y,3)] += coeff*q_gradv[ijN(x,y,3)]; - } - } - } - - const double S00 = q_stress[ijN(0,0,3)]; - const double S10 = q_stress[ijN(1,0,3)]; - const double S20 = q_stress[ijN(2,0,3)]; - const double S01 = q_stress[ijN(0,1,3)]; - const double S11 = q_stress[ijN(1,1,3)]; - const double S21 = q_stress[ijN(2,1,3)]; - const double S02 = q_stress[ijN(0,2,3)]; - const double S12 = q_stress[ijN(1,2,3)]; - const double S22 = q_stress[ijN(2,2,3)]; - - stressJinvT[ijklNM(0,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_00)+(S10*invJ_01)+(S20*invJ_02)); - stressJinvT[ijklNM(1,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_10)+(S10*invJ_11)+(S20*invJ_12)); - stressJinvT[ijklNM(2,0,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S00*invJ_20)+(S10*invJ_21)+(S20*invJ_22)); - - stressJinvT[ijklNM(0,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_00)+(S11*invJ_01)+(S21*invJ_02)); - stressJinvT[ijklNM(1,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_10)+(S11*invJ_11)+(S21*invJ_12)); - stressJinvT[ijklNM(2,1,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S01*invJ_20)+(S11*invJ_21)+(S21*invJ_22)); - - stressJinvT[ijklNM(0,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_00)+(S12*invJ_01)+(S22*invJ_02)); 
- stressJinvT[ijklNM(1,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_10)+(S12*invJ_11)+(S22*invJ_12)); - stressJinvT[ijklNM(2,2,q,el,NUM_DIM, - NUM_QUAD)] = q_Jw*((S02*invJ_20)+(S12*invJ_21)+(S22*invJ_22)); - } - } - ); -} - -// ***************************************************************************** -typedef void (*fUpdateQuadratureData)(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int numElements, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst); - -// ***************************************************************************** -void rUpdateQuadratureData(const double GAMMA, - const double H0, - const double CFL, - const bool USE_VISCOSITY, - const int NUM_DIM, - const int NUM_QUAD, - const int NUM_QUAD_1D, - const int NUM_DOFS_1D, - const int nzones, - const double* restrict dofToQuad, - const double* restrict dofToQuadD, - const double* restrict quadWeights, - const double* restrict v, - const double* restrict e, - const double* restrict rho0DetJ0w, - const double* restrict invJ0, - const double* restrict J, - const double* restrict invJ, - const double* restrict detJ, - double* restrict stressJinvT, - double* restrict dtEst) -{ - assert(LOG2(NUM_DIM)<=4); - assert(LOG2(NUM_DOFS_1D-2)<=4); - assert(NUM_QUAD_1D==2*(NUM_DOFS_1D-1)); - assert(IROOT(NUM_DIM,NUM_QUAD)==NUM_QUAD_1D); - const unsigned int id = (NUM_DIM<<4)|(NUM_DOFS_1D-2); - static std::unordered_map call = - { - // 2D - {0x20,&rUpdateQuadratureData2D<2,2*2,2,2>}, - {0x21,&rUpdateQuadratureData2D<2,4*4,4,3>}, - {0x22,&rUpdateQuadratureData2D<2,6*6,6,4>}, - {0x23,&rUpdateQuadratureData2D<2,8*8,8,5>}, - 
{0x24,&rUpdateQuadratureData2D<2,10*10,10,6>}, - {0x25,&rUpdateQuadratureData2D<2,12*12,12,7>}, - {0x26,&rUpdateQuadratureData2D<2,14*14,14,8>}, - {0x27,&rUpdateQuadratureData2D<2,16*16,16,9>}, - {0x28,&rUpdateQuadratureData2D<2,18*18,18,10>}, - {0x29,&rUpdateQuadratureData2D<2,20*20,20,11>}, - {0x2A,&rUpdateQuadratureData2D<2,22*22,22,12>}, - {0x2B,&rUpdateQuadratureData2D<2,24*24,24,13>}, - {0x2C,&rUpdateQuadratureData2D<2,26*26,26,14>}, - {0x2D,&rUpdateQuadratureData2D<2,28*28,28,15>}, - {0x2E,&rUpdateQuadratureData2D<2,30*30,30,16>}, - {0x2F,&rUpdateQuadratureData2D<2,32*32,32,17>}, - // 3D - {0x30,&rUpdateQuadratureData3D<3,2*2*2,2,2>}, - {0x31,&rUpdateQuadratureData3D<3,4*4*4,4,3>}, - {0x32,&rUpdateQuadratureData3D<3,6*6*6,6,4>}, - {0x33,&rUpdateQuadratureData3D<3,8*8*8,8,5>}, - {0x34,&rUpdateQuadratureData3D<3,10*10*10,10,6>}, - {0x35,&rUpdateQuadratureData3D<3,12*12*12,12,7>}, - {0x36,&rUpdateQuadratureData3D<3,14*14*14,14,8>}, - {0x37,&rUpdateQuadratureData3D<3,16*16*16,16,9>}, - {0x38,&rUpdateQuadratureData3D<3,18*18*18,18,10>}, - {0x39,&rUpdateQuadratureData3D<3,20*20*20,20,11>}, - {0x3A,&rUpdateQuadratureData3D<3,22*22*22,22,12>}, - {0x3B,&rUpdateQuadratureData3D<3,24*24*24,24,13>}, - {0x3C,&rUpdateQuadratureData3D<3,26*26*26,26,14>}, - {0x3D,&rUpdateQuadratureData3D<3,28*28*28,28,15>}, - {0x3E,&rUpdateQuadratureData3D<3,30*30*30,30,16>}, - {0x3F,&rUpdateQuadratureData3D<3,32*32*32,32,17>}, - }; - if (!call[id]) - { - printf("\n[rUpdateQuadratureData] id \033[33m0x%X\033[m ",id); - fflush(stdout); - } - assert(call[id]); - call[id](GAMMA,H0,CFL,USE_VISCOSITY, - nzones,dofToQuad,dofToQuadD,quadWeights, - v,e,rho0DetJ0w,invJ0,J,invJ,detJ, - stressJinvT,dtEst); -} diff --git a/raja/raja/kernels/raja.hpp b/raja/raja/kernels/raja.hpp deleted file mode 100644 index 54b87a20..00000000 --- a/raja/raja/kernels/raja.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_KERNELS_RAJA -#define LAGHOS_RAJA_KERNELS_RAJA - -// ***************************************************************************** -#include -#include -#include -#include -#include -#include -#include - -// ***************************************************************************** -#define LOG2(X) ((unsigned) (8*sizeof(unsigned long long)-__builtin_clzll((X)))) -#define ISQRT(N) static_cast(sqrt(static_cast(N))) -#define ICBRT(N) static_cast(cbrt(static_cast(N))) -#define IROOT(D,N) ((D==1)?N:(D==2)?ISQRT(N):(D==3)?ICBRT(N):0) - -// ***************************************************************************** -#include "RAJA/RAJA.hpp" - -// ***************************************************************************** -#include "../config/rconfig.hpp" -#include "../general/rmemcpy.hpp" -#include "../general/rmalloc.hpp" - -// ***************************************************************************** -#include "include/forall.hpp" -#include "include/offsets.hpp" -#include "include/kernels.hpp" - -#endif // LAGHOS_RAJA_KERNELS_RAJA diff 
--git a/raja/raja/linalg/rode.hpp b/raja/raja/linalg/rode.hpp deleted file mode 100644 index aa24d9fe..00000000 --- a/raja/raja/linalg/rode.hpp +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_ODE -#define LAGHOS_RAJA_ODE - -namespace mfem -{ - -// *************************************************************************** -class RajaODESolver -{ -protected: - RajaTimeDependentOperator *f; -public: - RajaODESolver() : f(NULL) {} - virtual ~RajaODESolver() {} - virtual void Init(RajaTimeDependentOperator &f) { this->f = &f; } - virtual void Step(RajaVector &x, double &t, double &dt) =0; -}; - -// *************************************************************************** -class RajaForwardEulerSolver : public RajaODESolver -{ -private: - RajaVector dxdt; -public: - void Init(RajaTimeDependentOperator &_f) - { - f = &_f; - dxdt.SetSize(f->Width()); - } - void Step(RajaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, dxdt); - x.Add(dt, dxdt); - t += dt; - } -}; - -// *************************************************************************** -class RajaRK2Solver : public RajaODESolver -{ -private: - double a; - RajaVector dxdt, x1; -public: - RajaRK2Solver(const double _a = 2./3.) : a(_a) { } - void Init(RajaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - dxdt.SetSize(n); - x1.SetSize(n); - } - void Step(RajaVector &x, double &t, double &dt) - { - const double b = 0.5/a; - f->SetTime(t); - f->Mult(x, dxdt); - add(x, (1. 
- b)*dt, dxdt, x1); - x.Add(a*dt, dxdt); - f->SetTime(t + a*dt); - f->Mult(x, dxdt); - add(x1, b*dt, dxdt, x); - t += dt; - } -}; - -// *************************************************************************** -class RajaRK3SSPSolver : public RajaODESolver -{ -private: - RajaVector y, k; -public: - void Init(RajaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - k.SetSize(n); - } - void Step(RajaVector &x, double &t, double &dt) - { - // x0 = x, t0 = t, k0 = dt*f(t0, x0) - f->SetTime(t); - f->Mult(x, k); - // x1 = x + k0, t1 = t + dt, k1 = dt*f(t1, x1) - add(x, dt, k, y); - f->SetTime(t + dt); - f->Mult(y, k); - // x2 = 3/4*x + 1/4*(x1 + k1), t2 = t + 1/2*dt, k2 = dt*f(t2, x2) - y.Add(dt, k); - add(3./4, x, 1./4, y, y); - f->SetTime(t + dt/2); - f->Mult(y, k); - // x3 = 1/3*x + 2/3*(x2 + k2), t3 = t + dt - y.Add(dt, k); - add(1./3, x, 2./3, y, x); - t += dt; - } -}; - -// *************************************************************************** -class RajaRK4Solver : public RajaODESolver -{ -private: - RajaVector y, k, z; -public: - void Init(RajaTimeDependentOperator &_f) - { - f = &_f; - int n = RajaODESolver::f->Width(); - y.SetSize(n); - k.SetSize(n); - z.SetSize(n); - } - - void Step(RajaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k); // k1 - - add(x, dt/2, k, y); - add(x, dt/6, k, z); - - f->SetTime(t + dt/2); - - f->Mult(y, k); // k2 - - add(x, dt/2, k, y); - z.Add(dt/3, k); - - f->Mult(y, k); // k3 - - add(x, dt, k, y); - z.Add(dt/3, k); - f->SetTime(t + dt); - - f->Mult(y, k); // k4 - - add(z, dt/6, k, x); - - t += dt; - } -}; - -// *************************************************************************** -class RajaExplicitRKSolver : public RajaODESolver -{ -private: - int s; - const double *a, *b, *c; - RajaVector y, *k; -public: - RajaExplicitRKSolver(int _s, const double *_a, - const double *_b, const double *_c) - { - s = _s; - a = _a; - b = _b; - c = _c; - k = new RajaVector[s]; - } - void 
Init(RajaTimeDependentOperator &_f) - { - f = &_f; - int n = f->Width(); - y.SetSize(n); - for (int i = 0; i < s; i++) - { - k[i].SetSize(n); - } - } - void Step(RajaVector &x, double &t, double &dt) - { - f->SetTime(t); - f->Mult(x, k[0]); - for (int l = 0, i = 1; i < s; i++) - { - add(x, a[l++]*dt, k[0], y); - for (int j = 1; j < i; j++) - { - y.Add(a[l++]*dt, k[j]); - } - f->SetTime(t + c[i-1]*dt); - f->Mult(y, k[i]); - } - for (int i = 0; i < s; i++) - { - x.Add(b[i]*dt, k[i]); - } - t += dt; - } - ~RajaExplicitRKSolver() - { - delete [] k; - } -}; - -// *************************************************************************** -// *************************************************************************** -static const double RK6_a[28] = -{ - .6e-1, - .1923996296296296296296296296296296296296e-1, - .7669337037037037037037037037037037037037e-1, - .35975e-1, - 0., - .107925, - 1.318683415233148260919747276431735612861, - 0., - -5.042058063628562225427761634715637693344, - 4.220674648395413964508014358283902080483, - -41.87259166432751461803757780644346812905, - 0., - 159.4325621631374917700365669070346830453, - -122.1192135650100309202516203389242140663, - 5.531743066200053768252631238332999150076, - -54.43015693531650433250642051294142461271, - 0., - 207.0672513650184644273657173866509835987, - -158.6108137845899991828742424365058599469, - 6.991816585950242321992597280791793907096, - -.1859723106220323397765171799549294623692e-1, - -54.66374178728197680241215648050386959351, - 0., - 207.9528062553893734515824816699834244238, - -159.2889574744995071508959805871426654216, - 7.018743740796944434698170760964252490817, - -.1833878590504572306472782005141738268361e-1, - -.5119484997882099077875432497245168395840e-3 -}; - -static const double RK6_b[8] = -{ - .3438957868357036009278820124728322386520e-1, - 0., - 0., - .2582624555633503404659558098586120858767, - .4209371189673537150642551514069801967032, - 4.405396469669310170148836816197095664891, - 
-176.4831190242986576151740942499002125029, - 172.3641334014150730294022582711902413315 -}; - -static const double RK6_c[7] = -{ - .6e-1, - .9593333333333333333333333333333333333333e-1, - .1439, - .4973, - .9725, - .9995, - 1., -}; - -class RajaRK6Solver : public RajaExplicitRKSolver -{ -public: - RajaRK6Solver() : RajaExplicitRKSolver(8, RK6_a, RK6_b, RK6_c) { } -}; - -} // mfem - -#endif // LAGHOS_RAJA_ODE diff --git a/raja/raja/linalg/roperator.hpp b/raja/raja/linalg/roperator.hpp deleted file mode 100644 index e2661610..00000000 --- a/raja/raja/linalg/roperator.hpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA_OPERATOR -#define LAGHOS_RAJA_OPERATOR - -namespace mfem -{ - -// *************************************************************************** -class RajaOperator : public rmemcpy -{ -protected: - int height; - int width; -public: - explicit RajaOperator(int s = 0) { height = width = s; } - RajaOperator(int h, int w) { height = h; width = w; } - inline int Height() const { return height; } - inline int Width() const { return width; } - virtual void Mult(const RajaVector &x, RajaVector &y) const { assert(false); }; - virtual void MultTranspose(const RajaVector &x, RajaVector &y) const { assert(false); } - virtual const RajaOperator *GetProlongation() const { assert(false); return NULL; } - virtual const RajaOperator *GetRestriction() const { assert(false); return NULL; } - virtual void RecoverFEMSolution(const RajaVector &X, - const RajaVector &b, - RajaVector &x) {assert(false);} -}; - - -// *************************************************************************** -class RajaTimeDependentOperator : public RajaOperator -{ -private: - double t; -public: - explicit RajaTimeDependentOperator(int n = 0, - double t_ = 0.0) : RajaOperator(n), t(t_) {} - void SetTime(const double _t) { t = _t; } -}; - -// *************************************************************************** -class RajaSolverOperator : public RajaOperator -{ -public: - bool iterative_mode; - explicit RajaSolverOperator(int s = 0, - bool iter_mode = false) : - RajaOperator(s), - iterative_mode(iter_mode) { } - virtual void SetOperator(const RajaOperator &op) = 0; -}; - -// *************************************************************************** -class RajaRAPOperator : public RajaOperator -{ -private: - const RajaOperator &Rt; - const RajaOperator &A; - const RajaOperator &P; - mutable RajaVector Px; - mutable RajaVector APx; -public: - /// Construct the RAP operator given R^T, A and P. 
- RajaRAPOperator(const RajaOperator &Rt_, const RajaOperator &A_, - const RajaOperator &P_) - : RajaOperator(Rt_.Width(), P_.Width()), Rt(Rt_), A(A_), P(P_), - Px(P.Height()), APx(A.Height()) { } - /// Operator application. - void Mult(const RajaVector & x, RajaVector & y) const - { - P.Mult(x, Px); - A.Mult(Px, APx); - Rt.MultTranspose(APx, y); - } - /// Application of the transpose. - void MultTranspose(const RajaVector & x, RajaVector & y) const - { - Rt.Mult(x, APx); - A.MultTranspose(APx, Px); - P.MultTranspose(Px, y); - } -}; - -} // mfem - -#endif // LAGHOS_RAJA_OPERATOR diff --git a/raja/raja/linalg/rsolvers.cpp b/raja/raja/linalg/rsolvers.cpp deleted file mode 100644 index c60d1401..00000000 --- a/raja/raja/linalg/rsolvers.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. 
-#include "../raja.hpp" - -namespace mfem -{ - -// ************************************************************************* -void RajaCGSolver::h_Mult(const RajaVector &b, RajaVector &x) const -{ - int i; - double r0, den, nom, nom0, betanom, alpha, beta; - if (iterative_mode) - { - oper->Mult(x, r); - subtract(b, r, r); // r = b - A x - } - else - { - r = b; - x = 0.0; - } - - if (prec) - { - prec->Mult(r, z); // z = B r - d = z; - } - else - { - d = r; - } - nom0 = nom = Dot(d, r); - MFEM_ASSERT(IsFinite(nom), "nom = " << nom); - - if (print_level == 1 - || print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom << (print_level == 3 ? " ...\n" : "\n"); - } - - r0 = std::max(nom*rel_tol*rel_tol,abs_tol*abs_tol); - - if (nom <= r0) - { - converged = 1; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - oper->Mult(d, z); // z = A d - - den = Dot(z, d); - MFEM_ASSERT(IsFinite(den), "den = " << den); - - if (print_level >= 0 && den < 0.0) - { - mfem::out << "Negative denominator in step 0 of PCG: " << den << '\n'; - } - - if (den == 0.0) - { - converged = 0; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - // start iteration - converged = 0; - final_iter = max_iter; - for (i = 1; true; ) - { - alpha = nom/den; - add(x, alpha, d, x); // x = x + alpha d - add(r, -alpha, z, r); // r = r - alpha A d - - if (prec) - { - prec->Mult(r, z); // z = B r - betanom = Dot(r, z); - } - else - { - betanom = Dot(r, r); - } - MFEM_ASSERT(IsFinite(betanom), "betanom = " << betanom); - - if (print_level == 1) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - - if (betanom < r0) - { - if (print_level == 2) - { - mfem::out << "Number of PCG iterations: " << i << '\n'; - } - else if (print_level == 3) - { - mfem::out << " Iteration : " << std::setw(3) << i << " (B r, r) = " - << betanom << '\n'; - } - converged = 1; - final_iter = i; - break; - } - - if (++i > max_iter) 
- { - break; - } - - beta = betanom/nom; - if (prec) - { - add(z, beta, d, d); // d = z + beta d - } - else - { - add(r, beta, d, d); - } - - oper->Mult(d, z); // z = A d - - den = Dot(d, z); - - MFEM_ASSERT(IsFinite(den), "den = " << den); - if (den <= 0.0) - { - if (print_level >= 0 && Dot(d, d) > 0.0) - mfem::out << "PCG: The operator is not positive definite. (Ad, d) = " - << den << '\n'; - } - nom = betanom; - } - - if (print_level >= 0 && !converged) - { - if (print_level != 1) - { - if (print_level != 3) - { - mfem::out << " Iteration : " << std::setw(3) << 0 << " (B r, r) = " - << nom0 << " ...\n"; - } - mfem::out << " Iteration : " << std::setw(3) << final_iter << " (B r, r) = " - << betanom << '\n'; - } - mfem::out << "PCG: No convergence!" << '\n'; - } - - if (print_level >= 1 || (print_level >= 0 && !converged)) - { - mfem::out << "Average reduction factor = " - << pow (betanom/nom0, 0.5/final_iter) << '\n'; - } - final_norm = sqrt(betanom); -} - -} // mfem diff --git a/raja/raja/linalg/rsolvers.hpp b/raja/raja/linalg/rsolvers.hpp deleted file mode 100644 index 30bd82bf..00000000 --- a/raja/raja/linalg/rsolvers.hpp +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. 
Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_SOLVERS -#define LAGHOS_RAJA_SOLVERS - -#ifdef MFEM_USE_MPI -#include -#endif - -namespace mfem -{ - -// *************************************************************************** -class RajaIterativeSolver : public RajaSolverOperator -{ -#ifdef MFEM_USE_MPI -private: - int dot_prod_type; // 0 - local, 1 - global over 'comm' - MPI_Comm comm; -#endif -protected: - const RajaOperator *oper; - RajaSolverOperator *prec; - int max_iter, print_level; - double rel_tol, abs_tol; - // stats - mutable int final_iter, converged; - mutable double final_norm; - double Dot(const RajaVector &x, - const RajaVector &y) const - { -#ifndef MFEM_USE_MPI - return (x * y); -#else - if (dot_prod_type == 0) - { - return (x * y); - } - double local_dot = (x * y); - double global_dot; - MPI_Allreduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, comm); - return global_dot; -#endif - } - double Norm(const RajaVector &x) const { return sqrt(Dot(x, x)); } -public: - RajaIterativeSolver(): RajaSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; -#ifdef MFEM_USE_MPI - dot_prod_type = 0; -#endif - } - -#ifdef MFEM_USE_MPI - RajaIterativeSolver(MPI_Comm _comm) - : RajaSolverOperator(0, true) - { - oper = NULL; - prec = NULL; - max_iter = 10; - print_level = -1; - rel_tol = abs_tol = 0.0; - dot_prod_type = 1; - comm = _comm; - } -#endif - - void SetRelTol(double rtol) { rel_tol = rtol; } - void SetAbsTol(double atol) { abs_tol = atol; } - void SetMaxIter(int max_it) { max_iter = max_it; } - void SetPrintLevel(int print_lvl) - { -#ifndef MFEM_USE_MPI - 
print_level = print_lvl; -#else - if (dot_prod_type == 0) - { - print_level = print_lvl; - } - else - { - int rank; - MPI_Comm_rank(comm, &rank); - if (rank == 0) - { - print_level = print_lvl; - } - } -#endif - } - int GetNumIterations() const { return final_iter; } - int GetConverged() const { return converged; } - double GetFinalNorm() const { return final_norm; } - /// This should be called before SetOperator - virtual void SetPreconditioner(RajaSolverOperator &pr) - { - prec = ≺ - prec->iterative_mode = false; - } - /// Also calls SetOperator for the preconditioner if there is one - virtual void SetOperator(const RajaOperator &op) - { - oper = &op; - height = op.Height(); - width = op.Width(); - if (prec) - { - prec->SetOperator(*oper); - } - } -}; - -// *************************************************************************** -// Conjugate gradient method -// *************************************************************************** -class RajaCGSolver : public RajaIterativeSolver -{ -protected: - mutable RajaVector r, d, z; - void UpdateVectors() - { - r.SetSize(width); - d.SetSize(width); - z.SetSize(width); - } -public: - RajaCGSolver() { } -#ifdef MFEM_USE_MPI - RajaCGSolver(MPI_Comm _comm) : RajaIterativeSolver(_comm) { } -#endif - virtual void SetOperator(const RajaOperator &op) - { - RajaIterativeSolver::SetOperator(op); - UpdateVectors(); - } - void h_Mult(const RajaVector &b, RajaVector &x) const ; - virtual void Mult(const RajaVector &b, RajaVector &x) const - { - h_Mult(b,x); - } -}; - -} // mfem - -#endif // LAGHOS_RAJA_SOLVERS diff --git a/raja/raja/linalg/rsolvers_d.cu b/raja/raja/linalg/rsolvers_d.cu deleted file mode 100644 index 24ec9e49..00000000 --- a/raja/raja/linalg/rsolvers_d.cu +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. 
-// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -#include "../raja.hpp" - -namespace mfem { - - // *************************************************************************** - static __device__ double Dot(const RajaVector &x, - const RajaVector &y, - const int dot_prod_type=1) { -#ifndef MFEM_USE_MPI - return (x * y); -#else - //if (dot_prod_type == 0) return (x * y); - - //const double local_dot = (x * y); - const double global_dot = 0.0; - //MPI_Allreduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, comm); - return global_dot; -#endif - } - // *************************************************************************** - static __global__ void cuCG(const RajaVector &b, RajaVector &x, - const int N, - const RajaOperator *oper, - RajaSolverOperator *prec, - const bool iterative_mode, - RajaVector &r, - RajaVector &d, - RajaVector &z, - double &final_norm, - const double rel_tol = 0.0, - const double abs_tol = 0.0, - const int max_iter = 1024, - const int print_level =0) { - int i,converged,final_iter; - double r0, den, nom, nom0, betanom;//, alpha, beta; - if (iterative_mode) { - //oper->Mult(x, r); - //r=b-r;//subtract(b, r, r); // r = b - A x - } - else - { - //r = b; - //x = 0.0; - //d_vector_op_eq(N,0.0,x); - } - - if (prec) - { - //prec->Mult(r, z); // z = B r - //d = z; - } - else - { - //d = r; - } - - nom0 = nom = Dot(d, r); - //MFEM_ASSERT(IsFinite(nom), "nom = " << nom); - - if (print_level == 1 || print_level == 3) { - printf(" Iteration : %d (B r, r) = %f", 0, nom); - } - - r0 = max(nom*rel_tol*rel_tol,abs_tol*abs_tol); - - if (nom <= r0) - { - converged = 1; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - //oper->Mult(d, z); // z = A d - - den = 
Dot(z, d); - MFEM_ASSERT(IsFinite(den), "den = " << den); - - if (print_level >= 0 && den < 0.0) { - printf("Negative denominator in step 0 of PCG: %f\n", den); - } - - if (den == 0.0) - { - converged = 0; - final_iter = 0; - final_norm = sqrt(nom); - return; - } - - // start iteration - converged = 0; - final_iter = max_iter; - for (i = 1; true; ){ - //alpha = nom/den; - //add(x, alpha, d, x); // x = x + alpha d - //add(r, -alpha, z, r); // r = r - alpha A d - if (prec) - { - //prec->Mult(r, z); // z = B r - betanom = Dot(r, z); - } - else - { - betanom = Dot(r, r); - } - - if (print_level == 1){ - printf(" Iteration : %d (B r, r) = %f\n",i,betanom); - } - - if (betanom < r0) - { - if (print_level == 2) - { - printf("Number of PCG iterations: %d\n",i); - } - else if (print_level == 3) - { - printf(" Iteration : %d (B r, r) = %f\n",i,betanom); - } - converged = 1; - final_iter = i; - break; - } - - if (++i > max_iter) - { - break; - } - - //beta = betanom/nom; - if (prec) - { - //add(z, beta, d, d); // d = z + beta d - } - else - { - //add(r, beta, d, d); - } - - //oper->Mult(d, z); // z = A d - - den = Dot(d, z); - - //assert(IsFinite(den))); - if (den <= 0.0) - { - if (print_level >= 0 && Dot(d, d) > 0.0) - printf("PCG: The operator is not positive definite. 
(Ad, d) = %f\n",den); - } - nom = betanom; - } - - if (print_level >= 0 && !converged) - { - if (print_level != 1) - { - if (print_level != 3) - { - printf(" Iteration : 0 (B r, r) = %f\n",nom0); - } - printf(" Iteration : %d (B r, r) = %f\n",final_iter, betanom); - } - printf("PCG: No convergence!\n"); - } - - if (print_level >= 1 || (print_level >= 0 && !converged)) { - printf("Average reduction factor = %f\n", - pow (betanom/nom0, 0.5/final_iter)); - } - final_norm = sqrt(betanom); - } - - // *************************************************************************** - void d_Mult(const RajaVector &b, - RajaVector &x, - const int N, - const RajaOperator *oper, - RajaSolverOperator *prec, - const bool iterative_mode, - RajaVector &r, - RajaVector &d, - RajaVector &z, - double &final_norm) { - assert(false); - cuCG<<<1,1>>>(b,x,b.Size(),oper,prec,iterative_mode,r,d,z,final_norm); - } - -} // mfem diff --git a/raja/raja/linalg/rvector.cpp b/raja/raja/linalg/rvector.cpp deleted file mode 100644 index cd6fa957..00000000 --- a/raja/raja/linalg/rvector.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights -// reserved. See file COPYRIGHT for details. -// -// This file is part of the MFEM library. For more information and source code -// availability see http://mfem.org. -// -// MFEM is free software; you can redistribute it and/or modify it under the -// terms of the GNU Lesser General Public License (as published by the Free -// Software Foundation) version 2.1 dated February 1999. -///////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2018,2019 Advanced Micro Devices, Inc. 
-///////////////////////////////////////////////////////////////////////////////// -#include "../raja.hpp" - -namespace mfem -{ - -RajaVector::~RajaVector() -{ - if (!own) { return; } - rmalloc::operator delete (data); -} - -// *************************************************************************** -double* RajaVector::alloc(const size_t sz) -{ - return (double*) rmalloc::operator new (sz); -} - -// *************************************************************************** -void RajaVector::SetSize(const size_t sz, const void* ptr) -{ - own=true; - size = sz; - if (!data) { data = alloc(sz); } - if (ptr) { rDtoD(data,ptr,bytes()); } -} - -// *************************************************************************** -RajaVector::RajaVector(const size_t sz):size(sz),data(alloc(sz)),own(true) {} -RajaVector::RajaVector(const size_t sz,double value): - size(sz),data(alloc(sz)),own(true) -{ - *this=value; -} - -RajaVector::RajaVector(const RajaVector& v): - size(0),data(NULL),own(true) { SetSize(v.Size(), v); } - -RajaVector::RajaVector(const RajaVector *v):size(v->size),data(v->data), - own(false) {} - -RajaVector::RajaVector(RajaArray& v):size(v.size()),data(v.ptr()), - own(false) {} - -// Host 2 Device *************************************************************** -RajaVector::RajaVector(const Vector& v):size(v.Size()),data(alloc(size)), - own(true) -{ - assert(v.GetData()); - rmemcpy::rHtoD(data,v.GetData(),size*sizeof(double)); -} - -// Device 2 Host *************************************************************** -RajaVector::operator Vector() -{ - if (!rconfig::Get().Cuda() && !rconfig::Get().Hip()) - { return Vector(data,size); } - double *h_data= (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -RajaVector::operator Vector() const -{ - if (!rconfig::Get().Cuda() && !rconfig::Get().Hip()) - { return Vector(data,size); } - double *h_data= 
(double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - Vector mfem_vector(h_data,size); - mfem_vector.MakeDataOwner(); - return mfem_vector; -} - -// *************************************************************************** -void RajaVector::Print(std::ostream& out, int width) const -{ - double *h_data = (double*) ::malloc(bytes()); - rmemcpy::rDtoH(h_data,data,bytes()); - for (size_t i=0; i &ess_tdofs, - const double value, - const int N) -{ - vector_set_subvector_const(N, value, data, ess_tdofs.ptr()); -} - - -// *************************************************************************** -double RajaVector::Min() const -{ - return vector_min(Size(),(double*)data); -} - -// *************************************************************************** -void add(const RajaVector& v1, const double alpha, - const RajaVector& v2, RajaVector& out) -{ - vector_xpay(out.Size(),alpha,out.ptr(),v1.ptr(),v2.ptr()); -} - -// ***************************************************************************** -void add(const double alpha, - const RajaVector& v1, - const double beta, - const RajaVector& v2, - RajaVector& out) { assert(false); } - -// *************************************************************************** -void subtract(const RajaVector& v1, - const RajaVector& v2, - RajaVector& out) -{ - vector_xsy(out.Size(),out.ptr(),v1.ptr(),v2.ptr()); -} - -} // mfem diff --git a/raja/raja/linalg/rvector.hpp b/raja/raja/linalg/rvector.hpp deleted file mode 100644 index c19bbd9d..00000000 --- a/raja/raja/linalg/rvector.hpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. 
-// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. -#ifndef LAGHOS_RAJA_VECTOR -#define LAGHOS_RAJA_VECTOR - -namespace mfem -{ - -class RajaVector : public rmalloc -{ -private: - size_t size = 0; - double* data = NULL; - bool own = true; -public: - RajaVector(): size(0),data(NULL),own(true) {} - RajaVector(const RajaVector&); - RajaVector(const RajaVector*); - RajaVector(const size_t); - RajaVector(const size_t,double); - RajaVector(const Vector& v); - RajaVector(RajaArray& v); - operator Vector(); - operator Vector() const; - double* alloc(const size_t); - inline double* ptr() const { return data;} - inline double* GetData() const { return data;} - inline operator double* () { return data; } - inline operator const double* () const { return data; } - void Print(std::ostream& = std::cout, int = 8) const; - void SetSize(const size_t,const void* =NULL); - inline size_t Size() const { return size; } - inline size_t bytes() const { return size*sizeof(double); } - double operator* (const RajaVector& v) const; - RajaVector& operator = (const RajaVector& v); - RajaVector& operator = (const Vector& v); - RajaVector& operator = (double value); - RajaVector& operator -= (const RajaVector& v); - RajaVector& operator += (const RajaVector& v); - RajaVector& 
operator += (const Vector& v); - RajaVector& operator *=(const double d); - RajaVector& Add(const double a, const RajaVector& Va); - void Neg(); - RajaVector* GetRange(const size_t, const size_t) const; - void SetSubVector(const RajaArray &, const double, const int); - double Min() const; - ~RajaVector(); -}; - -// *************************************************************************** -void add(const RajaVector&,const double,const RajaVector&,RajaVector&); -void add(const RajaVector&,const RajaVector&,RajaVector&); -void add(const double,const RajaVector&,const double,const RajaVector&, - RajaVector&); -void subtract(const RajaVector&,const RajaVector&,RajaVector&); - -} - -#endif // LAGHOS_RAJA_VECTOR diff --git a/raja/raja/raja.hpp b/raja/raja/raja.hpp deleted file mode 100644 index c443ab76..00000000 --- a/raja/raja/raja.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
-#ifndef LAGHOS_RAJA -#define LAGHOS_RAJA - -// stdincs ********************************************************************* -#include -#include -#include - -// ***************************************************************************** -#include "RAJA/RAJA.hpp" - -// MFEM/fem ******************************************************************* -#include "fem/gridfunc.hpp" -#include "general/communication.hpp" -#include "fem/pfespace.hpp" - -// LAGHOS/raja/config ********************************************************** -#include "config/rconfig.hpp" - -// LAGHOS/raja/general ********************************************************* -#include "general/rmemcpy.hpp" -#include "general/rmalloc.hpp" -#include "general/rarray.hpp" -#include "general/rtable.hpp" -#include "general/rcommd.hpp" - -// LAGHOS/raja/linalg ********************************************************** -#include "linalg/rvector.hpp" -#include "linalg/roperator.hpp" -#include "linalg/rode.hpp" -#include "linalg/rsolvers.hpp" - -// LAGHOS/raja/kernels ********************************************************* -#include "kernels/include/kernels.hpp" - -// LAGHOS/raja/fem ************************************************************* -#include "fem/rconform.hpp" -#include "fem/rprolong.hpp" -#include "fem/rrestrict.hpp" -#include "fem/rfespace.hpp" -#include "fem/rbilinearform.hpp" -#include "fem/rgridfunc.hpp" -#include "fem/rbilininteg.hpp" - -#endif // LAGHOS_RAJA - diff --git a/serial/README.md b/serial/README.md index 9c506653..080c0f17 100644 --- a/serial/README.md +++ b/serial/README.md @@ -88,25 +88,36 @@ of Laghos, without MPI parallelization. To make sure the results are correct, we tabulate reference final iterations (`step`), time steps (`dt`) and energies (`|e|`) for the runs listed below: -1. `./laghos -p 0 -m ../data/square01_quad.mesh -rs 3 -tf 0.75 -pa` -2. `./laghos -p 0 -m ../data/cube01_hex.mesh -rs 1 -tf 0.75 -pa` -3. 
`./laghos -p 1 -m ../data/square01_quad.mesh -rs 3 -tf 0.8 -pa` -4. `./laghos -p 1 -m ../data/cube01_hex.mesh -rs 2 -tf 0.6 -pa` -5. `./laghos -p 2 -m ../data/segment01.mesh -rs 5 -tf 0.2 -fa` +1. `./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa` +2. `./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa` +3. `./laghos -p 1 -dim 2 -rs 3 -tf 0.8 -pa` +4. `./laghos -p 1 -dim 3 -rs 2 -tf 0.6 -pa` +5. `./laghos -p 2 -dim 1 -rs 5 -tf 0.2 -fa` 6. `./laghos -p 3 -m ../data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa` 7. `./laghos -p 3 -m ../data/box01_hex.mesh -rs 1 -tf 3.0 -pa` 8. `./laghos -p 4 -m ../data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa` | `run` | `step` | `dt` | `e` | | ----- | ------ | ---- | --- | -| 1. | 339 | 0.000702 | 49.6955373491 | -| 2. | 1041 | 0.000121 | 3390.9635545458 | -| 3. | 1154 | 0.001655 | 46.3033960530 | -| 4. | 560 | 0.002449 | 134.0861672181 | -| 5. | 413 | 0.000470 | 32.0120774101 | -| 6. | 5301 | 0.000360 | 141.8352298401 | -| 7. | 975 | 0.001601 | 144.2461751623 | -| 8. | 776 | 0.000045 | 409.8243172608 | +| 1. | 339 | 0.000702 | 4.9695537349e+01 | +| 2. | 1041 | 0.000121 | 3.3909635545e+03 | +| 3. | 1154 | 0.001655 | 4.6303396053e+01 | +| 4. | 560 | 0.002449 | 1.3408616722e+02 | +| 5. | 413 | 0.000470 | 3.2012077410e+01 | +| 6. | 2872 | 0.000064 | 5.6547039096e+01 | +| 7. | 528 | 0.000180 | 5.6505348812e+01 | +| 8. | 776 | 0.000045 | 4.0982431726e+02 | + +Similar CUDA runs can be launched with these commands: + +1. `./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa -d cuda` +2. `./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa -d cuda` +3. `./laghos -p 1 -dim 2 -rs 3 -tf 0.80 -pa -d cuda` +4. `./laghos -p 1 -dim 3 -rs 2 -tf 0.60 -pa -d cuda` +5. `./laghos -p 2 -dim 1 -rs 5 -tf 0.20 -fa` +6. `./laghos -p 3 -m ../data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -d cuda` +7. `./laghos -p 3 -m ../data/box01_hex.mesh -rs 1 -tf 3.0 -pa -cgt 1e-12 -d cuda` +8. 
`./laghos -p 4 -m ../data/square_gresho.mesh -rs 3 -ok 3 -ot 2 -tf 0.62831853 -s 7 -pa -d cuda` An implementation is considered valid if the final energy values are all within round-off distance from the above reference values. diff --git a/serial/laghos.cpp b/serial/laghos.cpp index 8ebbf411..fdb672ff 100644 --- a/serial/laghos.cpp +++ b/serial/laghos.cpp @@ -23,8 +23,6 @@ // // High-order Lagrangian Hydrodynamics Miniapp // -// SERIAL version -// // Laghos(LAGrangian High-Order Solver) is a miniapp that solves the // time-dependent Euler equation of compressible gas dynamics in a moving // Lagrangian frame using unstructured high-order finite element spatial @@ -41,27 +39,32 @@ // p = 2 --> 1D Sod shock tube. // p = 3 --> Triple point. // p = 4 --> Gresho vortex (smooth problem). +// p = 5 --> 2D Riemann problem, config. 12 of doi.org/10.1002/num.10025 +// p = 6 --> 2D Riemann problem, config. 6 of doi.org/10.1002/num.10025 // -// Sample runs: see README.md, section 'Verification of Results' -// All tests should be run in serial with the correct path to the mesh files. -// +// Sample runs: see README.md, section 'Verification of Results'. -#include "laghos_solver.hpp" -#include "laghos_timeinteg.hpp" #include +#include +#include +#include "laghos_solver.hpp" -using namespace std; +using std::cout; +using std::endl; using namespace mfem; -using namespace mfem::hydrodynamics; // Choice for the problem setup. -int problem; +static int problem; -double rho0(const Vector &); -void v0(const Vector &, Vector &); +// Forward declarations. double e0(const Vector &); +double rho0(const Vector &); double gamma(const Vector &); -void display_banner(ostream & os); +void v0(const Vector &, Vector &); + +static long GetMaxRssMB(); +static void display_banner(std::ostream&); +static void Checks(const int dim, const int ti, const double norm, int &checks); int main(int argc, char *argv[]) { @@ -70,14 +73,17 @@ int main(int argc, char *argv[]) // Parse command-line options. 
problem = 1; - const char *mesh_file = "../data/cube01_hex.mesh"; + int dim = 3; + const char *mesh_file = "default"; int rs_levels = 2; int order_v = 2; int order_e = 1; + int order_q = -1; int ode_solver_type = 4; double t_final = 0.6; double cfl = 0.5; double cg_tol = 1e-8; + double ftz_tol = 0.0; int cg_max_iter = 300; int max_tsteps = -1; bool p_assembly = true; @@ -87,10 +93,17 @@ int main(int argc, char *argv[]) bool visit = false; bool gfprint = false; const char *basename = "results/Laghos"; + const char *device = "cpu"; + bool check = false; + bool mem_usage = false; + bool fom = false; + int dev = 0; + double blast_energy = 0.25; + double blast_position[] = {0.0, 0.0, 0.0}; OptionsParser args(argc, argv); - args.AddOption(&mesh_file, "-m", "--mesh", - "Mesh file to use."); + args.AddOption(&dim, "-dim", "--dimension", "Dimension of the problem."); + args.AddOption(&mesh_file, "-m", "--mesh", "Mesh file to use."); args.AddOption(&rs_levels, "-rs", "--refine-serial", "Number of times to refine the mesh uniformly in serial."); args.AddOption(&problem, "-p", "--problem", "Problem setup to use."); @@ -98,6 +111,8 @@ int main(int argc, char *argv[]) "Order (degree) of the kinematic finite element space."); args.AddOption(&order_e, "-ot", "--order-thermo", "Order (degree) of the thermodynamic finite element space."); + args.AddOption(&order_q, "-oq", "--order-intrule", + "Order of the integration rule."); args.AddOption(&ode_solver_type, "-s", "--ode-solver", "ODE solver: 1 - Forward Euler,\n\t" " 2 - RK2 SSP, 3 - RK3 SSP, 4 - RK4, 6 - RK6,\n\t" @@ -107,6 +122,8 @@ int main(int argc, char *argv[]) args.AddOption(&cfl, "-cfl", "--cfl", "CFL-condition number."); args.AddOption(&cg_tol, "-cgt", "--cg-tol", "Relative CG tolerance (velocity linear solve)."); + args.AddOption(&ftz_tol, "-ftz", "--ftz-tol", + "Absolute flush-to-zero tolerance."); args.AddOption(&cg_max_iter, "-cgm", "--cg-max-steps", "Maximum number of CG iterations (velocity linear solve)."); 
args.AddOption(&max_tsteps, "-ms", "--max-steps", @@ -128,6 +145,15 @@ int main(int argc, char *argv[]) "Enable or disable result output (files in mfem format)."); args.AddOption(&basename, "-k", "--outputfilename", "Name of the visit dump files"); + args.AddOption(&device, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.AddOption(&check, "-chk", "--checks", "-no-chk", "--no-checks", + "Enable 2D checks."); + args.AddOption(&mem_usage, "-mb", "--mem", "-no-mem", "--no-mem", + "Enable memory usage."); + args.AddOption(&fom, "-f", "--fom", "-no-fom", "--no-fom", + "Enable figure of merit output."); + args.AddOption(&dev, "-dev", "--dev", "GPU device to use."); args.Parse(); if (!args.Good()) { @@ -136,18 +162,66 @@ int main(int argc, char *argv[]) } args.PrintOptions(cout); - // Read the serial mesh from the given mesh file on all processors. - // Refine the mesh in serial to increase the resolution. - Mesh *mesh = new Mesh(mesh_file, 1, 1); - const int dim = mesh->Dimension(); - for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } + // Configure the device from the command line options + Device backend; + backend.Configure(device, dev); + backend.Print(); + // On all processors, use the default builtin 1D/2D/3D mesh or read the + // serial one given on the command line. + Mesh *mesh; + if (strncmp(mesh_file, "default", 7) != 0) + { + mesh = new Mesh(mesh_file, true, true); + } + else + { + if (dim == 1) + { + mesh = new Mesh(2); + mesh->GetBdrElement(0)->SetAttribute(1); + mesh->GetBdrElement(1)->SetAttribute(1); + } + if (dim == 2) + { + mesh = new Mesh(2, 2, Element::QUADRILATERAL, true); + const int NBE = mesh->GetNBE(); + for (int b = 0; b < NBE; b++) + { + Element *bel = mesh->GetBdrElement(b); + const int attr = (b < NBE/2) ? 
2 : 1; + bel->SetAttribute(attr); + } + } + if (dim == 3) + { + mesh = new Mesh(2, 2, 2, Element::HEXAHEDRON, true); + const int NBE = mesh->GetNBE(); + for (int b = 0; b < NBE; b++) + { + Element *bel = mesh->GetBdrElement(b); + const int attr = (b < NBE/3) ? 3 : (b < 2*NBE/3) ? 1 : 2; + bel->SetAttribute(attr); + } + } + } + dim = mesh->Dimension(); + + // 1D vs partial assembly sanity check. if (p_assembly && dim == 1) { p_assembly = false; cout << "Laghos does not support PA in 1D. Switching to FA." << endl; } + // Refine the mesh in serial to increase the resolution. + for (int lev = 0; lev < rs_levels; lev++) { mesh->UniformRefinement(); } + const int mesh_NE = mesh->GetNE(); + cout << "Number of zones in the serial mesh: " << mesh_NE << endl; + + const int NE = mesh->GetNE(); + cout << "Zones: " << NE << endl; + // Define the parallel finite element spaces. We use: // - H1 (Gauss-Lobatto, continuous) for position and velocity. // - L2 (Bernstein, discontinuous) for specific internal energy. @@ -158,17 +232,16 @@ int main(int argc, char *argv[]) // Boundary conditions: all tests use v.n = 0 on the boundary, and we assume // that the boundaries are straight. - Array vdofs_marker, ess_vdofs; + Array ess_tdofs; { - Array ess_bdr(mesh->bdr_attributes.Max()), vdofs1d; + Array ess_bdr(mesh->bdr_attributes.Max()), tdofs1d; for (int d = 0; d < mesh->Dimension(); d++) { - // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., we must - // enforce v_x/y/z = 0 for the velocity components. + // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, + // i.e., we must enforce v_x/y/z = 0 for the velocity components. 
ess_bdr = 0; ess_bdr[d] = 1; - H1FESpace.GetEssentialVDofs(ess_bdr, vdofs_marker, d); - FiniteElementSpace::MarkerToList(vdofs_marker, vdofs1d); - ess_vdofs.Append(vdofs1d); + H1FESpace.GetEssentialTrueDofs(ess_bdr, tdofs1d, d); + ess_tdofs.Append(tdofs1d); } } @@ -188,9 +261,8 @@ int main(int argc, char *argv[]) return 3; } - int Vsize_l2 = L2FESpace.GetVSize(); - int Vsize_h1 = H1FESpace.GetVSize(); - + const int Vsize_l2 = L2FESpace.GetVSize(); + const int Vsize_h1 = H1FESpace.GetVSize(); cout << "Number of kinematic (position, velocity) dofs: " << Vsize_h1 << endl; cout << "Number of specific internal energy dofs: " @@ -200,16 +272,15 @@ int main(int argc, char *argv[]) // - 0 -> position // - 1 -> velocity // - 2 -> specific internal energy - Array true_offset(4); true_offset[0] = 0; true_offset[1] = true_offset[0] + Vsize_h1; true_offset[2] = true_offset[1] + Vsize_h1; true_offset[3] = true_offset[2] + Vsize_l2; - BlockVector S(true_offset); + BlockVector S(true_offset, Device::GetMemoryType()); // Define GridFunction objects for the position, velocity and specific - // internal energy. There is no function for the density, as we can always + // internal energy. There is no function for the density, as we can always // compute the density values given the current mesh position, using the // property of pointwise mass conservation. GridFunction x_gf, v_gf, e_gf; @@ -219,28 +290,33 @@ int main(int argc, char *argv[]) // Initialize x_gf using the starting mesh coordinates. mesh->SetNodalGridFunction(&x_gf); + // Sync the data location of x_gf with its base, S + x_gf.SyncAliasMemory(S); // Initialize the velocity. VectorFunctionCoefficient v_coeff(mesh->Dimension(), v0); v_gf.ProjectCoefficient(v_coeff); + // Sync the data location of v_gf with its base, S + v_gf.SyncAliasMemory(S); // Initialize density and specific internal energy values. We interpolate in - // a non-positive basis to get the correct values at the dofs. 
Then we do an + // a non-positive basis to get the correct values at the dofs. Then we do an // L2 projection to the positive basis in which we actually compute. The goal // is to get a high-order representation of the initial condition. Note that // this density is a temporary function and it will not be updated during the // time evolution. - GridFunction rho(&L2FESpace); - FunctionCoefficient rho_coeff(rho0); + GridFunction rho0_gf(&L2FESpace); + FunctionCoefficient rho0_coeff(rho0); L2_FECollection l2_fec(order_e, mesh->Dimension()); FiniteElementSpace l2_fes(mesh, &l2_fec); - GridFunction l2_rho(&l2_fes), l2_e(&l2_fes); - l2_rho.ProjectCoefficient(rho_coeff); - rho.ProjectGridFunction(l2_rho); + GridFunction l2_rho0_gf(&l2_fes), l2_e(&l2_fes); + l2_rho0_gf.ProjectCoefficient(rho0_coeff); + rho0_gf.ProjectGridFunction(l2_rho0_gf); if (problem == 1) { // For the Sedov test, we use a delta function at the origin. - DeltaCoefficient e_coeff(0, 0, 0.25); + DeltaCoefficient e_coeff(blast_position[0], blast_position[1], + blast_position[2], blast_energy); l2_e.ProjectCoefficient(e_coeff); } else @@ -249,68 +325,69 @@ int main(int argc, char *argv[]) l2_e.ProjectCoefficient(e_coeff); } e_gf.ProjectGridFunction(l2_e); + // Sync the data location of e_gf with its base, S + e_gf.SyncAliasMemory(S); - // Piecewise constant ideal gas coefficient over the Lagrangian mesh. The - // gamma values are projected on a function that stays constant on the moving - // mesh. + // Piecewise constant ideal gas coefficient over the Lagrangian mesh. + // gamma values are projected on function that's constant on the moving mesh. L2_FECollection mat_fec(0, mesh->Dimension()); FiniteElementSpace mat_fes(mesh, &mat_fec); GridFunction mat_gf(&mat_fes); FunctionCoefficient mat_coeff(gamma); mat_gf.ProjectCoefficient(mat_coeff); - GridFunctionCoefficient *mat_gf_coeff = new GridFunctionCoefficient(&mat_gf); // Additional details, depending on the problem. 
int source = 0; bool visc = true; switch (problem) { - case 0: if (mesh->Dimension() == 2) { source = 1; } - visc = false; break; + case 0: if (mesh->Dimension() == 2) { source = 1; } visc = false; break; case 1: visc = true; break; case 2: visc = true; break; - case 3: visc = true; break; + case 3: visc = true; S.HostRead(); break; case 4: visc = false; break; + case 5: visc = true; break; + case 6: visc = true; break; default: MFEM_ABORT("Wrong problem specification!"); } if (impose_visc) { visc = true; } - LagrangianHydroOperator oper(S.Size(), H1FESpace, L2FESpace, - ess_vdofs, rho, source, cfl, mat_gf_coeff, - visc, p_assembly, cg_tol, cg_max_iter, - H1FEC.GetBasisType()); + hydrodynamics::LagrangianHydroOperator hydro(S.Size(), + H1FESpace, L2FESpace, ess_tdofs, + rho0_coeff, rho0_gf, + mat_coeff, mat_gf, + source, cfl, + visc, p_assembly, + cg_tol, cg_max_iter, ftz_tol, + order_q); socketstream vis_rho, vis_v, vis_e; char vishost[] = "localhost"; int visport = 19916; GridFunction rho_gf; - if (visualization || visit) { oper.ComputeDensity(rho_gf); } - - const double energy_init = oper.InternalEnergy(e_gf) + - oper.KineticEnergy(v_gf); + if (visualization || visit) { hydro.ComputeDensity(rho_gf); } + const double energy_init = hydro.InternalEnergy(e_gf) + + hydro.KineticEnergy(v_gf); if (visualization) { vis_rho.precision(8); vis_v.precision(8); vis_e.precision(8); - int Wx = 0, Wy = 0; // window position const int Ww = 350, Wh = 350; // window size int offx = Ww+10; // window offsets - if (problem != 0 && problem != 4) { - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_rho, vishost, visport, rho_gf, + "Density", Wx, Wy, Ww, Wh); } - Wx += offx; - VisualizeField(vis_v, vishost, visport, v_gf, - "Velocity", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_v, vishost, visport, v_gf, + "Velocity", Wx, Wy, Ww, Wh); Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific 
Internal Energy", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_e, vishost, visport, e_gf, + "Specific Internal Energy", Wx, Wy, Ww, Wh); } // Save data for VisIt visualization. @@ -328,12 +405,15 @@ int main(int argc, char *argv[]) // Perform time-integration (looping over the time iterations, ti, with a // time-step dt). The object oper is of type LagrangianHydroOperator that // defines the Mult() method that used by the time integrators. - ode_solver->Init(oper); - oper.ResetTimeStepEstimate(); - double t = 0.0, dt = oper.GetTimeStepEstimate(S), t_old; + ode_solver->Init(hydro); + hydro.ResetTimeStepEstimate(); + double t = 0.0, dt = hydro.GetTimeStepEstimate(S), t_old; bool last_step = false; int steps = 0; BlockVector S_old(S); + long mem=0, mmax=0, msum=0; + int checks = 0; + for (int ti = 1; !last_step; ti++) { if (t + dt >= t_final) @@ -342,10 +422,9 @@ int main(int argc, char *argv[]) last_step = true; } if (steps == max_tsteps) { last_step = true; } - S_old = S; t_old = t; - oper.ResetTimeStepEstimate(); + hydro.ResetTimeStepEstimate(); // S is the vector of dofs, t is the current time, and dt is the time step // to advance. @@ -353,23 +432,30 @@ int main(int argc, char *argv[]) steps++; // Adaptive time step control. - const double dt_est = oper.GetTimeStepEstimate(S); + const double dt_est = hydro.GetTimeStepEstimate(S); if (dt_est < dt) { // Repeat (solve again) with a decreased time step - decrease of the // time estimate suggests appearance of oscillations. dt *= 0.85; - if (dt < numeric_limits::epsilon()) + if (dt < std::numeric_limits::epsilon()) { MFEM_ABORT("The time step crashed!"); } t = t_old; S = S_old; - oper.ResetQuadratureData(); + hydro.ResetQuadratureData(); cout << "Repeating step " << ti << endl; if (steps < max_tsteps) { last_step = false; } ti--; continue; } else if (dt_est > 1.25 * dt) { dt *= 1.02; } + // Ensure the sub-vectors x_gf, v_gf, and e_gf know the location of the + // data in S. 
This operation simply updates the Memory validity flags of + // the sub-vectors to match those of S. + x_gf.SyncAliasMemory(S); + v_gf.SyncAliasMemory(S); + e_gf.SyncAliasMemory(S); + // Make sure that the mesh corresponds to the new solution state. This is // needed, because some time integrators use different S-type vectors // and the oper object might have redirected the mesh positions to those. @@ -377,33 +463,37 @@ int main(int argc, char *argv[]) if (last_step || (ti % vis_steps) == 0) { - const double loc_norm = e_gf * e_gf; - cout << fixed; - cout << "step " << setw(5) << ti - << ",\tt = " << setw(5) << setprecision(4) << t - << ",\tdt = " << setw(5) << setprecision(6) << dt - << ",\t|e| = " << setprecision(10) - << sqrt(loc_norm) << endl; - - if (visualization || visit) { oper.ComputeDensity(rho_gf); } + double norm = e_gf * e_gf; + if (mem_usage) { mem = GetMaxRssMB(); } + const double sqrt_norm = sqrt(norm); + cout << std::fixed; + cout << "step " << std::setw(5) << ti + << ",\tt = " << std::setw(5) << std::setprecision(4) << t + << ",\tdt = " << std::setw(5) << std::setprecision(6) << dt + << ",\t|e| = " << std::setprecision(10) << std::scientific + << sqrt_norm; + cout << std::fixed; + if (mem_usage) { cout << ", mem: " << mem << " MB"; } + cout << endl; + + if (visualization || visit || gfprint) { hydro.ComputeDensity(rho_gf); } if (visualization) { int Wx = 0, Wy = 0; // window position int Ww = 350, Wh = 350; // window size int offx = Ww+10; // window offsets - if (problem != 0 && problem != 4) { - VisualizeField(vis_rho, vishost, visport, rho_gf, - "Density", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_rho, vishost, visport, rho_gf, + "Density", Wx, Wy, Ww, Wh); } - Wx += offx; - VisualizeField(vis_v, vishost, visport, - v_gf, "Velocity", Wx, Wy, Ww, Wh); + hydrodynamics::VisualizeField(vis_v, vishost, visport, + v_gf, "Velocity", Wx, Wy, Ww, Wh); Wx += offx; - VisualizeField(vis_e, vishost, visport, e_gf, - "Specific Internal Energy", Wx, 
Wy, Ww,Wh); + hydrodynamics::VisualizeField(vis_e, vishost, visport, e_gf, + "Specific Internal Energy", + Wx, Wy, Ww,Wh); Wx += offx; } @@ -416,34 +506,51 @@ int main(int argc, char *argv[]) if (gfprint) { - ostringstream v_name, rho_name, e_name, m_name; + std::ostringstream v_name, rho_name, e_name, m_name; m_name << basename << "_" << ti << "_mesh"; rho_name << basename << "_" << ti << "_rho"; v_name << basename << "_" << ti << "_v"; e_name << basename << "_" << ti << "_e"; - ofstream mesh_ofs(m_name.str().c_str()); + std::ofstream mesh_ofs(m_name.str().c_str()); mesh_ofs.precision(8); mesh->Print(mesh_ofs); mesh_ofs.close(); - ofstream rho_ofs(rho_name.str().c_str()); + std::ofstream rho_ofs(rho_name.str().c_str()); rho_ofs.precision(8); rho_gf.Save(rho_ofs); rho_ofs.close(); - ofstream v_ofs(v_name.str().c_str()); + std::ofstream v_ofs(v_name.str().c_str()); v_ofs.precision(8); v_gf.Save(v_ofs); v_ofs.close(); - ofstream e_ofs(e_name.str().c_str()); + std::ofstream e_ofs(e_name.str().c_str()); e_ofs.precision(8); e_gf.Save(e_ofs); e_ofs.close(); } } + + // Problems checks + if (check) + { + const double norm = e_gf * e_gf; + const double e_norm = sqrt(norm); + MFEM_VERIFY(rs_levels==0, "check: rs"); + MFEM_VERIFY(order_v==2, "check: order_v"); + MFEM_VERIFY(order_e==1, "check: order_e"); + MFEM_VERIFY(ode_solver_type==4, "check: ode_solver_type"); + MFEM_VERIFY(t_final == 0.6, "check: t_final"); + MFEM_VERIFY(cfl==0.5, "check: cfl"); + MFEM_VERIFY(strncmp(mesh_file, "default", 7) == 0, "check: mesh_file"); + MFEM_VERIFY(dim==2 || dim==3, "check: dimension"); + Checks(dim, ti, e_norm, checks); + } } + MFEM_VERIFY(!check || checks == 2, "Check error!"); switch (ode_solver_type) { @@ -453,13 +560,24 @@ int main(int argc, char *argv[]) case 6: steps *= 6; break; case 7: steps *= 2; } - oper.PrintTimingData(steps); - const double energy_final = oper.InternalEnergy(e_gf) + - oper.KineticEnergy(v_gf); + hydro.PrintTimingData(steps, fom); + + if (mem_usage) { mem = 
GetMaxRssMB(); } + + const double energy_final = hydro.InternalEnergy(e_gf) + + hydro.KineticEnergy(v_gf); cout << endl; - cout << "Energy diff: " << scientific << setprecision(2) - << fabs(energy_init - energy_final) << endl; + if (!p_assembly) + { + cout << "Energy diff: " << std::scientific << std::setprecision(2) + << fabs(energy_init - energy_final) << endl; + } + if (mem_usage) + { + cout << "Maximum memory resident set size: " + << mmax << "/" << msum << " MB" << endl; + } // Print the error. // For problems 0 and 4 the exact velocity is constant in time. @@ -482,7 +600,6 @@ int main(int argc, char *argv[]) // Free the used memory. delete ode_solver; delete mesh; - delete mat_gf_coeff; return 0; } @@ -494,8 +611,20 @@ double rho0(const Vector &x) case 0: return 1.0; case 1: return 1.0; case 2: return (x(0) < 0.5) ? 1.0 : 0.1; - case 3: return (x(0) > 1.0 && x(1) <= 1.5) ? 1.0 : 0.125; + case 3: return (x(0) > 1.0 && x(1) > 1.5) ? 0.125 : 1.0; case 4: return 1.0; + case 5: + { + if (x(0) >= 0.5 && x(1) >= 0.5) { return 0.5313; } + if (x(0) < 0.5 && x(1) < 0.5) { return 0.8; } + return 1.0; + } + case 6: + { + if (x(0) < 0.5 && x(1) >= 0.5) { return 2.0; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 3.0; } + return 1.0; + } default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } @@ -509,17 +638,17 @@ double gamma(const Vector &x) case 2: return 1.4; case 3: return (x(0) > 1.0 && x(1) <= 1.5) ? 
1.4 : 1.5; case 4: return 5.0 / 3.0; + case 5: return 1.4; + case 6: return 1.4; default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } -double rad(double x, double y) -{ - return sqrt(x*x + y*y); -} +static double rad(double x, double y) { return sqrt(x*x + y*y); } void v0(const Vector &x, Vector &v) { + const double atn = pow((x(0)*(1.0-x(0))*4*x(1)*(1.0-x(1))*4.0),0.4); switch (problem) { case 0: @@ -537,6 +666,7 @@ void v0(const Vector &x, Vector &v) case 3: v = 0.0; break; case 4: { + v = 0.0; const double r = rad(x(0), x(1)); if (r < 0.2) { @@ -548,9 +678,29 @@ void v0(const Vector &x, Vector &v) v(0) = 2.0 * x(1) / r - 5.0 * x(1); v(1) = -2.0 * x(0) / r + 5.0 * x(0); } - else { v = 0.0; } + else { } break; } + case 5: + { + v = 0.0; + if (x(0) >= 0.5 && x(1) >= 0.5) { v(0)=0.0*atn, v(1)=0.0*atn; return;} + if (x(0) < 0.5 && x(1) >= 0.5) { v(0)=0.7276*atn, v(1)=0.0*atn; return;} + if (x(0) < 0.5 && x(1) < 0.5) { v(0)=0.0*atn, v(1)=0.0*atn; return;} + if (x(0) >= 0.5 && x(1) < 0.5) { v(0)=0.0*atn, v(1)=0.7276*atn; return; } + MFEM_ABORT("Error in problem 5!"); + return; + } + case 6: + { + v = 0.0; + if (x(0) >= 0.5 && x(1) >= 0.5) { v(0)=+0.75*atn, v(1)=-0.5*atn; return;} + if (x(0) < 0.5 && x(1) >= 0.5) { v(0)=+0.75*atn, v(1)=+0.5*atn; return;} + if (x(0) < 0.5 && x(1) < 0.5) { v(0)=-0.75*atn, v(1)=+0.5*atn; return;} + if (x(0) >= 0.5 && x(1) < 0.5) { v(0)=-0.75*atn, v(1)=-0.5*atn; return;} + MFEM_ABORT("Error in problem 6!"); + return; + } default: MFEM_ABORT("Bad number given for problem id!"); } } @@ -595,11 +745,31 @@ double e0(const Vector &x) } else { return (3.0 + 4.0 * log(2.0)) / (gamma - 1.0); } } + case 5: + { + const double irg = 1.0 / rho0(x) / (gamma(x) - 1.0); + if (x(0) >= 0.5 && x(1) >= 0.5) { return 0.4 * irg; } + if (x(0) < 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) < 0.5) { return 1.0 * irg; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 1.0 * irg; } + MFEM_ABORT("Error in problem 5!"); + return 0.0; 
+ } + case 6: + { + const double irg = 1.0 / rho0(x) / (gamma(x) - 1.0); + if (x(0) >= 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) >= 0.5) { return 1.0 * irg; } + if (x(0) < 0.5 && x(1) < 0.5) { return 1.0 * irg; } + if (x(0) >= 0.5 && x(1) < 0.5) { return 1.0 * irg; } + MFEM_ABORT("Error in problem 6!"); + return 0.0; + } default: MFEM_ABORT("Bad number given for problem id!"); return 0.0; } } -void display_banner(ostream & os) +static void display_banner(std::ostream &os) { os << endl << " __ __ " << endl @@ -609,3 +779,91 @@ void display_banner(ostream & os) << " /_____/\\__,_/\\__, /_/ /_/\\____/____/ " << endl << " /____/ " << endl << endl; } + +static long GetMaxRssMB() +{ + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { return -1; } +#ifndef __APPLE__ + const long unit = 1024; // kilo +#else + const long unit = 1024*1024; // mega +#endif + return usage.ru_maxrss/unit; // mega bytes +} + +static bool rerr(const double a, const double v, const double eps) +{ + MFEM_VERIFY(fabs(a) > eps && fabs(v) > eps, "One value is near zero!"); + const double err_a = fabs((a-v)/a); + const double err_v = fabs((a-v)/v); + return fmax(err_a, err_v) < eps; +} + +static void Checks(const int dim, const int ti, const double nrm, int &chk) +{ + const int pb = problem; + const double eps = 1.e-13; + if (dim==2) + { + const double p0_05 = 6.54653862453438e+00; + const double p0_27 = 7.58857635779292e+00; + if (pb==0 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p0_05,eps),"P0, #05");} + if (pb==0 && ti==27) {chk++; MFEM_VERIFY(rerr(nrm,p0_27,eps),"P0, #27");} + const double p1_05 = 3.50825494522579e+00; + const double p1_15 = 2.75644459682321e+00; + if (pb==1 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p1_05,eps),"P1, #05");} + if (pb==1 && ti==15) {chk++; MFEM_VERIFY(rerr(nrm,p1_15,eps),"P1, #15");} + const double p2_05 = 1.02074579565124e+01; + const double p2_59 = 1.72159020590190e+01; + if (pb==2 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p2_05,eps),"P2, 
#05");} + if (pb==2 && ti==59) {chk++; MFEM_VERIFY(rerr(nrm,p2_59,eps),"P2, #59");} + const double p3_05 = 8.0; + const double p3_16 = 8.0; + if (pb==3 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p3_05,eps),"P3, #05");} + if (pb==3 && ti==16) {chk++; MFEM_VERIFY(rerr(nrm,p3_16,eps),"P3, #16");} + const double p4_05 = 3.436923188323578e+01; + const double p4_52 = 2.682244912720685e+01; + if (pb==4 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p4_05,eps),"P4, #05");} + if (pb==4 && ti==52) {chk++; MFEM_VERIFY(rerr(nrm,p4_52,eps),"P4, #52");} + const double p5_05 = 1.030899557252528e+01; + const double p5_36 = 1.057362418574309e+01; + if (pb==5 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p5_05,eps),"P5, #05");} + if (pb==5 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p5_36,eps),"P5, #36");} + const double p6_05 = 8.039707010835693e+00; + const double p6_36 = 8.316970976817373e+00; + if (pb==6 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p6_05,eps),"P6, #05");} + if (pb==6 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p6_36,eps),"P6, #36");} + } + if (dim==3) + { + const double p0_05 = 1.198510951452527e+03; + const double p0_188 = 1.199384410059154e+03; + if (pb==0 && ti==005) {chk++; MFEM_VERIFY(rerr(nrm,p0_05,eps),"P0, #05");} + if (pb==0 && ti==188) {chk++; MFEM_VERIFY(rerr(nrm,p0_188,eps),"P0, #188");} + const double p1_05 = 1.33916371859257e+01; + const double p1_28 = 7.52107367739800e+00; + if (pb==1 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p1_05,eps),"P1, #05");} + if (pb==1 && ti==28) {chk++; MFEM_VERIFY(rerr(nrm,p1_28,eps),"P1, #28");} + const double p2_05 = 2.041491591302486e+01; + const double p2_59 = 3.443180411803796e+01; + if (pb==2 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p2_05,eps),"P2, #05");} + if (pb==2 && ti==59) {chk++; MFEM_VERIFY(rerr(nrm,p2_59,eps),"P2, #59");} + const double p3_05 = 1.600000000000000e+01; + const double p3_16 = 1.600000000000000e+01; + if (pb==3 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p3_05,eps),"P3, #05");} + if (pb==3 && ti==16) {chk++; 
MFEM_VERIFY(rerr(nrm,p3_16,eps),"P3, #16");} + const double p4_05 = 6.873846376647157e+01; + const double p4_52 = 5.364489825441373e+01; + if (pb==4 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p4_05,eps),"P4, #05");} + if (pb==4 && ti==52) {chk++; MFEM_VERIFY(rerr(nrm,p4_52,eps),"P4, #52");} + const double p5_05 = 2.061984481890964e+01; + const double p5_36 = 2.114519664792607e+01; + if (pb==5 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p5_05,eps),"P5, #05");} + if (pb==5 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p5_36,eps),"P5, #36");} + const double p6_05 = 1.607988713996459e+01; + const double p6_36 = 1.662736010353023e+01; + if (pb==6 && ti==05) {chk++; MFEM_VERIFY(rerr(nrm,p6_05,eps),"P6, #05");} + if (pb==6 && ti==36) {chk++; MFEM_VERIFY(rerr(nrm,p6_36,eps),"P6, #36");} + } +} diff --git a/serial/laghos_assembly.cpp b/serial/laghos_assembly.cpp new file mode 100644 index 00000000..a9591540 --- /dev/null +++ b/serial/laghos_assembly.cpp @@ -0,0 +1,967 @@ +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +// reserved. See files LICENSE and NOTICE for details. +// +// This file is part of CEED, a collection of benchmarks, miniapps, software +// libraries and APIs for efficient high-order finite element and spectral +// element discretizations for exascale applications. For more information and +// source code availability see http://github.com/ceed. +// +// The CEED research is supported by the Exascale Computing Project (17-SC-20-SC) +// a collaborative effort of two U.S. Department of Energy organizations (Office +// of Science and the National Nuclear Security Administration) responsible for +// the planning and preparation of a capable exascale ecosystem, including +// software, applications, hardware, advanced system engineering and early +// testbed platforms, in support of the nation's exascale computing imperative. 
+ +#include "laghos_assembly.hpp" +#include + +namespace mfem +{ + +namespace hydrodynamics +{ + +void DensityIntegrator::AssembleRHSElementVect(const FiniteElement &fe, + ElementTransformation &Tr, + Vector &elvect) +{ + const int nqp = IntRule->GetNPoints(); + Vector shape(fe.GetDof()); + elvect.SetSize(fe.GetDof()); + elvect = 0.0; + for (int q = 0; q < nqp; q++) + { + fe.CalcShape(IntRule->IntPoint(q), shape); + // Note that rhoDetJ = rho0DetJ0. + shape *= qdata.rho0DetJ0w(Tr.ElementNo*nqp + q); + elvect += shape; + } +} + +void ForceIntegrator::AssembleElementMatrix2(const FiniteElement &trial_fe, + const FiniteElement &test_fe, + ElementTransformation &Tr, + DenseMatrix &elmat) +{ + const int e = Tr.ElementNo; + const int nqp = IntRule->GetNPoints(); + const int dim = trial_fe.GetDim(); + const int h1dofs_cnt = test_fe.GetDof(); + const int l2dofs_cnt = trial_fe.GetDof(); + elmat.SetSize(h1dofs_cnt*dim, l2dofs_cnt); + elmat = 0.0; + DenseMatrix vshape(h1dofs_cnt, dim), loc_force(h1dofs_cnt, dim); + Vector shape(l2dofs_cnt), Vloc_force(loc_force.Data(), h1dofs_cnt*dim); + for (int q = 0; q < nqp; q++) + { + const IntegrationPoint &ip = IntRule->IntPoint(q); + // Form stress:grad_shape at the current point. + test_fe.CalcDShape(ip, vshape); + for (int i = 0; i < h1dofs_cnt; i++) + { + for (int vd = 0; vd < dim; vd++) // Velocity components. + { + loc_force(i, vd) = 0.0; + for (int gd = 0; gd < dim; gd++) // Gradient components. 
+ { + const int eq = e*nqp + q; + const double stressJinvT = qdata.stressJinvT(vd)(eq, gd); + loc_force(i, vd) += stressJinvT * vshape(i,gd); + } + } + } + trial_fe.CalcShape(ip, shape); + AddMultVWt(Vloc_force, shape, elmat); + } +} + +MassPAOperator::MassPAOperator(FiniteElementSpace &fes, + const IntegrationRule &ir, + Coefficient &Q) : + Operator(fes.GetTrueVSize()), + dim(fes.GetMesh()->Dimension()), + NE(fes.GetMesh()->GetNE()), + vsize(fes.GetVSize()), + pabf(&fes), + ess_tdofs_count(0), + ess_tdofs(0) +{ + pabf.SetAssemblyLevel(AssemblyLevel::PARTIAL); + pabf.AddDomainIntegrator(new mfem::MassIntegrator(Q, &ir)); + pabf.Assemble(); + pabf.FormSystemMatrix(mfem::Array(), mass); +} + +void MassPAOperator::SetEssentialTrueDofs(Array &dofs) +{ + ess_tdofs_count = dofs.Size(); + if (ess_tdofs.Size() == 0) + { + ess_tdofs.SetSize(ess_tdofs_count); + } + if (ess_tdofs_count == 0) { return; } + ess_tdofs = dofs; +} + +void MassPAOperator::EliminateRHS(Vector &b) const +{ + if (ess_tdofs_count > 0) { b.SetSubVector(ess_tdofs, 0.0); } +} + +void MassPAOperator::Mult(const Vector &x, Vector &y) const +{ + mass->Mult(x, y); + if (ess_tdofs_count > 0) { y.SetSubVector(ess_tdofs, 0.0); } +} + +ForcePAOperator::ForcePAOperator(const QuadratureData &qdata, + FiniteElementSpace &h1, + FiniteElementSpace &l2, + const IntegrationRule &ir) : + Operator(), + dim(h1.GetMesh()->Dimension()), + NE(h1.GetMesh()->GetNE()), + qdata(qdata), + H1(h1), + L2(l2), + H1R(H1.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + L2R(L2.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + ir1D(IntRules.Get(Geometry::SEGMENT, ir.GetOrder())), + D1D(H1.GetFE(0)->GetOrder()+1), + Q1D(ir1D.GetNPoints()), + L1D(L2.GetFE(0)->GetOrder()+1), + H1sz(H1.GetVDim() * H1.GetFE(0)->GetDof() * NE), + L2sz(L2.GetFE(0)->GetDof() * NE), + L2D2Q(&L2.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)), + H1D2Q(&H1.GetFE(0)->GetDofToQuad(ir, DofToQuad::TENSOR)), + X(L2sz), Y(H1sz) { } + +template static 
+void ForceMult2D(const int NE, + const Array &B_, + const Array &Bt_, + const Array &Gt_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) +{ + auto b = Reshape(B_.Read(), Q1D, L1D); + auto bt = Reshape(Bt_.Read(), D1D, Q1D); + auto gt = Reshape(Gt_.Read(), D1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, NE, DIM, DIM); + auto energy = Reshape(x.Read(), L1D, L1D, NE); + const double eps1 = std::numeric_limits::epsilon(); + const double eps2 = eps1*eps1; + auto velocity = Reshape(y.Write(), D1D, D1D, DIM, NE); + + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + const int z = MFEM_THREAD_ID(z); + + MFEM_SHARED double B[Q1D][L1D]; + MFEM_SHARED double Bt[D1D][Q1D]; + MFEM_SHARED double Gt[D1D][Q1D]; + + MFEM_SHARED double Ez[NBZ][L1D][L1D]; + double (*E)[L1D] = (double (*)[L1D])(Ez + z); + + MFEM_SHARED double LQz[2][NBZ][D1D][Q1D]; + double (*LQ0)[Q1D] = (double (*)[Q1D])(LQz[0] + z); + double (*LQ1)[Q1D] = (double (*)[Q1D])(LQz[1] + z); + + MFEM_SHARED double QQz[3][NBZ][Q1D][Q1D]; + double (*QQ)[Q1D] = (double (*)[Q1D])(QQz[0] + z); + double (*QQ0)[Q1D] = (double (*)[Q1D])(QQz[1] + z); + double (*QQ1)[Q1D] = (double (*)[Q1D])(QQz[2] + z); + + if (z == 0) + { + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(l,y,Q1D) + { + if (l < L1D) { B[q][l] = b(q,l); } + if (l < D1D) { Bt[l][q] = bt(l,q); } + if (l < D1D) { Gt[l][q] = gt(l,q); } + } + } + } + MFEM_SYNC_THREAD; + + MFEM_FOREACH_THREAD(lx,x,L1D) + { + MFEM_FOREACH_THREAD(ly,y,L1D) + { + E[lx][ly] = energy(lx,ly,e); + } + } + MFEM_SYNC_THREAD; + + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + for (int lx = 0; lx < L1D; ++lx) + { + u += B[qx][lx] * E[lx][ly]; + } + LQ0[ly][qx] = u; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + for (int ly = 0; ly < L1D; ++ly) + { + u += B[qy][ly] * LQ0[ly][qx]; + } + 
QQ[qy][qx] = u; + } + } + MFEM_SYNC_THREAD; + + for (int c = 0; c < DIM; ++c) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQ[qy][qx] * sJit(qx,qy,e,0,c); + const double esy = QQ[qy][qx] * sJit(qx,qy,e,1,c); + QQ0[qy][qx] = esx; + QQ1[qy][qx] = esy; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(dx,x,D1D) + { + double u = 0.0; + double v = 0.0; + for (int qx = 0; qx < Q1D; ++qx) + { + u += Gt[dx][qx] * QQ0[qy][qx]; + v += Bt[dx][qx] * QQ1[qy][qx]; + } + LQ0[dx][qy] = u; + LQ1[dx][qy] = v; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(dx,x,D1D) + { + double u = 0.0; + double v = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += LQ0[dx][qy] * Bt[dy][qy]; + v += LQ1[dx][qy] * Gt[dy][qy]; + } + velocity(dx,dy,c,e) = u + v; + } + } + MFEM_SYNC_THREAD; + } + for (int c = 0; c < DIM; ++c) + { + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(dx,x,D1D) + { + const double v = velocity(dx,dy,c,e); + if (fabs(v) < eps2) + { + velocity(dx,dy,c,e) = 0.0; + } + } + } + MFEM_SYNC_THREAD; + } + }); +} + +template static +void ForceMult3D(const int NE, + const Array &B_, + const Array &Bt_, + const Array &Gt_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) +{ + auto b = Reshape(B_.Read(), Q1D, L1D); + auto bt = Reshape(Bt_.Read(), D1D, Q1D); + auto gt = Reshape(Gt_.Read(), D1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, Q1D, NE, DIM, DIM); + auto energy = Reshape(x.Read(), L1D, L1D, L1D, NE); + const double eps1 = std::numeric_limits::epsilon(); + const double eps2 = eps1*eps1; + auto velocity = Reshape(y.Write(), D1D, D1D, D1D, DIM, NE); + + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + const int z = MFEM_THREAD_ID(z); + + MFEM_SHARED double B[Q1D][L1D]; + MFEM_SHARED double Bt[D1D][Q1D]; + MFEM_SHARED double Gt[D1D][Q1D]; + + MFEM_SHARED double 
E[L1D][L1D][L1D]; + + MFEM_SHARED double sm0[3][Q1D*Q1D*Q1D]; + MFEM_SHARED double sm1[3][Q1D*Q1D*Q1D]; + + double (*MMQ0)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+0); + double (*MMQ1)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+1); + double (*MMQ2)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+2); + + double (*MQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+0); + double (*MQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+1); + double (*MQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+2); + + MFEM_SHARED double QQQ[Q1D][Q1D][Q1D]; + double (*QQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+0); + double (*QQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+1); + double (*QQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+2); + + if (z == 0) + { + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(l,y,Q1D) + { + if (l < L1D) { B[q][l] = b(q,l); } + if (l < D1D) { Bt[l][q] = bt(l,q); } + if (l < D1D) { Gt[l][q] = gt(l,q); } + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lx,x,L1D) + { + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(lz,z,L1D) + { + E[lx][ly][lz] = energy(lx,ly,lz,e); + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) + { + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + for (int lx = 0; lx < L1D; ++lx) + { + u += B[qx][lx] * E[lx][ly][lz]; + } + MMQ0[lz][ly][qx] = u; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + for (int ly = 0; ly < L1D; ++ly) + { + u += B[qy][ly] * MMQ0[lz][ly][qx]; + } + MQQ0[lz][qy][qx] = u; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + for (int lz = 0; lz < L1D; ++lz) + { + u += B[qz][lz] * MQQ0[lz][qy][qx]; + } + QQQ[qz][qy][qx] = u; + } + } + } + MFEM_SYNC_THREAD; + for (int c = 0; c < 3; ++c) + { + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) 
+ { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,0,c); + const double esy = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,1,c); + const double esz = QQQ[qz][qy][qx] * sJit(qx,qy,qz,e,2,c); + QQQ0[qz][qy][qx] = esx; + QQQ1[qz][qy][qx] = esy; + QQQ2[qz][qy][qx] = esz; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(hx,x,D1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qx = 0; qx < Q1D; ++qx) + { + u += Gt[hx][qx] * QQQ0[qz][qy][qx]; + v += Bt[hx][qx] * QQQ1[qz][qy][qx]; + w += Bt[hx][qx] * QQQ2[qz][qy][qx]; + } + MQQ0[hx][qy][qz] = u; + MQQ1[hx][qy][qz] = v; + MQQ2[hx][qy][qz] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(hy,y,D1D) + { + MFEM_FOREACH_THREAD(hx,x,D1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += MQQ0[hx][qy][qz] * Bt[hy][qy]; + v += MQQ1[hx][qy][qz] * Gt[hy][qy]; + w += MQQ2[hx][qy][qz] * Bt[hy][qy]; + } + MMQ0[hx][hy][qz] = u; + MMQ1[hx][hy][qz] = v; + MMQ2[hx][hy][qz] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(hz,z,D1D) + { + MFEM_FOREACH_THREAD(hy,y,D1D) + { + MFEM_FOREACH_THREAD(hx,x,D1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int qz = 0; qz < Q1D; ++qz) + { + u += MMQ0[hx][hy][qz] * Bt[hz][qz]; + v += MMQ1[hx][hy][qz] * Bt[hz][qz]; + w += MMQ2[hx][hy][qz] * Gt[hz][qz]; + } + velocity(hx,hy,hz,c,e) = u + v + w; + } + } + } + MFEM_SYNC_THREAD; + } + for (int c = 0; c < 3; ++c) + { + MFEM_FOREACH_THREAD(hz,z,D1D) + { + MFEM_FOREACH_THREAD(hy,y,D1D) + { + MFEM_FOREACH_THREAD(hx,x,D1D) + { + const double v = velocity(hx,hy,hz,c,e); + if (fabs(v) < eps2) + { + velocity(hx,hy,hz,c,e) = 0.0; + } + } + } + } + MFEM_SYNC_THREAD; + } + }); +} + +typedef void (*fForceMult)(const int E, + const Array &B, + const Array &Bt, + const Array &Gt, + const DenseTensor &stressJinvT, + 
const Vector &X, Vector &Y); + +static void ForceMult(const int DIM, const int D1D, const int Q1D, + const int L1D, const int H1D, const int NE, + const Array &B, + const Array &Bt, + const Array &Gt, + const DenseTensor &stressJinvT, + const Vector &e, + Vector &v) +{ + MFEM_VERIFY(D1D==H1D, "D1D!=H1D"); + MFEM_VERIFY(L1D==D1D-1,"L1D!=D1D-1"); + const int id = ((DIM)<<8)|(D1D)<<4|(Q1D); + static std::unordered_map call = + { + // 2D + {0x234,&ForceMult2D<2,3,4,2>}, + {0x246,&ForceMult2D<2,4,6,3>}, + {0x258,&ForceMult2D<2,5,8,4>}, + // 3D + {0x334,&ForceMult3D<3,3,4,2>}, + {0x346,&ForceMult3D<3,4,6,3>}, + {0x358,&ForceMult3D<3,5,8,4>}, + }; + if (!call[id]) + { + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown kernel"); + } + call[id](NE, B, Bt, Gt, stressJinvT, e, v); +} + +void ForcePAOperator::Mult(const Vector &x, Vector &y) const +{ + if (L2R) { L2R->Mult(x, X); } + else { X = x; } + ForceMult(dim, D1D, Q1D, L1D, D1D, NE, + L2D2Q->B, H1D2Q->Bt, H1D2Q->Gt, + qdata.stressJinvT, X, Y); + H1R->MultTranspose(Y, y); +} + +template static +void ForceMultTranspose2D(const int NE, + const Array &Bt_, + const Array &B_, + const Array &G_, + const DenseTensor &sJit_, + const Vector &x, Vector &y) +{ + auto b = Reshape(B_.Read(), Q1D, D1D); + auto g = Reshape(G_.Read(), Q1D, D1D); + auto bt = Reshape(Bt_.Read(), L1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, NE, DIM, DIM); + auto velocity = Reshape(x.Read(), D1D, D1D, DIM, NE); + auto energy = Reshape(y.Write(), L1D, L1D, NE); + + MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ, + { + const int z = MFEM_THREAD_ID(z); + + MFEM_SHARED double Bt[L1D][Q1D]; + MFEM_SHARED double B[Q1D][D1D]; + MFEM_SHARED double G[Q1D][D1D]; + + MFEM_SHARED double Vz[NBZ][D1D*D1D]; + double (*V)[D1D] = (double (*)[D1D])(Vz + z); + + MFEM_SHARED double DQz[DIM][NBZ][D1D*Q1D]; + double (*DQ0)[Q1D] = (double (*)[Q1D])(DQz[0] + z); + 
double (*DQ1)[Q1D] = (double (*)[Q1D])(DQz[1] + z); + + MFEM_SHARED double QQz[3][NBZ][Q1D*Q1D]; + double (*QQ)[Q1D] = (double (*)[Q1D])(QQz[0] + z); + double (*QQ0)[Q1D] = (double (*)[Q1D])(QQz[1] + z); + double (*QQ1)[Q1D] = (double (*)[Q1D])(QQz[2] + z); + + MFEM_SHARED double QLz[NBZ][Q1D*L1D]; + double (*QL)[L1D] = (double (*)[L1D]) (QLz + z); + + if (z == 0) + { + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(h,y,Q1D) + { + if (h < D1D) { B[q][h] = b(q,h); } + if (h < D1D) { G[q][h] = g(q,h); } + const int l = h; + if (l < L1D) { Bt[l][q] = bt(l,q); } + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + QQ[qy][qx] = 0.0; + } + } + MFEM_SYNC_THREAD; + + for (int c = 0; c < DIM; ++c) + { + + MFEM_FOREACH_THREAD(dx,x,D1D) + { + MFEM_FOREACH_THREAD(dy,y,D1D) + { + V[dx][dy] = velocity(dx,dy,c,e); + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + for (int dx = 0; dx < D1D; ++dx) + { + const double input = V[dx][dy]; + u += B[qx][dx] * input; + v += G[qx][dx] * input; + } + DQ0[dy][qx] = u; + DQ1[dy][qx] = v; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + for (int dy = 0; dy < D1D; ++dy) + { + u += DQ1[dy][qx] * B[qy][dy]; + v += DQ0[dy][qx] * G[qy][dy]; + } + QQ0[qy][qx] = u; + QQ1[qy][qx] = v; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQ0[qy][qx] * sJit(qx,qy,e,0,c); + const double esy = QQ1[qy][qx] * sJit(qx,qy,e,1,c); + QQ[qy][qx] += esx + esy; + } + } + MFEM_SYNC_THREAD; + } + MFEM_SYNC_THREAD; + + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qx = 0; qx < Q1D; ++qx) + { + u += QQ[qy][qx] * Bt[lx][qx]; + } + QL[qy][lx] = u; + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(ly,y,L1D) + { + 
MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += QL[qy][lx] * Bt[ly][qy]; + } + energy(lx,ly,e) = u; + } + } + MFEM_SYNC_THREAD; + }); +} + +template static +void ForceMultTranspose3D(const int NE, + const Array &Bt_, + const Array &B_, + const Array &G_, + const DenseTensor &sJit_, + const Vector &v_, + Vector &e_) +{ + auto b = Reshape(B_.Read(), Q1D, D1D); + auto g = Reshape(G_.Read(), Q1D, D1D); + auto bt = Reshape(Bt_.Read(), L1D, Q1D); + const double *StressJinvT = Read(sJit_.GetMemory(), Q1D*Q1D*Q1D*NE*DIM*DIM); + auto sJit = Reshape(StressJinvT, Q1D, Q1D, Q1D, NE, DIM, DIM); + auto velocity = Reshape(v_.Read(), D1D, D1D, D1D, DIM, NE); + auto energy = Reshape(e_.Write(), L1D, L1D, L1D, NE); + + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + const int z = MFEM_THREAD_ID(z); + + MFEM_SHARED double Bt[L1D][Q1D]; + MFEM_SHARED double B[Q1D][D1D]; + MFEM_SHARED double G[Q1D][D1D]; + + MFEM_SHARED double sm0[3][Q1D*Q1D*Q1D]; + MFEM_SHARED double sm1[3][Q1D*Q1D*Q1D]; + double (*V)[D1D][D1D] = (double (*)[D1D][D1D]) (sm0+0); + double (*MMQ0)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+1); + double (*MMQ1)[D1D][Q1D] = (double (*)[D1D][Q1D]) (sm0+2); + + double (*MQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+0); + double (*MQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+1); + double (*MQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm1+2); + + double (*QQQ0)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+0); + double (*QQQ1)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+1); + double (*QQQ2)[Q1D][Q1D] = (double (*)[Q1D][Q1D]) (sm0+2); + + MFEM_SHARED double QQQ[Q1D][Q1D][Q1D]; + + if (z == 0) + { + MFEM_FOREACH_THREAD(q,x,Q1D) + { + MFEM_FOREACH_THREAD(h,y,Q1D) + { + if (h < D1D) { B[q][h] = b(q,h); } + if (h < D1D) { G[q][h] = g(q,h); } + const int l = h; + if (l < L1D) { Bt[l][q] = bt(l,q); } + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + 
QQQ[qz][qy][qx] = 0.0; + } + } + } + MFEM_SYNC_THREAD; + + for (int c = 0; c < DIM; ++c) + { + MFEM_FOREACH_THREAD(dx,x,D1D) + { + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(dz,z,D1D) + { + V[dx][dy][dz] = velocity(dx,dy,dz,c,e); + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dz,z,D1D) + { + MFEM_FOREACH_THREAD(dy,y,D1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + for (int dx = 0; dx < D1D; ++dx) + { + const double input = V[dx][dy][dz]; + u += G[qx][dx] * input; + v += B[qx][dx] * input; + } + MMQ0[dz][dy][qx] = u; + MMQ1[dz][dy][qx] = v; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(dz,z,D1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int dy = 0; dy < D1D; ++dy) + { + u += MMQ0[dz][dy][qx] * B[qy][dy]; + v += MMQ1[dz][dy][qx] * G[qy][dy]; + w += MMQ1[dz][dy][qx] * B[qy][dy]; + } + MQQ0[dz][qy][qx] = u; + MQQ1[dz][qy][qx] = v; + MQQ2[dz][qy][qx] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + double u = 0.0; + double v = 0.0; + double w = 0.0; + for (int dz = 0; dz < D1D; ++dz) + { + u += MQQ0[dz][qy][qx] * B[qz][dz]; + v += MQQ1[dz][qy][qx] * B[qz][dz]; + w += MQQ2[dz][qy][qx] * G[qz][dz]; + } + QQQ0[qz][qy][qx] = u; + QQQ1[qz][qy][qx] = v; + QQQ2[qz][qy][qx] = w; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const double esx = QQQ0[qz][qy][qx] * sJit(qx,qy,qz,e,0,c); + const double esy = QQQ1[qz][qy][qx] * sJit(qx,qy,qz,e,1,c); + const double esz = QQQ2[qz][qy][qx] * sJit(qx,qy,qz,e,2,c); + QQQ[qz][qy][qx] += esx + esy + esz; + } + } + } + MFEM_SYNC_THREAD; + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qx 
= 0; qx < Q1D; ++qx) + { + u += QQQ[qz][qy][qx] * Bt[lx][qx]; + } + MQQ0[qz][qy][lx] = u; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qy = 0; qy < Q1D; ++qy) + { + u += MQQ0[qz][qy][lx] * Bt[ly][qy]; + } + MMQ0[qz][ly][lx] = u; + } + } + } + MFEM_SYNC_THREAD; + MFEM_FOREACH_THREAD(lz,z,L1D) + { + MFEM_FOREACH_THREAD(ly,y,L1D) + { + MFEM_FOREACH_THREAD(lx,x,L1D) + { + double u = 0.0; + for (int qz = 0; qz < Q1D; ++qz) + { + u += MMQ0[qz][ly][lx] * Bt[lz][qz]; + } + energy(lx,ly,lz,e) = u; + } + } + } + MFEM_SYNC_THREAD; + }); +} + +typedef void (*fForceMultTranspose)(const int NE, + const Array &Bt, + const Array &B, + const Array &G, + const DenseTensor &sJit, + const Vector &X, Vector &Y); + +static void ForceMultTranspose(const int DIM, const int D1D, const int Q1D, + const int L1D, const int NE, + const Array &L2Bt, + const Array &H1B, + const Array &H1G, + const DenseTensor &stressJinvT, + const Vector &v, + Vector &e) +{ + // DIM, D1D, Q1D, L1D(=D1D-1) + MFEM_VERIFY(L1D==D1D-1, "L1D!=D1D-1"); + const int id = ((DIM)<<8)|(D1D)<<4|(Q1D); + static std::unordered_map call = + { + {0x234,&ForceMultTranspose2D<2,3,4,2>}, + {0x246,&ForceMultTranspose2D<2,4,6,3>}, + {0x258,&ForceMultTranspose2D<2,5,8,4>}, + {0x334,&ForceMultTranspose3D<3,3,4,2>}, + {0x346,&ForceMultTranspose3D<3,4,6,3>}, + {0x358,&ForceMultTranspose3D<3,5,8,4>} + }; + if (!call[id]) + { + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown kernel"); + } + call[id](NE, L2Bt, H1B, H1G, stressJinvT, v, e); +} + +void ForcePAOperator::MultTranspose(const Vector &x, Vector &y) const +{ + H1R->Mult(x, Y); + ForceMultTranspose(dim, D1D, Q1D, L1D, NE, + L2D2Q->Bt, H1D2Q->B, H1D2Q->G, + qdata.stressJinvT, Y, X); + if (L2R) { L2R->MultTranspose(X, y); } + else { y = X; } +} + +} // namespace hydrodynamics + +} // namespace mfem diff --git 
a/serial/laghos_assembly.hpp b/serial/laghos_assembly.hpp new file mode 100644 index 00000000..34c290b2 --- /dev/null +++ b/serial/laghos_assembly.hpp @@ -0,0 +1,135 @@ +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at +// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights +// reserved. See files LICENSE and NOTICE for details. +// +// This file is part of CEED, a collection of benchmarks, miniapps, software +// libraries and APIs for efficient high-order finite element and spectral +// element discretizations for exascale applications. For more information and +// source code availability see http://github.com/ceed. +// +// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, +// a collaborative effort of two U.S. Department of Energy organizations (Office +// of Science and the National Nuclear Security Administration) responsible for +// the planning and preparation of a capable exascale ecosystem, including +// software, applications, hardware, advanced system engineering and early +// testbed platforms, in support of the nation's exascale computing imperative. + +#ifndef MFEM_LAGHOS_ASSEMBLY +#define MFEM_LAGHOS_ASSEMBLY + +#include "mfem.hpp" +#include "general/forall.hpp" +#include "linalg/dtensor.hpp" + +namespace mfem +{ + +namespace hydrodynamics +{ + +// Container for all data needed at quadrature points. +struct QuadratureData +{ + // Reference to physical Jacobian for the initial mesh. + // These are computed only at time zero and stored here. + DenseTensor Jac0inv; + + // Quadrature data used for full/partial assembly of the force operator. + // At each quadrature point, it combines the stress, inverse Jacobian, + // determinant of the Jacobian and the integration weight. + // It must be recomputed in every time step. + DenseTensor stressJinvT; + + // Quadrature data used for full/partial assembly of the mass matrices. 
+ // At time zero, we compute and store (rho0 * det(J0) * qp_weight) at each + // quadrature point. Note the at any other time, we can compute + // rho = rho0 * det(J0) / det(J), representing the notion of pointwise mass + // conservation. + Vector rho0DetJ0w; + + // Initial length scale. This represents a notion of local mesh size. + // We assume that all initial zones have similar size. + double h0; + + // Estimate of the minimum time step over all quadrature points. This is + // recomputed at every time step to achieve adaptive time stepping. + double dt_est; + + QuadratureData(int dim, int NE, int quads_per_el) + : Jac0inv(dim, dim, NE * quads_per_el), + stressJinvT(NE * quads_per_el, dim, dim), + rho0DetJ0w(NE * quads_per_el) { } +}; + +// This class is used only for visualization. It assembles (rho, phi) in each +// zone, which is used by LagrangianHydroOperator::ComputeDensity to do an L2 +// projection of the density. +class DensityIntegrator : public LinearFormIntegrator +{ + using LinearFormIntegrator::AssembleRHSElementVect; +private: + const QuadratureData &qdata; + +public: + DensityIntegrator(QuadratureData &qdata) : qdata(qdata) { } + virtual void AssembleRHSElementVect(const FiniteElement &fe, + ElementTransformation &Tr, + Vector &elvect); +}; + +// Performs full assembly for the force operator. +class ForceIntegrator : public BilinearFormIntegrator +{ +private: + const QuadratureData &qdata; +public: + ForceIntegrator(QuadratureData &qdata) : qdata(qdata) { } + virtual void AssembleElementMatrix2(const FiniteElement &trial_fe, + const FiniteElement &test_fe, + ElementTransformation &Tr, + DenseMatrix &elmat); +}; + +// Performs partial assembly for the force operator. 
+class ForcePAOperator : public Operator +{ +private: + const int dim, NE; + const QuadratureData &qdata; + const FiniteElementSpace &H1, &L2; + const Operator *H1R, *L2R; + const IntegrationRule &ir1D; + const int D1D, Q1D, L1D, H1sz, L2sz; + const DofToQuad *L2D2Q, *H1D2Q; + mutable Vector X, Y; +public: + ForcePAOperator(const QuadratureData&, + FiniteElementSpace&, + FiniteElementSpace&, + const IntegrationRule&); + virtual void Mult(const Vector&, Vector&) const; + virtual void MultTranspose(const Vector&, Vector&) const; +}; + +// Performs partial assembly for the velocity mass matrix. +class MassPAOperator : public Operator +{ +private: + const int dim, NE, vsize; + BilinearForm pabf; + int ess_tdofs_count; + Array ess_tdofs; + OperatorPtr mass; +public: + MassPAOperator(FiniteElementSpace&, const IntegrationRule&, Coefficient&); + virtual void Mult(const Vector&, Vector&) const; + virtual void SetEssentialTrueDofs(Array&); + virtual void EliminateRHS(Vector&) const; + const BilinearForm &GetBF() const { return pabf; } +}; + +} // namespace hydrodynamics + +} // namespace mfem + +#endif // MFEM_LAGHOS_ASSEMBLY diff --git a/serial/laghos_solver.cpp b/serial/laghos_solver.cpp index 59dcb3c4..fb1877a8 100644 --- a/serial/laghos_solver.cpp +++ b/serial/laghos_solver.cpp @@ -14,9 +14,10 @@ // software, applications, hardware, advanced system engineering and early // testbed platforms, in support of the nation's exascale computing imperative. 
+#include "general/forall.hpp" #include "laghos_solver.hpp" - -using namespace std; +#include "linalg/kernels.hpp" +#include namespace mfem { @@ -28,6 +29,7 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, GridFunction &gf, const char *title, int x, int y, int w, int h, bool vec) { + gf.HostRead(); Mesh &mesh = *gf.FESpace()->GetMesh(); bool newly_opened = false; int connection_failed; @@ -42,6 +44,7 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, } sock << "solution\n"; + mesh.Print(sock); gf.Save(sock); @@ -52,7 +55,7 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, << x << " " << y << " " << w << " " << h << "\n" << "keys maaAc"; if ( vec ) { sock << "vvv"; } - sock << endl; + sock << std::endl; } connection_failed = !sock && !newly_opened; @@ -60,122 +63,290 @@ void VisualizeField(socketstream &sock, const char *vishost, int visport, while (connection_failed); } -LagrangianHydroOperator::LagrangianHydroOperator(int size, - FiniteElementSpace &h1_fes, - FiniteElementSpace &l2_fes, - Array &essential_tdofs, - GridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, - bool visc, bool pa, - double cgt, int cgiter, - int h1_basis_type) - : TimeDependentOperator(size), - H1FESpace(h1_fes), L2FESpace(l2_fes), - ess_tdofs(essential_tdofs), - dim(h1_fes.GetMesh()->Dimension()), - nzones(h1_fes.GetMesh()->GetNE()), - l2dofs_cnt(l2_fes.GetFE(0)->GetDof()), - h1dofs_cnt(h1_fes.GetFE(0)->GetDof()), - source_type(source_type_), cfl(cfl_), - use_viscosity(visc), p_assembly(pa), cg_rel_tol(cgt), cg_max_iter(cgiter), - material_pcf(material_), - Mv(&h1_fes), Mv_spmat_copy(), - Me(l2dofs_cnt, l2dofs_cnt, nzones), Me_inv(l2dofs_cnt, l2dofs_cnt, nzones), - integ_rule(IntRules.Get(h1_fes.GetMesh()->GetElementBaseGeometry(0), - 3*h1_fes.GetOrder(0) + l2_fes.GetOrder(0) - 1)), - quad_data(dim, nzones, integ_rule.GetNPoints()), - quad_data_is_current(false), 
forcemat_is_assembled(false), - tensors1D(H1FESpace.GetFE(0)->GetOrder(), L2FESpace.GetFE(0)->GetOrder(), - int(floor(0.7 + pow(integ_rule.GetNPoints(), 1.0 / dim))), - h1_basis_type == BasisType::Positive), - evaluator(H1FESpace, &tensors1D), - Force(&l2_fes, &h1_fes), ForcePA(&quad_data, h1_fes, l2_fes, &tensors1D), - VMassPA(&quad_data, H1FESpace, &tensors1D), VMassPA_prec(H1FESpace), - locEMassPA(&quad_data, l2_fes, &tensors1D), - locCG(), timer() +static void Rho0DetJ0Vol(const int dim, const int NE, + const IntegrationRule &ir, Mesh *mesh, + FiniteElementSpace &L2, + const GridFunction &rho0, + QuadratureData &qdata, + double &volume) { - GridFunctionCoefficient rho_coeff(&rho0); - - // Standard local assembly and inversion for energy mass matrices. - MassIntegrator mi(rho_coeff, &integ_rule); - for (int i = 0; i < nzones; i++) + const int NQ = ir.GetNPoints(); + const int Q1D = IntRules.Get(Geometry::SEGMENT,ir.GetOrder()).GetNPoints(); + const int flags = GeometricFactors::JACOBIANS|GeometricFactors::DETERMINANTS; + const GeometricFactors *geom = mesh->GetGeometricFactors(ir, flags); + Vector rho0Q(NQ*NE); + rho0Q.UseDevice(true); + Vector j, detj; + const QuadratureInterpolator *qi = L2.GetQuadratureInterpolator(ir); + qi->Mult(rho0, QuadratureInterpolator::VALUES, rho0Q, j, detj); + auto W = ir.GetWeights().Read(); + auto R = Reshape(rho0Q.Read(), NQ, NE); + auto J = Reshape(geom->J.Read(), NQ, dim, dim, NE); + auto detJ = Reshape(geom->detJ.Read(), NQ, NE); + auto V = Reshape(qdata.rho0DetJ0w.Write(), NQ, NE); + Memory &Jinv_m = qdata.Jac0inv.GetMemory(); + const MemoryClass mc = Device::GetMemoryClass(); + const int Ji_total_size = qdata.Jac0inv.TotalSize(); + auto invJ = Reshape(Jinv_m.Write(mc, Ji_total_size), dim, dim, NQ, NE); + Vector vol(NE*NQ), one(NE*NQ); + auto A = Reshape(vol.Write(), NQ, NE); + auto O = Reshape(one.Write(), NQ, NE); + MFEM_ASSERT(dim==2 || dim==3, ""); + if (dim==2) { - DenseMatrixInverse inv(&Me(i)); - 
mi.AssembleElementMatrix(*l2_fes.GetFE(i), - *l2_fes.GetElementTransformation(i), Me(i)); - inv.Factor(); - inv.GetInverseMatrix(Me_inv(i)); + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const int q = qx + qy * Q1D; + const double J11 = J(q,0,0,e); + const double J12 = J(q,1,0,e); + const double J21 = J(q,0,1,e); + const double J22 = J(q,1,1,e); + const double det = detJ(q,e); + V(q,e) = W[q] * R(q,e) * det; + const double r_idetJ = 1.0 / det; + invJ(0,0,q,e) = J22 * r_idetJ; + invJ(1,0,q,e) = -J12 * r_idetJ; + invJ(0,1,q,e) = -J21 * r_idetJ; + invJ(1,1,q,e) = J11 * r_idetJ; + A(q,e) = W[q] * det; + O(q,e) = 1.0; + } + } + }); + } + else + { + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + const int q = qx + (qy + qz * Q1D) * Q1D; + const double J11 = J(q,0,0,e), J12 = J(q,0,1,e), J13 = J(q,0,2,e); + const double J21 = J(q,1,0,e), J22 = J(q,1,1,e), J23 = J(q,1,2,e); + const double J31 = J(q,2,0,e), J32 = J(q,2,1,e), J33 = J(q,2,2,e); + const double det = detJ(q,e); + V(q,e) = W[q] * R(q,e) * det; + const double r_idetJ = 1.0 / det; + invJ(0,0,q,e) = r_idetJ * ((J22 * J33)-(J23 * J32)); + invJ(1,0,q,e) = r_idetJ * ((J32 * J13)-(J33 * J12)); + invJ(2,0,q,e) = r_idetJ * ((J12 * J23)-(J13 * J22)); + invJ(0,1,q,e) = r_idetJ * ((J23 * J31)-(J21 * J33)); + invJ(1,1,q,e) = r_idetJ * ((J33 * J11)-(J31 * J13)); + invJ(2,1,q,e) = r_idetJ * ((J13 * J21)-(J11 * J23)); + invJ(0,2,q,e) = r_idetJ * ((J21 * J32)-(J22 * J31)); + invJ(1,2,q,e) = r_idetJ * ((J31 * J12)-(J32 * J11)); + invJ(2,2,q,e) = r_idetJ * ((J11 * J22)-(J12 * J21)); + A(q,e) = W[q] * det; + O(q,e) = 1.0; + } + } + } + }); } + qdata.rho0DetJ0w.HostRead(); + volume = vol * one; +} + +LagrangianHydroOperator::LagrangianHydroOperator(const int size, + FiniteElementSpace &h1, + FiniteElementSpace &l2, + const Array &ess_tdofs, + Coefficient 
&rho0_coeff, + GridFunction &rho0_gf, + Coefficient &gamma_coeff, + GridFunction &gamma_gf, + const int source, + const double cfl, + const bool visc, + const bool p_assembly, + const double cgt, + const int cgiter, + double ftz, + const int oq) : + TimeDependentOperator(size), H1(h1), L2(l2), + H1c(H1.GetMesh(), H1.FEColl(), 1), + mesh(H1.GetMesh()), + H1Vsize(H1.GetVSize()), + H1TVSize(H1.GetTrueVSize()), + L2Vsize(L2.GetVSize()), + L2TVSize(L2.GetTrueVSize()), + block_offsets(4), + x_gf(&H1), + ess_tdofs(ess_tdofs), + dim(mesh->Dimension()), + NE(mesh->GetNE()), + l2dofs_cnt(L2.GetFE(0)->GetDof()), + h1dofs_cnt(H1.GetFE(0)->GetDof()), + source_type(source), cfl(cfl), + use_viscosity(visc), + p_assembly(p_assembly), + cg_rel_tol(cgt), cg_max_iter(cgiter),ftz_tol(ftz), + gamma_coeff(gamma_coeff), + gamma_gf(gamma_gf), + Mv(&H1), Mv_spmat_copy(), + Me(l2dofs_cnt, l2dofs_cnt, NE), + Me_inv(l2dofs_cnt, l2dofs_cnt, NE), + ir(IntRules.Get(mesh->GetElementBaseGeometry(0), + (oq > 0) ? oq : 3 * H1.GetOrder(0) + L2.GetOrder(0) - 1)), + Q1D(int(floor(0.7 + pow(ir.GetNPoints(), 1.0 / dim)))), + qdata(dim, NE, ir.GetNPoints()), + qdata_is_current(false), + forcemat_is_assembled(false), + Force(&L2, &H1), + ForcePA(nullptr), VMassPA(nullptr), EMassPA(nullptr), + VMassPA_Jprec(nullptr), + CG_VMass(), + CG_EMass(), + timer(p_assembly ? L2TVSize : 1), + qupdate(dim, NE, Q1D, visc, cfl, &timer, gamma_gf, ir, H1, L2), + X(H1c.GetTrueVSize()), + B(H1c.GetTrueVSize()), + one(L2Vsize), + rhs(H1Vsize), + e_rhs(L2Vsize), + rhs_c_gf(&H1c), + dvc_gf(&H1c) +{ + block_offsets[0] = 0; + block_offsets[1] = block_offsets[0] + H1Vsize; + block_offsets[2] = block_offsets[1] + H1Vsize; + block_offsets[3] = block_offsets[2] + L2Vsize; + one.UseDevice(true); + one = 1.0; - // Standard assembly for the velocity mass matrix. 
- VectorMassIntegrator *vmi = new VectorMassIntegrator(rho_coeff, &integ_rule); - Mv.AddDomainIntegrator(vmi); - Mv.Assemble(); - Mv_spmat_copy = Mv.SpMat(); + if (p_assembly) + { + ForcePA = new ForcePAOperator(qdata, H1, L2, ir); + VMassPA = new MassPAOperator(H1c, ir, rho0_coeff); + EMassPA = new MassPAOperator(L2, ir, rho0_coeff); + // Inside the above constructors for mass, there is reordering of the mesh + // nodes which is performed on the host. Since the mesh nodes are a + // subvector, so we need to sync with the rest of the base vector (which + // is assumed to be in the memory space used by the mfem::Device). + H1.GetMesh()->GetNodes()->ReadWrite(); + // Attributes 1/2/3 correspond to fixed-x/y/z boundaries, i.e., + // we must enforce v_x/y/z = 0 for the velocity components. + const int bdr_attr_max = H1.GetMesh()->bdr_attributes.Max(); + Array ess_bdr(bdr_attr_max); + for (int c = 0; c < dim; c++) + { + ess_bdr = 0; + ess_bdr[c] = 1; + H1c.GetEssentialTrueDofs(ess_bdr, c_tdofs[c]); + c_tdofs[c].Read(); + } + X.UseDevice(true); + B.UseDevice(true); + rhs.UseDevice(true); + e_rhs.UseDevice(true); + } + else + { + // Standard local assembly and inversion for energy mass matrices. + // 'Me' is used in the computation of the internal energy + // which is used twice: once at the start and once at the end of the run. + MassIntegrator mi(rho0_coeff, &ir); + for (int e = 0; e < NE; e++) + { + DenseMatrixInverse inv(&Me(e)); + const FiniteElement &fe = *L2.GetFE(e); + ElementTransformation &Tr = *L2.GetElementTransformation(e); + mi.AssembleElementMatrix(fe, Tr, Me(e)); + inv.Factor(); + inv.GetInverseMatrix(Me_inv(e)); + } + // Standard assembly for the velocity mass matrix. + VectorMassIntegrator *vmi = new VectorMassIntegrator(rho0_coeff, &ir); + Mv.AddDomainIntegrator(vmi); + Mv.Assemble(); + Mv_spmat_copy = Mv.SpMat(); + } // Values of rho0DetJ0 and Jac0inv at all quadrature points. 
- const int nqp = integ_rule.GetNPoints(); - Vector rho_vals(nqp); - for (int i = 0; i < nzones; i++) + // Initial local mesh size (assumes all mesh elements are the same). + double vol = 0.0; + if (dim > 1) { Rho0DetJ0Vol(dim, NE, ir, mesh, L2, rho0_gf, qdata, vol); } + else { - rho0.GetValues(i, integ_rule, rho_vals); - ElementTransformation *T = h1_fes.GetElementTransformation(i); - for (int q = 0; q < nqp; q++) + const int NQ = ir.GetNPoints(); + Vector rho_vals(NQ); + for (int e = 0; e < NE; e++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); - T->SetIntPoint(&ip); - - DenseMatrixInverse Jinv(T->Jacobian()); - Jinv.GetInverseMatrix(quad_data.Jac0inv(i*nqp + q)); - - const double rho0DetJ0 = T->Weight() * rho_vals(q); - quad_data.rho0DetJ0w(i*nqp + q) = rho0DetJ0 * - integ_rule.IntPoint(q).weight; + rho0_gf.GetValues(e, ir, rho_vals); + ElementTransformation &Tr = *H1.GetElementTransformation(e); + for (int q = 0; q < NQ; q++) + { + const IntegrationPoint &ip = ir.IntPoint(q); + Tr.SetIntPoint(&ip); + DenseMatrixInverse Jinv(Tr.Jacobian()); + Jinv.GetInverseMatrix(qdata.Jac0inv(e*NQ + q)); + const double rho0DetJ0 = Tr.Weight() * rho_vals(q); + qdata.rho0DetJ0w(e*NQ + q) = rho0DetJ0 * ir.IntPoint(q).weight; + } } + for (int e = 0; e < NE; e++) { vol += mesh->GetElementVolume(e); } } - // Initial local mesh size (assumes all mesh elements are of the same type). 
- double area = 0.0; - Mesh *m = H1FESpace.GetMesh(); - for (int i = 0; i < nzones; i++) { area += m->GetElementVolume(i); } - switch (m->GetElementBaseGeometry(0)) + switch (mesh->GetElementBaseGeometry(0)) { - case Geometry::SEGMENT: - quad_data.h0 = area / nzones; break; - case Geometry::SQUARE: - quad_data.h0 = sqrt(area / nzones); break; - case Geometry::TRIANGLE: - quad_data.h0 = sqrt(2.0 * area / nzones); break; - case Geometry::CUBE: - quad_data.h0 = pow(area / nzones, 1.0/3.0); break; - case Geometry::TETRAHEDRON: - quad_data.h0 = pow(6.0 * area / nzones, 1.0/3.0); break; + case Geometry::SEGMENT: qdata.h0 = vol / NE; break; + case Geometry::SQUARE: qdata.h0 = sqrt(vol / NE); break; + case Geometry::TRIANGLE: qdata.h0 = sqrt(2.0 * vol / NE); break; + case Geometry::CUBE: qdata.h0 = pow(vol / NE, 1./3.); break; + case Geometry::TETRAHEDRON: qdata.h0 = pow(6.0 * vol / NE, 1./3.); break; default: MFEM_ABORT("Unknown zone type!"); } - quad_data.h0 /= (double) H1FESpace.GetOrder(0); - - ForceIntegrator *fi = new ForceIntegrator(quad_data); - fi->SetIntRule(&integ_rule); - Force.AddDomainIntegrator(fi); - // Make a dummy assembly to figure out the sparsity. - Force.Assemble(0); - Force.Finalize(0); + qdata.h0 /= (double) H1.GetOrder(0); if (p_assembly) { // Setup the preconditioner of the velocity mass operator. - Vector d; - (dim == 2) ? VMassPA.ComputeDiagonal2D(d) : VMassPA.ComputeDiagonal3D(d); - VMassPA_prec.SetDiagonal(d); + // BC are handled by the VMassPA, so ess_tdofs here can be empty. 
+ Array ess_tdofs; + VMassPA_Jprec = new OperatorJacobiSmoother(VMassPA->GetBF(), ess_tdofs); + CG_VMass.SetPreconditioner(*VMassPA_Jprec); + + CG_VMass.SetOperator(*VMassPA); + CG_VMass.SetRelTol(cg_rel_tol); + CG_VMass.SetAbsTol(0.0); + CG_VMass.SetMaxIter(cg_max_iter); + CG_VMass.SetPrintLevel(-1); + + CG_EMass.SetOperator(*EMassPA); + CG_EMass.iterative_mode = false; + CG_EMass.SetRelTol(cg_rel_tol); + CG_EMass.SetAbsTol(0.0); + CG_EMass.SetMaxIter(cg_max_iter); + CG_EMass.SetPrintLevel(-1); } + else + { + ForceIntegrator *fi = new ForceIntegrator(qdata); + fi->SetIntRule(&ir); + Force.AddDomainIntegrator(fi); + // Make a dummy assembly to figure out the sparsity. + Force.Assemble(0); + Force.Finalize(0); + } +} - locCG.SetOperator(locEMassPA); - locCG.iterative_mode = false; - locCG.SetRelTol(1e-8); - locCG.SetAbsTol(1e-8 * numeric_limits::epsilon()); - locCG.SetMaxIter(200); - locCG.SetPrintLevel(0); +LagrangianHydroOperator::~LagrangianHydroOperator() +{ + if (p_assembly) + { + delete EMassPA; + delete VMassPA; + delete VMassPA_Jprec; + delete ForcePA; + } } void LagrangianHydroOperator::Mult(const Vector &S, Vector &dS_dt) const @@ -184,23 +355,19 @@ void LagrangianHydroOperator::Mult(const Vector &S, Vector &dS_dt) const // needed only because some mfem time integrators don't update the solution // vector at every intermediate stage (hence they don't change the mesh). UpdateMesh(S); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). - Vector* sptr = (Vector*) &S; + Vector* sptr = const_cast(&S); GridFunction v; - const int VsizeH1 = H1FESpace.GetVSize(); - v.MakeRef(&H1FESpace, *sptr, VsizeH1); - + const int VsizeH1 = H1.GetVSize(); + v.MakeRef(&H1, *sptr, VsizeH1); // Set dx_dt = v (explicit). 
GridFunction dx; - dx.MakeRef(&H1FESpace, dS_dt, 0); + dx.MakeRef(&H1, dS_dt, 0); dx = v; - SolveVelocity(S, dS_dt); SolveEnergy(S, v, dS_dt); - - quad_data_is_current = false; + qdata_is_current = false; } void LagrangianHydroOperator::SolveVelocity(const Vector &S, @@ -208,38 +375,40 @@ void LagrangianHydroOperator::SolveVelocity(const Vector &S, { UpdateQuadratureData(S); AssembleForceMatrix(); - - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). GridFunction dv; - dv.MakeRef(&H1FESpace, dS_dt, VsizeH1); + dv.MakeRef(&H1, dS_dt, H1Vsize); dv = 0.0; - - Vector one(VsizeL2), rhs(VsizeH1), B, X; one = 1.0; if (p_assembly) { timer.sw_force.Start(); - ForcePA.Mult(one, rhs); + ForcePA->Mult(one, rhs); timer.sw_force.Stop(); rhs.Neg(); - Operator *cVMassPA; - VMassPA.FormLinearSystem(ess_tdofs, dv, rhs, cVMassPA, X, B); - CGSolver cg; - cg.SetPreconditioner(VMassPA_prec); - cg.SetOperator(*cVMassPA); - cg.SetRelTol(cg_rel_tol); cg.SetAbsTol(0.0); - cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(0); - timer.sw_cgH1.Start(); - cg.Mult(B, X); - timer.sw_cgH1.Stop(); - timer.H1cg_iter += cg.GetNumIterations(); - VMassPA.RecoverFEMSolution(X, rhs, dv); - delete cVMassPA; + // Partial assembly solve for each velocity component + const int size = H1c.GetVSize(); + const Operator *Pconf = H1c.GetProlongationMatrix(); + for (int c = 0; c < dim; c++) + { + dvc_gf.MakeRef(&H1c, dS_dt, H1Vsize + c*size); + rhs_c_gf.MakeRef(&H1c, rhs, c*size); + if (Pconf) { Pconf->MultTranspose(rhs_c_gf, B); } + else { B = rhs_c_gf; } + X = dvc_gf; + VMassPA->SetEssentialTrueDofs(c_tdofs[c]); + VMassPA->EliminateRHS(B); + timer.sw_cgH1.Start(); + CG_VMass.Mult(B, X); + timer.sw_cgH1.Stop(); + timer.H1iter += CG_VMass.GetNumIterations(); + if (Pconf) { Pconf->Mult(X, dvc_gf); } + else { dvc_gf = X; } + // We need to sync the subvector 
'dvc_gf' with its base vector + // because it may have been moved to a different memory space. + dvc_gf.GetMemory().SyncAlias(dS_dt.GetMemory(), dvc_gf.Size()); + } } else { @@ -254,13 +423,14 @@ void LagrangianHydroOperator::SolveVelocity(const Vector &S, DSmoother prec(0); cg.SetPreconditioner(prec); cg.SetOperator(A); - cg.SetRelTol(cg_rel_tol); cg.SetAbsTol(0.0); + cg.SetRelTol(cg_rel_tol); + cg.SetAbsTol(0.0); cg.SetMaxIter(cg_max_iter); - cg.SetPrintLevel(0); + cg.SetPrintLevel(-1); timer.sw_cgH1.Start(); cg.Mult(B, X); timer.sw_cgH1.Stop(); - timer.H1cg_iter += cg.GetNumIterations(); + timer.H1iter += cg.GetNumIterations(); Mv.RecoverFEMSolution(X, rhs, dv); } } @@ -271,60 +441,54 @@ void LagrangianHydroOperator::SolveEnergy(const Vector &S, const Vector &v, UpdateQuadratureData(S); AssembleForceMatrix(); - const int VsizeL2 = L2FESpace.GetVSize(); - const int VsizeH1 = H1FESpace.GetVSize(); - // The monolithic BlockVector stores the unknown fields as follows: // (Position, Velocity, Specific Internal Energy). GridFunction de; - de.MakeRef(&L2FESpace, dS_dt, VsizeH1*2); + de.MakeRef(&L2, dS_dt, H1Vsize*2); de = 0.0; // Solve for energy, assemble the energy source if such exists. - LinearForm *e_source = NULL; + LinearForm *e_source = nullptr; if (source_type == 1) // 2D Taylor-Green. 
{ - e_source = new LinearForm(&L2FESpace); + e_source = new LinearForm(&L2); TaylorCoefficient coeff; - DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &integ_rule); + DomainLFIntegrator *d = new DomainLFIntegrator(coeff, &ir); e_source->AddDomainIntegrator(d); e_source->Assemble(); } + Array l2dofs; - Vector e_rhs(VsizeL2), loc_rhs(l2dofs_cnt), loc_de(l2dofs_cnt); if (p_assembly) { timer.sw_force.Start(); - ForcePA.MultTranspose(v, e_rhs); + ForcePA->MultTranspose(v, e_rhs); timer.sw_force.Stop(); - if (e_source) { e_rhs += *e_source; } - for (int z = 0; z < nzones; z++) - { - L2FESpace.GetElementDofs(z, l2dofs); - e_rhs.GetSubVector(l2dofs, loc_rhs); - locEMassPA.SetZoneId(z); - timer.sw_cgL2.Start(); - locCG.Mult(loc_rhs, loc_de); - timer.sw_cgL2.Stop(); - timer.L2dof_iter += locCG.GetNumIterations() * l2dofs_cnt; - de.SetSubVector(l2dofs, loc_de); - } + timer.sw_cgL2.Start(); + CG_EMass.Mult(e_rhs, de); + timer.sw_cgL2.Stop(); + const int cg_num_iter = CG_EMass.GetNumIterations(); + timer.L2iter += (cg_num_iter==0) ? 1 : cg_num_iter; + // Move the memory location of the subvector 'de' to the memory + // location of the base vector 'dS_dt'. 
+ de.GetMemory().SyncAlias(dS_dt.GetMemory(), de.Size()); } - else + else // not p_assembly { timer.sw_force.Start(); Force.MultTranspose(v, e_rhs); timer.sw_force.Stop(); if (e_source) { e_rhs += *e_source; } - for (int z = 0; z < nzones; z++) + Vector loc_rhs(l2dofs_cnt), loc_de(l2dofs_cnt); + for (int e = 0; e < NE; e++) { - L2FESpace.GetElementDofs(z, l2dofs); + L2.GetElementDofs(e, l2dofs); e_rhs.GetSubVector(l2dofs, loc_rhs); timer.sw_cgL2.Start(); - Me_inv(z).Mult(loc_rhs, loc_de); + Me_inv(e).Mult(loc_rhs, loc_de); timer.sw_cgL2.Stop(); - timer.L2dof_iter += l2dofs_cnt; + timer.L2iter += 1; de.SetSubVector(l2dofs, loc_de); } } @@ -333,112 +497,159 @@ void LagrangianHydroOperator::SolveEnergy(const Vector &S, const Vector &v, void LagrangianHydroOperator::UpdateMesh(const Vector &S) const { - Vector* sptr = (Vector*) &S; - x_gf.MakeRef(&H1FESpace, *sptr, 0); - H1FESpace.GetMesh()->NewNodes(x_gf, false); + Vector* sptr = const_cast(&S); + x_gf.MakeRef(&H1, *sptr, 0); + H1.GetMesh()->NewNodes(x_gf, false); } double LagrangianHydroOperator::GetTimeStepEstimate(const Vector &S) const { UpdateMesh(S); UpdateQuadratureData(S); - - return quad_data.dt_est; + return qdata.dt_est; } void LagrangianHydroOperator::ResetTimeStepEstimate() const { - quad_data.dt_est = numeric_limits::infinity(); + qdata.dt_est = std::numeric_limits::infinity(); } -void LagrangianHydroOperator::ComputeDensity(GridFunction &rho) +void LagrangianHydroOperator::ComputeDensity(GridFunction &rho) const { - rho.SetSpace(&L2FESpace); - + rho.SetSpace(&L2); DenseMatrix Mrho(l2dofs_cnt); Vector rhs(l2dofs_cnt), rho_z(l2dofs_cnt); Array dofs(l2dofs_cnt); DenseMatrixInverse inv(&Mrho); - MassIntegrator mi(&integ_rule); - DensityIntegrator di(quad_data); - di.SetIntRule(&integ_rule); - for (int i = 0; i < nzones; i++) + MassIntegrator mi(&ir); + DensityIntegrator di(qdata); + di.SetIntRule(&ir); + for (int e = 0; e < NE; e++) { - di.AssembleRHSElementVect(*L2FESpace.GetFE(i), - 
*L2FESpace.GetElementTransformation(i), rhs); - mi.AssembleElementMatrix(*L2FESpace.GetFE(i), - *L2FESpace.GetElementTransformation(i), Mrho); + const FiniteElement &fe = *L2.GetFE(e); + ElementTransformation &eltr = *L2.GetElementTransformation(e); + di.AssembleRHSElementVect(fe, eltr, rhs); + mi.AssembleElementMatrix(fe, eltr, Mrho); inv.Factor(); inv.Mult(rhs, rho_z); - L2FESpace.GetElementDofs(i, dofs); + L2.GetElementDofs(e, dofs); rho.SetSubVector(dofs, rho_z); } } -double LagrangianHydroOperator::InternalEnergy(const GridFunction &e) const +double LagrangianHydroOperator::InternalEnergy(const GridFunction &gf) const { - Vector one(l2dofs_cnt), loc_e(l2dofs_cnt); - one = 1.0; - Array l2dofs; - - double ie = 0.0; - for (int z = 0; z < nzones; z++) + double glob_ie = 0.0; + // This should be turned into a kernel so that it could be displayed in pa + if (!p_assembly) { - L2FESpace.GetElementDofs(z, l2dofs); - e.GetSubVector(l2dofs, loc_e); - ie += Me(z).InnerProduct(loc_e, one); + Vector one(l2dofs_cnt), loc_e(l2dofs_cnt); + one = 1.0; + Array l2dofs; + double loc_ie = 0.0; + for (int e = 0; e < NE; e++) + { + L2.GetElementDofs(e, l2dofs); + gf.GetSubVector(l2dofs, loc_e); + loc_ie += Me(e).InnerProduct(loc_e, one); + } + glob_ie = loc_ie; } - - return ie; + return glob_ie; } double LagrangianHydroOperator::KineticEnergy(const GridFunction &v) const { - return 0.5 * Mv_spmat_copy.InnerProduct(v, v); + double glob_ke = 0.0; + // This should be turned into a kernel so that it could be displayed in pa + if (!p_assembly) + { + double loc_ke = 0.5 * Mv_spmat_copy.InnerProduct(v, v); + glob_ke = loc_ke; + } + return glob_ke; } -void LagrangianHydroOperator::PrintTimingData(int steps) const +void LagrangianHydroOperator::PrintTimingData(int steps, const bool fom) const { - double runtime[5]; - runtime[0] = timer.sw_cgH1.RealTime(); - runtime[1] = timer.sw_cgL2.RealTime(); - runtime[2] = timer.sw_force.RealTime(); - runtime[3] = timer.sw_qdata.RealTime(); - runtime[4] = 
runtime[0] + runtime[2] + runtime[3]; - - int data[2]; - data[0] = timer.L2dof_iter; + double T[5]; + T[0] = timer.sw_cgH1.RealTime(); + T[1] = timer.sw_cgL2.RealTime(); + T[2] = timer.sw_force.RealTime(); + T[3] = timer.sw_qdata.RealTime(); + T[4] = T[0] + T[2] + T[3]; + + int data[3]; + data[0] = timer.L2dof * timer.L2iter; data[1] = timer.quad_tstep; + data[2] = NE; + + const int H1size = H1.GetVSize(), L2size = L2.GetVSize(); - const int H1size = H1FESpace.GetVSize(), - L2size = L2FESpace.GetVSize(); using namespace std; + // FOM = (FOM1 * T1 + FOM2 * T2 + FOM3 * T3) / (T1 + T2 + T3) + const int H1iter = p_assembly ? (timer.H1iter/dim) : timer.H1iter; + const double FOM1 = 1e-6 * H1size * H1iter / T[0]; + const double FOM2 = 1e-6 * steps * (H1size + L2size) / T[2]; + const double FOM3 = 1e-6 * data[1] * ir.GetNPoints() / T[3]; + const double FOM = (FOM1 * T[0] + FOM2 * T[2] + FOM3 * T[3]) / T[4]; + const double FOM0 = 1e-6 * steps * (H1size + L2size) / T[4]; cout << endl; - cout << "CG (H1) total time: " << runtime[0] << endl; + cout << "CG (H1) total time: " << T[0] << endl; cout << "CG (H1) rate (megadofs x cg_iterations / second): " - << 1e-6 * H1size * timer.H1cg_iter / runtime[0] << endl; + << FOM1 << endl; cout << endl; - cout << "CG (L2) total time: " << runtime[1] << endl; + cout << "CG (L2) total time: " << T[1] << endl; cout << "CG (L2) rate (megadofs x cg_iterations / second): " - << 1e-6 * data[0] / runtime[1] << endl; + << 1e-6 * data[0] / T[1] << endl; cout << endl; - // The Force operator is applied twice per time step, on the H1 and the L2 - // vectors, respectively. 
- cout << "Forces total time: " << runtime[2] << endl; + cout << "Forces total time: " << T[2] << endl; cout << "Forces rate (megadofs x timesteps / second): " - << 1e-6 * steps * (H1size + L2size) / runtime[2] << endl; + << FOM2 << endl; cout << endl; - cout << "UpdateQuadData total time: " << runtime[3] << endl; + cout << "UpdateQuadData total time: " << T[3] << endl; cout << "UpdateQuadData rate (megaquads x timesteps / second): " - << 1e-6 * data[1] * integ_rule.GetNPoints() / runtime[3] << endl; + << FOM3 << endl; cout << endl; - cout << "Major kernels total time (seconds): " << runtime[4] << endl; + cout << "Major kernels total time (seconds): " << T[4] << endl; cout << "Major kernels total rate (megadofs x time steps / second): " - << 1e-6 * steps * (H1size + L2size) / runtime[4] << endl; + << FOM << endl; + if (!fom) { return; } + const int QPT = ir.GetNPoints(); + const int GNZones = data[2]; + const long ndofs = 2*H1size + L2size + QPT*GNZones; + cout << endl; + cout << "| Ranks " << "| Zones " + << "| H1 dofs " << "| L2 dofs " + << "| QP " << "| N dofs " + << "| FOM0 " + << "| FOM1 " << "| T1 " + << "| FOM2 " << "| T2 " + << "| FOM3 " << "| T3 " + << "| FOM " << "| TT " + << "|" << endl; + cout << setprecision(3); + cout << "| " << setw(6) << 1 + << "| " << setw(8) << GNZones + << "| " << setw(8) << H1size + << "| " << setw(8) << L2size + << "| " << setw(3) << QPT + << "| " << setw(9) << ndofs + << "| " << setw(7) << FOM0 + << "| " << setw(7) << FOM1 + << "| " << setw(5) << T[0] + << "| " << setw(7) << FOM2 + << "| " << setw(5) << T[2] + << "| " << setw(7) << FOM3 + << "| " << setw(5) << T[3] + << "| " << setw(7) << FOM + << "| " << setw(5) << T[4] + << "| " << endl; + } // Smooth transition between 0 and 1 for x in [-eps, eps]. 
-inline double smooth_step_01(double x, double eps) +MFEM_HOST_DEVICE inline double smooth_step_01(double x, double eps) { const double y = (x + eps) / (2.0 * eps); if (y < 0.0) { return 0.0; } @@ -448,28 +659,32 @@ inline double smooth_step_01(double x, double eps) void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const { - if (quad_data_is_current) { return; } - timer.sw_qdata.Start(); + if (qdata_is_current) { return; } - const int nqp = integ_rule.GetNPoints(); + qdata_is_current = true; + forcemat_is_assembled = false; + if (dim > 1) { return qupdate.UpdateQuadratureData(S, qdata); } + + // This code is only for the 1D/FA mode + timer.sw_qdata.Start(); + const int nqp = ir.GetNPoints(); GridFunction x, v, e; - Vector* sptr = (Vector*) &S; - x.MakeRef(&H1FESpace, *sptr, 0); - v.MakeRef(&H1FESpace, *sptr, H1FESpace.GetVSize()); - e.MakeRef(&L2FESpace, *sptr, 2*H1FESpace.GetVSize()); + Vector* sptr = const_cast(&S); + x.MakeRef(&H1, *sptr, 0); + v.MakeRef(&H1, *sptr, H1.GetVSize()); + e.MakeRef(&L2, *sptr, 2*H1.GetVSize()); Vector e_vals, e_loc(l2dofs_cnt), vector_vals(h1dofs_cnt * dim); DenseMatrix Jpi(dim), sgrad_v(dim), Jinv(dim), stress(dim), stressJiT(dim), vecvalMat(vector_vals.GetData(), h1dofs_cnt, dim); DenseTensor grad_v_ref(dim, dim, nqp); Array L2dofs, H1dofs; - // Batched computations are needed, because hydrodynamic codes usually // involve expensive computations of material properties. Although this // miniapp uses simple EOS equations, we still want to represent the batched // cycle structure. int nzones_batch = 3; - const int nbatches = nzones / nzones_batch + 1; // +1 for the remainder. + const int nbatches = NE / nzones_batch + 1; // +1 for the remainder. 
int nqp_batch = nqp * nzones_batch; double *gamma_b = new double[nqp_batch], *rho_b = new double[nqp_batch], @@ -478,71 +693,46 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const *cs_b = new double[nqp_batch]; // Jacobians of reference->physical transformations for all quadrature points // in the batch. - DenseTensor *Jpr_b = new DenseTensor[nqp_batch]; + DenseTensor *Jpr_b = new DenseTensor[nzones_batch]; for (int b = 0; b < nbatches; b++) { int z_id = b * nzones_batch; // Global index over zones. // The last batch might not be full. - if (z_id == nzones) { break; } - else if (z_id + nzones_batch > nzones) + if (z_id == NE) { break; } + else if (z_id + nzones_batch > NE) { - nzones_batch = nzones - z_id; + nzones_batch = NE - z_id; nqp_batch = nqp * nzones_batch; } - - double min_detJ = numeric_limits::infinity(); + double min_detJ = std::numeric_limits::infinity(); for (int z = 0; z < nzones_batch; z++) { - ElementTransformation *T = H1FESpace.GetElementTransformation(z_id); + ElementTransformation *T = H1.GetElementTransformation(z_id); Jpr_b[z].SetSize(dim, dim, nqp); - - if (p_assembly) - { - // Energy values at quadrature point. - L2FESpace.GetElementDofs(z_id, L2dofs); - e.GetSubVector(L2dofs, e_loc); - evaluator.GetL2Values(e_loc, e_vals); - - // All reference->physical Jacobians at the quadrature points. 
- H1FESpace.GetElementVDofs(z_id, H1dofs); - x.GetSubVector(H1dofs, vector_vals); - evaluator.GetVectorGrad(vecvalMat, Jpr_b[z]); - } - else { e.GetValues(z_id, integ_rule, e_vals); } + e.GetValues(z_id, ir, e_vals); for (int q = 0; q < nqp; q++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); + const IntegrationPoint &ip = ir.IntPoint(q); T->SetIntPoint(&ip); - if (!p_assembly) { Jpr_b[z](q) = T->Jacobian(); } + Jpr_b[z](q) = T->Jacobian(); const double detJ = Jpr_b[z](q).Det(); - min_detJ = min(min_detJ, detJ); - + min_detJ = fmin(min_detJ, detJ); const int idx = z * nqp + q; - if (material_pcf == NULL) { gamma_b[idx] = 5./3.; } // Ideal gas. - else { gamma_b[idx] = material_pcf->Eval(*T, ip); } - rho_b[idx] = quad_data.rho0DetJ0w(z_id*nqp + q) / detJ / ip.weight; - e_b[idx] = max(0.0, e_vals(q)); + gamma_b[idx] = gamma_coeff.Eval(*T, ip); + rho_b[idx] = qdata.rho0DetJ0w(z_id*nqp + q) / detJ / ip.weight; + e_b[idx] = fmax(0.0, e_vals(q)); } ++z_id; } - // Batched computation of material properties. ComputeMaterialProperties(nqp_batch, gamma_b, rho_b, e_b, p_b, cs_b); - z_id -= nzones_batch; for (int z = 0; z < nzones_batch; z++) { - ElementTransformation *T = H1FESpace.GetElementTransformation(z_id); - if (p_assembly) - { - // All reference->physical Jacobians at the quadrature points. - H1FESpace.GetElementVDofs(z_id, H1dofs); - v.GetSubVector(H1dofs, vector_vals); - evaluator.GetVectorGrad(vecvalMat, grad_v_ref); - } + ElementTransformation *T = H1.GetElementTransformation(z_id); for (int q = 0; q < nqp; q++) { - const IntegrationPoint &ip = integ_rule.IntPoint(q); + const IntegrationPoint &ip = ir.IntPoint(q); T->SetIntPoint(&ip); // Note that the Jacobian was already computed above. We've chosen // not to store the Jacobians for all batched quadrature points. 
@@ -550,10 +740,8 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const CalcInverse(Jpr, Jinv); const double detJ = Jpr.Det(), rho = rho_b[z*nqp + q], p = p_b[z*nqp + q], sound_speed = cs_b[z*nqp + q]; - stress = 0.0; for (int d = 0; d < dim; d++) { stress(d, d) = -p; } - double visc_coeff = 0.0; if (use_viscosity) { @@ -561,30 +749,18 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const // eigenvector of the symmetric velocity gradient gives the // direction of maximal compression. This is used to define the // relative change of the initial length scale. - if (p_assembly) - { - mfem::Mult(grad_v_ref(q), Jinv, sgrad_v); - } - else - { - v.GetVectorGradient(*T, sgrad_v); - } + v.GetVectorGradient(*T, sgrad_v); sgrad_v.Symmetrize(); double eig_val_data[3], eig_vec_data[9]; - if (dim==1) - { - eig_val_data[0] = sgrad_v(0, 0); - eig_vec_data[0] = 1.; - } - else { sgrad_v.CalcEigenvalues(eig_val_data, eig_vec_data); } + eig_val_data[0] = sgrad_v(0, 0); + eig_vec_data[0] = 1.; Vector compr_dir(eig_vec_data, dim); // Computes the initial->physical transformation Jacobian. - mfem::Mult(Jpr, quad_data.Jac0inv(z_id*nqp + q), Jpi); + mfem::Mult(Jpr, qdata.Jac0inv(z_id*nqp + q), Jpi); Vector ph_dir(dim); Jpi.Mult(compr_dir, ph_dir); // Change of the initial mesh size in the compression direction. - const double h = quad_data.h0 * ph_dir.Norml2() / + const double h = qdata.h0 * ph_dir.Norml2() / compr_dir.Norml2(); - // Measure of maximal compression. const double mu = eig_val_data[0]; visc_coeff = 2.0 * rho * h * h * fabs(mu); @@ -595,42 +771,36 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const const double eps = 1e-12; visc_coeff += 0.5 * rho * h * sound_speed * (1.0 - smooth_step_01(mu - 2.0 * eps, eps)); - stress.Add(visc_coeff, sgrad_v); - - // Note that the (mu < 0.0) check introduces discontinuous - // behavior. 
This can lead to bigger differences in results when - // there are round-offs around the zero for the min eigenvalue. - // We've observed differences between Linux and Mac for some 3D - // calculations. } - // Time step estimate at the point. Here the more relevant length // scale is related to the actual mesh deformation; we use the min // singular value of the ref->physical Jacobian. In addition, the // time step estimate should be aware of the presence of shocks. const double h_min = - Jpr.CalcSingularvalue(dim-1) / (double) H1FESpace.GetOrder(0); + Jpr.CalcSingularvalue(dim-1) / (double) H1.GetOrder(0); const double inv_dt = sound_speed / h_min + 2.5 * visc_coeff / rho / h_min / h_min; if (min_detJ < 0.0) { // This will force repetition of the step with smaller dt. - quad_data.dt_est = 0.0; + qdata.dt_est = 0.0; } else { - quad_data.dt_est = min(quad_data.dt_est, cfl * (1.0 / inv_dt) ); + if (inv_dt>0.0) + { + qdata.dt_est = fmin(qdata.dt_est, cfl*(1.0/inv_dt)); + } } - // Quadrature data for partial assembly of the force operator. 
MultABt(stress, Jinv, stressJiT); - stressJiT *= integ_rule.IntPoint(q).weight * detJ; + stressJiT *= ir.IntPoint(q).weight * detJ; for (int vd = 0 ; vd < dim; vd++) { for (int gd = 0; gd < dim; gd++) { - quad_data.stressJinvT(vd)(z_id*nqp + q, gd) = + qdata.stressJinvT(vd)(z_id*nqp + q, gd) = stressJiT(vd, gd); } } @@ -638,32 +808,342 @@ void LagrangianHydroOperator::UpdateQuadratureData(const Vector &S) const ++z_id; } } - delete [] gamma_b; delete [] rho_b; delete [] e_b; delete [] p_b; delete [] cs_b; delete [] Jpr_b; - quad_data_is_current = true; - forcemat_is_assembled = false; - timer.sw_qdata.Stop(); - timer.quad_tstep += nzones; + timer.quad_tstep += NE; +} + +template MFEM_HOST_DEVICE static inline +void QUpdateBody(const int NE, const int e, + const int NQ, const int q, + const bool use_viscosity, + const double h0, + const double h1order, + const double cfl, + const double infinity, + double* __restrict__ Jinv, + double* __restrict__ stress, + double* __restrict__ sgrad_v, + double* __restrict__ eig_val_data, + double* __restrict__ eig_vec_data, + double* __restrict__ compr_dir, + double* __restrict__ Jpi, + double* __restrict__ ph_dir, + double* __restrict__ stressJiT, + const double* __restrict__ d_gamma, + const double* __restrict__ d_weights, + const double* __restrict__ d_Jacobians, + const double* __restrict__ d_rho0DetJ0w, + const double* __restrict__ d_e_quads, + const double* __restrict__ d_grad_v_ext, + const double* __restrict__ d_Jac0inv, + double *d_dt_est, + double *d_stressJinvT) +{ + constexpr int DIM2 = DIM*DIM; + double min_detJ = infinity; + + const int eq = e * NQ + q; + const double gamma = d_gamma[e]; + const double weight = d_weights[q]; + const double inv_weight = 1. 
/ weight; + const double *J = d_Jacobians + DIM2*(NQ*e + q); + const double detJ = kernels::Det(J); + min_detJ = fmin(min_detJ, detJ); + kernels::CalcInverse(J, Jinv); + const double R = inv_weight * d_rho0DetJ0w[eq] / detJ; + const double E = fmax(0.0, d_e_quads[eq]); + const double P = (gamma - 1.0) * R * E; + const double S = sqrt(gamma * (gamma - 1.0) * E); + for (int k = 0; k < DIM2; k++) { stress[k] = 0.0; } + for (int d = 0; d < DIM; d++) { stress[d*DIM+d] = -P; } + double visc_coeff = 0.0; + if (use_viscosity) + { + // Compression-based length scale at the point. The first + // eigenvector of the symmetric velocity gradient gives the + // direction of maximal compression. This is used to define the + // relative change of the initial length scale. + const double *dV = d_grad_v_ext + DIM2*(NQ*e + q); + kernels::Mult(DIM, DIM, DIM, dV, Jinv, sgrad_v); + kernels::Symmetrize(DIM, sgrad_v); + if (DIM == 1) + { + eig_val_data[0] = sgrad_v[0]; + eig_vec_data[0] = 1.; + } + else + { + kernels::CalcEigenvalues(sgrad_v, eig_val_data, eig_vec_data); + } + for (int k=0; kphysical transformation Jacobian. + kernels::Mult(DIM, DIM, DIM, J, d_Jac0inv + eq*DIM*DIM, Jpi); + kernels::Mult(DIM, DIM, Jpi, compr_dir, ph_dir); + // Change of the initial mesh size in the compression direction. + const double ph_dir_nl2 = kernels::Norml2(DIM, ph_dir); + const double compr_dir_nl2 = kernels::Norml2(DIM, compr_dir); + const double H = h0 * ph_dir_nl2 / compr_dir_nl2; + // Measure of maximal compression. + const double mu = eig_val_data[0]; + visc_coeff = 2.0 * R * H * H * fabs(mu); + // The following represents a "smooth" version of the statement + // "if (mu < 0) visc_coeff += 0.5 rho h sound_speed". Note that + // eps must be scaled appropriately if a different unit system is + // being used. 
+ const double eps = 1e-12; + visc_coeff += 0.5 * R * H * S * (1.0 - smooth_step_01(mu-2.0*eps, eps)); + kernels::Add(DIM, DIM, visc_coeff, stress, sgrad_v, stress); + } + // Time step estimate at the point. Here the more relevant length + // scale is related to the actual mesh deformation; we use the min + // singular value of the ref->physical Jacobian. In addition, the + // time step estimate should be aware of the presence of shocks. + const double sv = kernels::CalcSingularvalue(J, DIM - 1); + const double h_min = sv / h1order; + const double ih_min = 1. / h_min; + const double irho_ih_min_sq = ih_min * ih_min / R ; + const double idt = S * ih_min + 2.5 * visc_coeff * irho_ih_min_sq; + if (min_detJ < 0.0) + { + // This will force repetition of the step with smaller dt. + d_dt_est[eq] = 0.0; + } + else + { + if (idt > 0.0) + { + const double cfl_inv_dt = cfl / idt; + d_dt_est[eq] = fmin(d_dt_est[eq], cfl_inv_dt); + } + } + // Quadrature data for partial assembly of the force operator. 
+ kernels::MultABt(DIM, DIM, DIM, stress, Jinv, stressJiT); + for (int k = 0; k < DIM2; k++) { stressJiT[k] *= weight * detJ; } + for (int vd = 0 ; vd < DIM; vd++) + { + for (int gd = 0; gd < DIM; gd++) + { + const int offset = eq + NQ*NE*(gd + vd*DIM); + d_stressJinvT[offset] = stressJiT[vd + gd*DIM]; + } + } +} + +template static inline +void QKernel(const int NE, const int NQ, + const bool use_viscosity, + const double h0, + const double h1order, + const double cfl, + const double infinity, + const GridFunction &gamma_gf, + const Array &weights, + const Vector &Jacobians, + const Vector &rho0DetJ0w, + const Vector &e_quads, + const Vector &grad_v_ext, + const DenseTensor &Jac0inv, + Vector &dt_est, + DenseTensor &stressJinvT) +{ + constexpr int DIM2 = DIM*DIM; + auto d_gamma = gamma_gf.Read(); + auto d_weights = weights.Read(); + auto d_Jacobians = Jacobians.Read(); + auto d_rho0DetJ0w = rho0DetJ0w.Read(); + auto d_e_quads = e_quads.Read(); + auto d_grad_v_ext = grad_v_ext.Read(); + auto d_Jac0inv = Read(Jac0inv.GetMemory(), Jac0inv.TotalSize()); + auto d_dt_est = dt_est.ReadWrite(); + auto d_stressJinvT = Write(stressJinvT.GetMemory(), stressJinvT.TotalSize()); + if (DIM == 2) + { + MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, + { + double Jinv[DIM2]; + double stress[DIM2]; + double sgrad_v[DIM2]; + double eig_val_data[3]; + double eig_vec_data[9]; + double compr_dir[DIM]; + double Jpi[DIM2]; + double ph_dir[DIM]; + double stressJiT[DIM2]; + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + QUpdateBody(NE, e, NQ, qx + qy * Q1D, + use_viscosity, h0, h1order, cfl, infinity, + Jinv, stress, sgrad_v, eig_val_data, eig_vec_data, + compr_dir, Jpi, ph_dir, stressJiT, + d_gamma, d_weights, d_Jacobians, d_rho0DetJ0w, + d_e_quads, d_grad_v_ext, d_Jac0inv, + d_dt_est, d_stressJinvT); + } + } + MFEM_SYNC_THREAD; + }); + } + if (DIM == 3) + { + MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, + { + double Jinv[DIM2]; + double stress[DIM2]; + double sgrad_v[DIM2]; + double 
eig_val_data[3]; + double eig_vec_data[9]; + double compr_dir[DIM]; + double Jpi[DIM2]; + double ph_dir[DIM]; + double stressJiT[DIM2]; + MFEM_FOREACH_THREAD(qx,x,Q1D) + { + MFEM_FOREACH_THREAD(qy,y,Q1D) + { + MFEM_FOREACH_THREAD(qz,z,Q1D) + { + QUpdateBody(NE, e, NQ, qx + Q1D * (qy + qz * Q1D), + use_viscosity, h0, h1order, cfl, infinity, + Jinv, stress, sgrad_v, eig_val_data, eig_vec_data, + compr_dir, Jpi, ph_dir, stressJiT, + d_gamma, d_weights, d_Jacobians, d_rho0DetJ0w, + d_e_quads, d_grad_v_ext, d_Jac0inv, + d_dt_est, d_stressJinvT); + } + } + } + MFEM_SYNC_THREAD; + }); + } +} + +void QUpdate::UpdateQuadratureData(const Vector &S, QuadratureData &qdata) +{ + timer->sw_qdata.Start(); + Vector* S_p = const_cast(&S); + const int H1_size = H1.GetVSize(); + const double h1order = (double) H1.GetOrder(0); + const double infinity = std::numeric_limits::infinity(); + GridFunction x, v, e; + x.MakeRef(&H1,*S_p, 0); + H1R->Mult(x, e_vec); + q1->SetOutputLayout(QVectorLayout::byVDIM); + q1->Derivatives(e_vec, q_dx); + v.MakeRef(&H1,*S_p, H1_size); + H1R->Mult(v, e_vec); + q1->Derivatives(e_vec, q_dv); + e.MakeRef(&L2, *S_p, 2*H1_size); + q2->SetOutputLayout(QVectorLayout::byVDIM); + q2->Values(e, q_e); + q_dt_est = qdata.dt_est; + const int id = (dim << 4) | Q1D; + typedef void (*fQKernel)(const int NE, const int NQ, + const bool use_viscosity, + const double h0, const double h1order, + const double cfl, const double infinity, + const GridFunction &gamma_gf, + const Array &weights, + const Vector &Jacobians, const Vector &rho0DetJ0w, + const Vector &e_quads, const Vector &grad_v_ext, + const DenseTensor &Jac0inv, + Vector &dt_est, DenseTensor &stressJinvT); + static std::unordered_map qupdate = + { + {0x24,&QKernel<2,4>}, {0x26,&QKernel<2,6>}, {0x28,&QKernel<2,8>}, + {0x34,&QKernel<3,4>}, {0x36,&QKernel<3,6>}, {0x38,&QKernel<3,8>} + }; + if (!qupdate[id]) + { + mfem::out << "Unknown kernel 0x" << std::hex << id << std::endl; + MFEM_ABORT("Unknown kernel"); + } + 
qupdate[id](NE, NQ, use_viscosity, qdata.h0, h1order, cfl, infinity, + gamma_gf, ir.GetWeights(), q_dx, + qdata.rho0DetJ0w, q_e, q_dv, + qdata.Jac0inv, q_dt_est, qdata.stressJinvT); + qdata.dt_est = q_dt_est.Min(); + timer->sw_qdata.Stop(); + timer->quad_tstep += NE; } void LagrangianHydroOperator::AssembleForceMatrix() const { if (forcemat_is_assembled || p_assembly) { return; } - Force = 0.0; timer.sw_force.Start(); Force.Assemble(); timer.sw_force.Stop(); - forcemat_is_assembled = true; } } // namespace hydrodynamics +void HydroODESolver::Init(TimeDependentOperator &tdop) +{ + ODESolver::Init(tdop); + hydro_oper = dynamic_cast(f); + MFEM_VERIFY(hydro_oper, "HydroSolvers expect LagrangianHydroOperator."); +} + +void RK2AvgSolver::Init(TimeDependentOperator &tdop) +{ + HydroODESolver::Init(tdop); + const Array &block_offsets = hydro_oper->GetBlockOffsets(); + V.SetSize(block_offsets[1], mem_type); + V.UseDevice(true); + dS_dt.Update(block_offsets, mem_type); + dS_dt = 0.0; + S0.Update(block_offsets, mem_type); +} + +void RK2AvgSolver::Step(Vector &S, double &t, double &dt) +{ + // The monolithic BlockVector stores the unknown fields as follows: + // (Position, Velocity, Specific Internal Energy). + S0.Vector::operator=(S); + Vector &v0 = S0.GetBlock(1); + Vector &dx_dt = dS_dt.GetBlock(0); + Vector &dv_dt = dS_dt.GetBlock(1); + + // In each sub-step: + // - Update the global state Vector S. + // - Compute dv_dt using S. + // - Update V using dv_dt. + // - Compute de_dt and dx_dt using S and V. + + // -- 1. + // S is S0. + hydro_oper->UpdateMesh(S); + hydro_oper->SolveVelocity(S, dS_dt); + // V = v0 + 0.5 * dt * dv_dt; + add(v0, 0.5 * dt, dv_dt, V); + hydro_oper->SolveEnergy(S, V, dS_dt); + dx_dt = V; + + // -- 2. 
+ // S = S0 + 0.5 * dt * dS_dt; + add(S0, 0.5 * dt, dS_dt, S); + hydro_oper->ResetQuadratureData(); + hydro_oper->UpdateMesh(S); + hydro_oper->SolveVelocity(S, dS_dt); + // V = v0 + 0.5 * dt * dv_dt; + add(v0, 0.5 * dt, dv_dt, V); + hydro_oper->SolveEnergy(S, V, dS_dt); + dx_dt = V; + + // -- 3. + // S = S0 + dt * dS_dt. + add(S0, dt, dS_dt, S); + hydro_oper->ResetQuadratureData(); + t += dt; +} + } // namespace mfem diff --git a/serial/laghos_solver.hpp b/serial/laghos_solver.hpp index bdc9b093..e0a2826d 100644 --- a/serial/laghos_solver.hpp +++ b/serial/laghos_solver.hpp @@ -18,7 +18,7 @@ #define MFEM_LAGHOS_SOLVER #include "mfem.hpp" -#include "../laghos_assembly.hpp" +#include "laghos_assembly.hpp" namespace mfem { @@ -26,7 +26,7 @@ namespace mfem namespace hydrodynamics { -/// Visualize the given parallel grid function, using a GLVis server on the +/// Visualize the given grid function, using a GLVis server on the /// specified host and port. Set the visualization window title, and optionally, /// its geometry. void VisualizeField(socketstream &sock, const char *vishost, int visport, @@ -40,13 +40,53 @@ struct TimingData // CG solves (H1 and L2) / force RHS assemblies / quadrature computations. StopWatch sw_cgH1, sw_cgL2, sw_force, sw_qdata; + // Store the number of dofs of the corresponding local CG + const int L2dof; + // These accumulate the total processed dofs or quad points: - // #(CG iterations) for the H1 CG solve. - // #dofs * #(CG iterations) for the L2 CG solve. + // #(CG iterations) for the L2 CG solve. // #quads * #(RK sub steps) for the quadrature data computations. 
- int H1cg_iter, L2dof_iter, quad_tstep; + int H1iter, L2iter; + int quad_tstep; + + TimingData(const int l2d) : + L2dof(l2d), H1iter(0), L2iter(0), quad_tstep(0) { } +}; - TimingData() : H1cg_iter(0), L2dof_iter(0), quad_tstep(0) { } +class QUpdate +{ +private: + const int dim, vdim, NQ, NE, Q1D; + const bool use_viscosity; + const double cfl; + TimingData *timer; + const IntegrationRule &ir; + FiniteElementSpace &H1, &L2; + const Operator *H1R; + Vector q_dt_est, q_e, e_vec, q_dx, q_dv; + const QuadratureInterpolator *q1,*q2; + const GridFunction &gamma_gf; +public: + QUpdate(const int d, const int ne, const int q1d, const bool visc, + const double cfl, TimingData *t, + const GridFunction &gamma_gf, + const IntegrationRule &ir, + FiniteElementSpace &h1, FiniteElementSpace &l2): + dim(d), vdim(h1.GetVDim()), + NQ(ir.GetNPoints()), NE(ne), Q1D(q1d), + use_viscosity(visc), cfl(cfl), + timer(t), ir(ir), H1(h1), L2(l2), + H1R(H1.GetElementRestriction(ElementDofOrdering::LEXICOGRAPHIC)), + q_dt_est(NE*NQ), + q_e(NE*NQ), + e_vec(NQ*NE*vdim), + q_dx(NQ*NE*vdim*vdim), + q_dv(NQ*NE*vdim*vdim), + q1(H1.GetQuadratureInterpolator(ir)), + q2(L2.GetQuadratureInterpolator(ir)), + gamma_gf(gamma_gf) { } + + void UpdateQuadratureData(const Vector &S, QuadratureData &qdata); }; // Given a solutions state (x, v, e), this class performs all necessary @@ -54,60 +94,59 @@ struct TimingData class LagrangianHydroOperator : public TimeDependentOperator { protected: - FiniteElementSpace &H1FESpace, &L2FESpace; - + FiniteElementSpace &H1, &L2; + mutable FiniteElementSpace H1c; + Mesh *mesh; + // FE spaces local and global sizes + const int H1Vsize; + const int H1TVSize; + const int L2Vsize; + const int L2TVSize; + Array block_offsets; // Reference to the current mesh configuration. 
mutable GridFunction x_gf; - - Array &ess_tdofs; - - const int dim, nzones, l2dofs_cnt, h1dofs_cnt, source_type; + const Array &ess_tdofs; + const int dim, NE, l2dofs_cnt, h1dofs_cnt, source_type; const double cfl; const bool use_viscosity, p_assembly; const double cg_rel_tol; const int cg_max_iter; - Coefficient *material_pcf; - + const double ftz_tol; + Coefficient &gamma_coeff; + const GridFunction &gamma_gf; // Velocity mass matrix and local inverses of the energy mass matrices. These // are constant in time, due to the pointwise mass conservation property. mutable BilinearForm Mv; SparseMatrix Mv_spmat_copy; DenseTensor Me, Me_inv; - // Integration rule for all assemblies. - const IntegrationRule &integ_rule; - - // Data associated with each quadrature point in the mesh. These values are - // recomputed at each time step. - mutable QuadratureData quad_data; - mutable bool quad_data_is_current, forcemat_is_assembled; - - // Structures used to perform partial assembly. - Tensors1D tensors1D; - FastEvaluator evaluator; - + const IntegrationRule &ir; + // Data associated with each quadrature point in the mesh. + // These values are recomputed at each time step. + const int Q1D; + mutable QuadratureData qdata; + mutable bool qdata_is_current, forcemat_is_assembled; // Force matrix that combines the kinematic and thermodynamic spaces. It is // assembled in each time step and then it is used to compute the final // right-hand sides for momentum and specific internal energy. mutable MixedBilinearForm Force; - // Same as above, but done through partial assembly. - ForcePAOperator ForcePA; - + ForcePAOperator *ForcePA; // Mass matrices done through partial assembly: // velocity (coupled H1 assembly) and energy (local L2 assemblies). - mutable MassPAOperator VMassPA; - mutable DiagonalSolver VMassPA_prec; - mutable LocalMassPAOperator locEMassPA; - + MassPAOperator *VMassPA, *EMassPA; + OperatorJacobiSmoother *VMassPA_Jprec; // Linear solver for energy. 
- CGSolver locCG; - + CGSolver CG_VMass, CG_EMass; mutable TimingData timer; - - void ComputeMaterialProperties(int nvalues, const double gamma[], - const double rho[], const double e[], - double p[], double cs[]) const + mutable QUpdate qupdate; + mutable Vector X, B, one, rhs, e_rhs; + mutable GridFunction rhs_c_gf, dvc_gf; + mutable Array c_tdofs[3]; + + virtual void ComputeMaterialProperties(int nvalues, const double gamma[], + const double rho[], const double e[], + double p[], double cs[]) const { for (int v = 0; v < nvalues; v++) { @@ -120,37 +159,49 @@ class LagrangianHydroOperator : public TimeDependentOperator void AssembleForceMatrix() const; public: - LagrangianHydroOperator(int size, FiniteElementSpace &h1_fes, + LagrangianHydroOperator(const int size, + FiniteElementSpace &h1_fes, FiniteElementSpace &l2_fes, - Array &essential_tdofs, GridFunction &rho0, - int source_type_, double cfl_, - Coefficient *material_, bool visc, bool pa, - double cgt, int cgiter, int h1_basis_type); + const Array &ess_tdofs, + Coefficient &rho0_coeff, + GridFunction &rho0_gf, + Coefficient &mat_gf_coeff, + GridFunction &gamma_gf, + const int source, + const double cfl, + const bool visc, const bool pa, + const double cgt, const int cgiter, double ftz_tol, + const int order_q); + ~LagrangianHydroOperator(); // Solve for dx_dt, dv_dt and de_dt. virtual void Mult(const Vector &S, Vector &dS_dt) const; + virtual MemoryClass GetMemoryClass() const + { return Device::GetMemoryClass(); } + void SolveVelocity(const Vector &S, Vector &dS_dt) const; void SolveEnergy(const Vector &S, const Vector &v, Vector &dS_dt) const; void UpdateMesh(const Vector &S) const; - // Calls UpdateQuadratureData to compute the new quad_data.dt_estimate. + // Calls UpdateQuadratureData to compute the new qdata.dt_estimate. 
double GetTimeStepEstimate(const Vector &S) const; void ResetTimeStepEstimate() const; - void ResetQuadratureData() const { quad_data_is_current = false; } - - // The density values, which are stored only at some quadrature points, are - // projected as a ParGridFunction. - void ComputeDensity(GridFunction &rho); + void ResetQuadratureData() const { qdata_is_current = false; } + // The density values, which are stored only at some quadrature points, + // are projected as a GridFunction. + void ComputeDensity(GridFunction &rho) const; double InternalEnergy(const GridFunction &e) const; double KineticEnergy(const GridFunction &v) const; - void PrintTimingData(int steps) const; + int GetH1VSize() const { return H1.GetVSize(); } + const Array &GetBlockOffsets() const { return block_offsets; } - int GetH1VSize() const { return H1FESpace.GetVSize(); } + void PrintTimingData(int steps, const bool fom) const; }; +// TaylorCoefficient used in the 2D Taylor-Green problem. class TaylorCoefficient : public Coefficient { virtual double Eval(ElementTransformation &T, @@ -165,6 +216,28 @@ class TaylorCoefficient : public Coefficient } // namespace hydrodynamics +class HydroODESolver : public ODESolver +{ +protected: + hydrodynamics::LagrangianHydroOperator *hydro_oper; +public: + HydroODESolver() : hydro_oper(NULL) { } + virtual void Init(TimeDependentOperator&); + virtual void Step(Vector&, double&, double&) + { MFEM_ABORT("Time stepping is undefined."); } +}; + +class RK2AvgSolver : public HydroODESolver +{ +protected: + Vector V; + BlockVector dS_dt, S0; +public: + RK2AvgSolver() { } + virtual void Init(TimeDependentOperator &_f); + virtual void Step(Vector &S, double &t, double &dt); +}; + } // namespace mfem #endif // MFEM_LAGHOS diff --git a/serial/laghos_timeinteg.cpp b/serial/laghos_timeinteg.cpp deleted file mode 100644 index 04706e29..00000000 --- a/serial/laghos_timeinteg.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, 
LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. - -#include "laghos_timeinteg.hpp" -#include "laghos_solver.hpp" - -using namespace std; - -namespace mfem -{ - -namespace hydrodynamics -{ - -void HydroODESolver::Init(TimeDependentOperator &_f) -{ - ODESolver::Init(_f); - - hydro_oper = dynamic_cast(f); - MFEM_VERIFY(hydro_oper, "HydroSolvers expect LagrangianHydroOperator."); -} - -void RK2AvgSolver::Step(Vector &S, double &t, double &dt) -{ - const int Vsize = hydro_oper->GetH1VSize(); - Vector V(Vsize), dS_dt(S.Size()), S0(S); - - // The monolithic BlockVector stores the unknown fields as follows: - // (Position, Velocity, Specific Internal Energy). - Vector dv_dt, v0, dx_dt; - v0.SetDataAndSize(S0.GetData() + Vsize, Vsize); - dv_dt.SetDataAndSize(dS_dt.GetData() + Vsize, Vsize); - dx_dt.SetDataAndSize(dS_dt.GetData(), Vsize); - - // In each sub-step: - // - Update the global state Vector S. - // - Compute dv_dt using S. - // - Update V using dv_dt. - // - Compute de_dt and dx_dt using S and V. - - // -- 1. - // S is S0. 
- hydro_oper->UpdateMesh(S); - hydro_oper->SolveVelocity(S, dS_dt); - // V = v0 + 0.5 * dt * dv_dt; - add(v0, 0.5 * dt, dv_dt, V); - hydro_oper->SolveEnergy(S, V, dS_dt); - dx_dt = V; - - // -- 2. - // S = S0 + 0.5 * dt * dS_dt; - add(S0, 0.5 * dt, dS_dt, S); - hydro_oper->ResetQuadratureData(); - hydro_oper->UpdateMesh(S); - hydro_oper->SolveVelocity(S, dS_dt); - // V = v0 + 0.5 * dt * dv_dt; - add(v0, 0.5 * dt, dv_dt, V); - hydro_oper->SolveEnergy(S, V, dS_dt); - dx_dt = V; - - // -- 3. - // S = S0 + dt * dS_dt. - add(S0, dt, dS_dt, S); - hydro_oper->ResetQuadratureData(); - - t += dt; -} - -} // namespace hydrodynamics - -} // namespace mfem diff --git a/serial/laghos_timeinteg.hpp b/serial/laghos_timeinteg.hpp deleted file mode 100644 index bb9cbc7e..00000000 --- a/serial/laghos_timeinteg.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at -// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights -// reserved. See files LICENSE and NOTICE for details. -// -// This file is part of CEED, a collection of benchmarks, miniapps, software -// libraries and APIs for efficient high-order finite element and spectral -// element discretizations for exascale applications. For more information and -// source code availability see http://github.com/ceed. -// -// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, -// a collaborative effort of two U.S. Department of Energy organizations (Office -// of Science and the National Nuclear Security Administration) responsible for -// the planning and preparation of a capable exascale ecosystem, including -// software, applications, hardware, advanced system engineering and early -// testbed platforms, in support of the nation's exascale computing imperative. 
- -#ifndef MFEM_LAGHOS_TIMEINTEG -#define MFEM_LAGHOS_TIMEINTEG - -#include "mfem.hpp" - -namespace mfem -{ - -namespace hydrodynamics -{ - -class LagrangianHydroOperator; - -class HydroODESolver : public ODESolver -{ -protected: - LagrangianHydroOperator *hydro_oper; - -public: - HydroODESolver() : hydro_oper(NULL) { } - - virtual void Init(TimeDependentOperator &_f); - - virtual void Step(Vector &S, double &t, double &dt) - { MFEM_ABORT("Time stepping is undefined."); } -}; - -class RK2AvgSolver : public HydroODESolver -{ -public: - RK2AvgSolver() { } - - virtual void Step(Vector &S, double &t, double &dt); -}; - -} // namespace hydrodynamics - -} // namespace mfem - -#endif // MFEM_LAGHOS_TIMEINTEG diff --git a/serial/makefile b/serial/makefile index 3b566121..9fcf3766 100644 --- a/serial/makefile +++ b/serial/makefile @@ -20,6 +20,9 @@ Laghos makefile targets: make make status/info + make test + make tests + make checks make install make clean make distclean @@ -46,117 +49,90 @@ make style endef +NPROC = $(shell getconf _NPROCESSORS_ONLN) +GOALS = help clean distclean style + # Default installation location -PREFIX = ./bin +PREFIX ?= ./bin INSTALL = /usr/bin/install -# Use the MFEM build directory -MFEM_DIR = ../../mfem +# Use the MFEM source, build, or install directory +MFEM_DIR ?= ../../mfem CONFIG_MK = $(MFEM_DIR)/config/config.mk -TEST_MK = $(MFEM_DIR)/config/test.mk -# Use the MFEM install directory -# MFEM_DIR = ../mfem/mfem -# CONFIG_MK = $(MFEM_DIR)/config.mk -# TEST_MK = $(MFEM_DIR)/test.mk - -# Use two relative paths to MFEM: first one for compilation in '.' and second -# one for compilation in 'lib'. -MFEM_DIR1 := $(MFEM_DIR) -MFEM_DIR2 := $(realpath $(MFEM_DIR)) +ifeq ($(wildcard $(CONFIG_MK)),) + CONFIG_MK = $(MFEM_DIR)/share/mfem/config.mk +endif +TEST_MK = $(MFEM_TEST_MK) # Use the compiler used by MFEM. Get the compiler and the options for compiling # and linking from MFEM's config.mk. (Skip this if the target does not require # building.) 
MFEM_LIB_FILE = mfem_is_not_built -ifeq (,$(filter help clean distclean style,$(MAKECMDGOALS))) +ifeq (,$(filter $(GOALS),$(MAKECMDGOALS))) -include $(CONFIG_MK) + ifneq ($(realpath $(MFEM_DIR)),$(MFEM_SOURCE_DIR)) + ifneq ($(realpath $(MFEM_DIR)),$(MFEM_INSTALL_DIR)) + MFEM_BUILD_DIR := $(MFEM_DIR) + override MFEM_DIR := $(MFEM_SOURCE_DIR) + endif + endif endif CXX = $(MFEM_CXX) CPPFLAGS = $(MFEM_CPPFLAGS) CXXFLAGS = $(MFEM_CXXFLAGS) - -# MFEM config does not define C compiler -CC = gcc -CFLAGS = -O3 - -# Optional link flags -LDFLAGS = - -OPTIM_OPTS = -O3 -DEBUG_OPTS = -g -Wall -LAGHOS_DEBUG = $(MFEM_DEBUG) -ifneq ($(LAGHOS_DEBUG),$(MFEM_DEBUG)) - ifeq ($(LAGHOS_DEBUG),YES) - CXXFLAGS = $(DEBUG_OPTS) - else - CXXFLAGS = $(OPTIM_OPTS) - endif -endif - LAGHOS_FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(MFEM_INCFLAGS) -LAGHOS_LIBS = $(MFEM_LIBS) - -ifeq ($(LAGHOS_DEBUG),YES) - LAGHOS_FLAGS += -DLAGHOS_DEBUG -endif +# Extra include dir, needed for now to include headers like "general/forall.hpp" +EXTRA_INC_DIR = $(or $(wildcard $(MFEM_DIR)/include/mfem),$(MFEM_DIR)) +CCC = $(strip $(CXX) $(LAGHOS_FLAGS) $(if $(EXTRA_INC_DIR),-I$(EXTRA_INC_DIR))) +LAGHOS_LIBS = $(MFEM_LIBS) $(MFEM_EXT_LIBS) LIBS = $(strip $(LAGHOS_LIBS) $(LDFLAGS)) -CCC = $(strip $(CXX) $(LAGHOS_FLAGS)) -Ccc = $(strip $(CC) $(CFLAGS) $(GL_OPTS)) -SOURCE_FILES = laghos.cpp laghos_solver.cpp laghos_timeinteg.cpp \ - ../laghos_assembly.cpp -OBJECT_FILES1 = $(SOURCE_FILES:.cpp=.o) -OBJECT_FILES = $(OBJECT_FILES1:.c=.o) -HEADER_FILES = laghos_solver.hpp laghos_timeinteg.hpp \ - ../laghos_assembly.hpp +SOURCE_FILES = $(sort $(wildcard *.cpp)) +HEADER_FILES = $(sort $(wildcard *.hpp)) +OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) # Targets -.PHONY: all clean distclean install status info opt debug test style clean-build clean-exec +.PHONY: all clean distclean install status info opt debug test tests style \ + clean-build clean-exec clean-tests -.SUFFIXES: .c .cpp .o +.SUFFIXES: .cpp .o .cpp.o: cd $( /dev/null 2>&1 && \ + 
$(call COLOR_PRINT,'\033[0;32m',OK,': $(name)\n') || $(call COLOR_PRINT,'\033[1;31m',KO,': $(command)\n'); +endef +# Generate all Laghos checks template targets +$(foreach p, $(problems), $(foreach d, $(dims), $(foreach o, $(optioni), $(foreach r, $(ranks),\ + $(eval $(call laghos_checks_template,$(p),$(d),$(o),$(r))))))) +# Output info on all Laghos checks template targets +#$(foreach p, $(problems), $(foreach d, $(dims), $(foreach o, $(optioni), $(foreach r, $(ranks),\ +# $(info $(call laghos_checks_template,$(p),$(d),$(o),$(r))))))) +checks: laghos +checks: |$(foreach p,$(problems), $(foreach d,$(dims), $(foreach o,$(optioni), $(foreach r,$(ranks), laghos_$(p)_$(d)_$(o)_$(r))))) + +1:;@$(MAKE) -j $(NPROC) checks + +# Laghos run tests +tests: + cat << EOF > RESULTS.dat + ./laghos -p 0 -dim 2 -rs 3 -tf 0.75 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 0 -dim 3 -rs 1 -tf 0.75 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 1 -dim 2 -rs 3 -tf 0.8 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 1 -dim 3 -rs 2 -tf 0.6 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 2 -dim 1 -rs 5 -tf 0.2 -fa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 18 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 3 -m ../data/rectangle01_quad.mesh -rs 2 -tf 3.0 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 3 -m 
../data/box01_hex.mesh -rs 1 -tf 3.0 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 17 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + ./laghos -p 4 -m ../data/square_gresho.mesh -rs 3 -ok 3 \ + -ot 2 -tf 0.62831853 -s 7 -pa -vs 100 | tee RUN.dat + cat RUN.dat | tail -n 20 | head -n 1 | \ + awk '{ printf("step = %04d, dt = %s |e| = %.10e\n", $$2, $$8, $$11); }' >> RESULTS.dat + $(shell cat << EOF > BASELINE.dat) + $(shell echo 'step = 0339, dt = 0.000702, |e| = 4.9695537349e+01' >> BASELINE.dat) + $(shell echo 'step = 1041, dt = 0.000121, |e| = 3.3909635545e+03' >> BASELINE.dat) + $(shell echo 'step = 1154, dt = 0.001655, |e| = 4.6303396053e+01' >> BASELINE.dat) + $(shell echo 'step = 0560, dt = 0.002449, |e| = 1.3408616722e+02' >> BASELINE.dat) + $(shell echo 'step = 0413, dt = 0.000470, |e| = 3.2012077410e+01' >> BASELINE.dat) + $(shell echo 'step = 2872, dt = 0.000064, |e| = 5.6547039096e+01' >> BASELINE.dat) + $(shell echo 'step = 0528, dt = 0.000180, |e| = 5.6505348812e+01' >> BASELINE.dat) + $(shell echo 'step = 0776, dt = 0.000045, |e| = 4.0982431726e+02' >> BASELINE.dat) + diff --report-identical-files RESULTS.dat BASELINE.dat + diff --git a/setupLaghos.sh b/setupLaghos.sh deleted file mode 100644 index 538e8f83..00000000 --- a/setupLaghos.sh +++ /dev/null @@ -1,15 +0,0 @@ -cd ../ && -wget https://computing.llnl.gov/projects/hypre-scalable-linear-solvers-multigrid-methods/download/hypre-2.10.0b.tar.gz && -tar -zxvf hypre-2.10.0b.tar.gz && -mv hypre-2.10.0b hypre && -cd hypre/src/ && -./configure --disable-fortran && -make -j 3 && -cd ../.. && -wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/metis/OLD/metis-4.0.3.tar.gz && -tar -zxvf metis-4.0.3.tar.gz && cd metis-4.0.3 && -make -j 3 && cd .. 
&& -ln -s metis-4.0.3 metis-4.0 && -git clone --recursive https://github.com/mfem/mfem.git && -cd mfem && make parallel -j 3 -cd ../Laghos && make -j 3 diff --git a/timing/bind_ray.sh b/timing/bind_ray.sh deleted file mode 100755 index 813bc329..00000000 --- a/timing/bind_ray.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -#-------------------------------------------------------------------------------- -# mpirun -np $nmpi bind.sh your.exe [args] -# optionally set BIND_BASE in your job script -# optionally set BIND_STRIDE in your job script -# optionally set BIND_POLICY=packed in your job script -# optionally set BIND_CPU_LIST="cpu0 cpu1 ..." in your job script -# Note : for some OpenMP implementations (GNU OpenMP) use mpirun --bind-to none -#-------------------------------------------------------------------------------- -cpus_per_node=`cat /proc/cpuinfo | grep processor | wc -l` - -if [ -z "$OMPI_COMM_WORLD_LOCAL_SIZE" ]; then - - let OMPI_COMM_WORLD_LOCAL_SIZE=1 - - let OMPI_COMM_WORLD_LOCAL_RANK=0 - -fi - -# if OMP_NUM_THREADS is not set, assume no threading and bind with taskset -if [ -z "$OMP_NUM_THREADS" ]; then - - if [ "$OMPI_COMM_WORLD_RANK" == "0" ]; then - echo bind.sh: OMP_NUM_THREADS is not set ... 
assuming one thread - fi - - if [ -z "$BIND_CPU_LIST" ]; then - - if [ -z "$BIND_BASE" ]; then - let BIND_BASE=0 - fi - - if [ -z "$BIND_STRIDE" ]; then - let cpus_per_rank=$cpus_per_node/$OMPI_COMM_WORLD_LOCAL_SIZE - else - let cpus_per_rank=$BIND_STRIDE - fi - - let start_cpu=$BIND_BASE+$OMPI_COMM_WORLD_LOCAL_RANK*$cpus_per_rank - - let stop_cpu=$start_cpu+$cpus_per_rank-1 - - if [ "${BIND_ALL}" == "yes" ]; then - printf -v command "exec taskset -c %d-%d" $start_cpu $stop_cpu - else - printf -v command "exec taskset -c %d" $start_cpu - fi - - $command "$@" - - else - - declare -a cpulist=($BIND_CPU_LIST) - - exec taskset -c ${cpulist[$OMPI_COMM_WORLD_LOCAL_RANK]} "$@" - - fi - -else -#if OMP_NUM_THREADS is set, bind using OMP_PLACES - - if [ -z "$BIND_STRIDE" ]; then - let cpus_required=$OMP_NUM_THREADS*$OMPI_COMM_WORLD_LOCAL_SIZE - let BIND_STRIDE=$cpus_per_node/$cpus_required - fi - - if [ "${BIND_POLICY}" == "packed" ]; then - let cpus_per_rank=$OMP_NUM_THREADS*$BIND_STRIDE - else - let cpus_per_rank=$cpus_per_node/$OMPI_COMM_WORLD_LOCAL_SIZE - fi - - if [ -z "$BIND_BASE" ]; then - let base=0; - else - let base=$BIND_BASE; - fi - - let start_cpu=$base+$OMPI_COMM_WORLD_LOCAL_RANK*$cpus_per_rank - - let stop_cpu=$start_cpu+$OMP_NUM_THREADS*$BIND_STRIDE-1 - - export OMP_PLACES={$start_cpu:$OMP_NUM_THREADS:$BIND_STRIDE} - - export OMP_PROC_BIND=true - - if [ "${BIND_ALL}" == "yes" ]; then - printf -v command "exec taskset -c %d-%d" $start_cpu $stop_cpu - $command "$@" - else - exec "$@" - fi - -fi diff --git a/timing/collect_timings_2D.sh b/timing/collect_timings_2D.sh deleted file mode 100755 index 0d7616d8..00000000 --- a/timing/collect_timings_2D.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -options=( 'pa' 'fa' ) - -parallel_refs=0 -maxL2dof=1000000 -nproc=4 - -outfile=timings_2d -mesh_file=../data/square01_quad.mesh - -calc() { awk "BEGIN{print $*}"; } - -run_case() -{ - # Pass command as all inputs - # Outputs: order refs h1_dofs l2_dofs 
h1_cg_rate l2_cg_rate forces_rate update_quad_rate - - "$@" | tee run.log | awk ' -BEGIN { ref = 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels total rate/ { total_time = $11 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_time) }' -} - -[ -r $outfile ] && cp $outfile $outfile.bak -echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[0]} -echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[1]} -for method in "${options[@]}"; do - for torder in {0..4}; do - for sref in {0..10}; do - nzones=$(( 4**(sref+1) )) - nL2dof=$(( nzones*(torder+1)**2 )) - if (( nproc <= nzones )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $sref"ref" $method $outfile"_"${options[0]} - echo $(run_case mpirun -np $nproc ../laghos -$method -p 1 -tf 0.8 \ - --cg-tol 0 --cg-max-steps 50 \ - --max-steps 10 \ - --mesh $mesh_file \ - --refine-serial $sref \ - --refine-parallel $parallel_refs \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method - fi - done - done -done diff --git a/timing/collect_timings_3D.sh b/timing/collect_timings_3D.sh deleted file mode 100755 index 5ae7b324..00000000 --- a/timing/collect_timings_3D.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -options=( 'pa' 'fa' ) - -parallel_refs=0 -maxL2dof=1000000 -nproc=8 - -outfile=timings_3d -mesh_file=../data/cube01_hex.mesh - -calc() { awk "BEGIN{print $*}"; } - -run_case() -{ - # Pass command as all inputs 
- # Outputs: order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate - - "$@" | tee run.log | awk ' -BEGIN { ref = 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels total rate/ { total_time = $11 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_time) }' -} - -[ -r $outfile ] && cp $outfile $outfile.bak -echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[0]} -echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[1]} -for method in "${options[@]}"; do - for torder in {0..4}; do - for sref in {0..10}; do - nzones=$(( 8**(sref+1) )) - nL2dof=$(( nzones*(torder+1)**3 )) - if (( nproc <= nzones )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $sref"ref" $method $outfile"_"${options[0]} - echo $(run_case mpirun -np $nproc ../laghos -$method -p 1 -tf 0.8 \ - --cg-tol 0 --cg-max-steps 50 \ - --max-steps 10 \ - --mesh $mesh_file \ - --refine-serial $sref \ - --refine-parallel $parallel_refs \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method - fi - done - done -done diff --git a/timing/collect_timings_ray_2D.sh b/timing/collect_timings_ray_2D.sh deleted file mode 100755 index b5ea0119..00000000 --- a/timing/collect_timings_ray_2D.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -options=( 'pa' 'fa' ) - -problem=0 -parallel_refs=0 -maxL2dof=1000000 -nproc=16 - -outfile=timings_2d -mesh_file=../data/square01_quad.mesh - -calc() { awk 
"BEGIN{print $*}"; } - -run_case() -{ - # Pass command as all inputs - # Outputs: order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate - - "$@" | awk ' -BEGIN { refu= 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels/ { total_time = $6 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_time) }' -} - -[ -r $outfile ] && cp $outfile $outfile.bak -echo "# H1order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[0]} -echo "# H1order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[1]} -for method in "${options[@]}"; do - for torder in {0..4}; do - for sref in {0..10}; do - nzones=$(( 4**(sref+1) )) - nL2dof=$(( nzones*(torder+1)**2 )) - if (( nproc <= nzones )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $sref"ref" $method - echo $(run_case mpirun -np $nproc ./bind_ray.sh ../laghos -$method \ - -p $problem -tf 0.5 -cfl 0.05 -vs 1 \ - --cg-tol 0 --cg-max-steps 50 \ - --max-steps 1 \ - --mesh $mesh_file \ - --refine-serial $sref \ - --refine-parallel $parallel_refs \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method - fi - done - done -done diff --git a/timing/collect_timings_ray_3D.sh b/timing/collect_timings_ray_3D.sh deleted file mode 100755 index 2546fc83..00000000 --- a/timing/collect_timings_ray_3D.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -options=( 'pa' 'fa' ) - -problem=0 -parallel_refs=0 -maxL2dof=1000000 -nproc=64 - 
-outfile=timings_3d -mesh_file=../data/cube01_hex.mesh - -calc() { awk "BEGIN{print $*}"; } - -run_case() -{ - # Pass command as all inputs - # Outputs: order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate - - "$@" | awk ' -BEGIN { refu= 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels/ { total_time = $6 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_time) }' -} - -[ -r $outfile ] && cp $outfile $outfile.bak -echo "# H1order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[0]} -echo "# H1order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"${options[1]} -for method in "${options[@]}"; do - for torder in {0..4}; do - for sref in {0..10}; do - nzones=$(( 8**(sref+1) )) - nL2dof=$(( nzones*(torder+1)**3 )) - if (( nproc <= nzones )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $sref"ref" $method - echo $(run_case mpirun -np $nproc ./bind_ray.sh ../laghos -$method \ - -p $problem -tf 0.5 -cfl 0.05 -vs 1 \ - --cg-tol 0 --cg-max-steps 50 \ - --max-steps 1 \ - --mesh $mesh_file \ - --refine-serial $sref \ - --refine-parallel $parallel_refs \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method - fi - done - done -done diff --git a/timing/collect_timings_vulcan_2D.sh b/timing/collect_timings_vulcan_2D.sh deleted file mode 100755 index ae36f5d8..00000000 --- a/timing/collect_timings_vulcan_2D.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -options=( 'pa' 
'fa' ) - -parallel_refs=0 -minL2dof=200 -maxL2dof=1000000 - -# To get Cartesian mesh partitions, use 1/4/16/64/256 ... nodes. -nodes=4 -nproc=$(( 16 * nodes )) - -mesh_file=../data/square01_quad.mesh -outfile=timings_2d - -calc() { awk "BEGIN{print $*}"; } - -run_case() -{ - # Pass command as all inputs - # Outputs: order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate - - "$@" | tee run.log | awk ' -BEGIN { ref = 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels total rate/ { total_time = $11 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_time) }' -} - -for method in "${options[@]}"; do - echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_time" > $outfile"_"$method - for torder in {0..4}; do - for sref in {0..10}; do - nzones=$(( 4**(sref+1) )) - nL2dof=$(( nzones*(torder+1)**2 )) - if (( nproc <= nzones )) && (( nL2dof > minL2dof )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $sref"ref" $method $outfile"_"$method - echo $(run_case srun -n $nproc ../laghos -$method -p 1 -tf 0.8 \ - --cg-tol 0 --cg-max-steps 50 \ - --max-steps 10 \ - --mesh $mesh_file \ - --refine-serial $sref \ - --refine-parallel $parallel_refs \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method - fi - done - done -done diff --git a/timing/collect_timings_vulcan_3D.sh b/timing/collect_timings_vulcan_3D.sh deleted file mode 100755 index ec296fc5..00000000 --- a/timing/collect_timings_vulcan_3D.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash - -# 
Usage: -# ./collect_timings_vulcan_3D part_type nodes - -# To get Cartesian mesh partitions: -# with the 111 partition, use 4/32/256/2048/16384 nodes. -# with the 221 partition, use 2/16/128/1024/8192 nodes. -# with the 211 partition, use 1/ 8/ 64/ 512/4096 nodes. -# -# with the 322 partition, use 6/48/384/3072/24576(full machine) nodes. -# with the 321 partition, use 3/24/192/1536/12288(1/2 machine) nodes. -# with the 311 partition, use 12/ 96/ 768/6144 (1/4 machine) nodes. -# -# with the 432 partition, use 12/ 96/ 768/6144 (1/4 machine) nodes. - -part_type=$1 -nodes=$2 - -# Additional user input -l2orders=(1 2 3) -minL2dof_node=0 -maxL2dof_node=400000 -steps=2 -cg_iter=50 -options=( 'pa' 'fa' ) -# End of user input. - -# Different meshes might be used, depending on -# how many zones each MPI task is expected to start with. -if (( part_type == 111 )) || (( part_type == 211 )) || (( part_type == 221 )); then - nzones0=8 - mesh_file=../data/cube01_hex.mesh -elif (( part_type == 432 )); then - nzones0=24 - mesh_file=../data/cube_24_hex.mesh -elif ((part_type == 322 )) || (( part_type == 321 )) || (( part_type == 311 )); then - nzones0=12 - mesh_file=../data/cube_12_hex.mesh -fi - -# -# The stuff below should not change -# -# Make sure that the serial mesh has at least one zone per task. 
-nproc=$(( 16 * nodes )) -sref=0 -while (( nzones0 * 8**(sref) < nproc )) -do - sref=$(( sref+1 )) -done -echo "sref: "$sref "serial_nzones: "$(( nzones0 * 8**(sref) )) "nproc: "$nproc - -minL2dof=$(( minL2dof_node * nodes )) -maxL2dof=$(( maxL2dof_node * nodes )) - -outfile=timings_3d - -run_case() -{ - # Pass command as all inputs - # Outputs: order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_rate - - "$@" | tee -a run"_"$method"_"$nodes.log | awk ' -BEGIN { ref = 0 } -/--refine-serial/ { ref += $2 } -/--refine-parallel/ { ref += $2 } -/--order/ { order = $2 } -/Number of kinematic/ { h1_dofs = $7 } -/Number of specific internal energy/ { l2_dofs = $7 } -/CG \(H1\) rate/ { h1_cg_rate = $9 } -/CG \(L2\) rate/ { l2_cg_rate = $9 } -/Forces rate/ { forces_rate = $8 } -/UpdateQuadData rate/ { update_quad_rate = $8 } -/Major kernels total rate/ { total_rate = $11 } -END { printf("%d %d %d %d %.8f %.8f %.8f %.8f %.8f\n", order, ref, h1_dofs, l2_dofs, h1_cg_rate, l2_cg_rate, forces_rate, update_quad_rate, total_rate) }' -} - -for method in "${options[@]}"; do - echo "# order refs h1_dofs l2_dofs h1_cg_rate l2_cg_rate forces_rate update_quad_rate total_rate" > $outfile"_"$method"_"$nodes - for torder in ${l2orders[@]}; do - for pref in {0..10}; do - nzones=$(( 8**(pref+sref)*nzones0 )) - nL2dof=$(( nzones*(torder+1)**3 )) - echo "L2dofs: "$nL2dof "maxL2dofs: "$maxL2dof - if (( nproc <= nzones )) && (( nL2dof > minL2dof )) && (( nL2dof < maxL2dof )) ; then - echo "np"$nproc "Q"$((torder+1))"Q"$torder $pref"ref" $method - echo $(run_case srun -n $nproc ../laghos -$method -p 1 -tf 0.8 -pt $part_type \ - --cg-tol 0 --cg-max-steps $cg_iter \ - --max-steps $steps \ - --mesh $mesh_file \ - --refine-serial $sref --refine-parallel $pref \ - --order-thermo $torder \ - --order-kinematic $((torder+1))) >> $outfile"_"$method"_"$nodes - fi - done - done -done diff --git a/timing/rates.py b/timing/rates.py deleted file mode 100644 index 
442c2dba..00000000 --- a/timing/rates.py +++ /dev/null @@ -1,63 +0,0 @@ -#! /usr/bin/env python -# -*- coding: iso-8859-1 -*- - -from pylab import * - -#rc('lines', linestyle=None, marker='.', markersize=3) -rc('legend', fontsize=10) - -txt_pa = loadtxt("timings_3d_pa"); -txt_fa = loadtxt("timings_3d_fa"); -txt_oc = loadtxt("timings_3d_occa"); - -def make_plot(column, label_prefix, line_style, txt, title=None, fig=None): - cm=get_cmap('Set1') # 'Accent', 'Dark2', 'Set1', 'Set2', 'Set3' - if '_segmentdata' in cm.__dict__: - cm_size=len(cm.__dict__['_segmentdata']['red']) - elif 'colors' in cm.__dict__: - cm_size=len(cm.__dict__['colors']) - colors=[cm(1.*i/(cm_size-1)) for i in range(cm_size)] - - if fig is None: - fig = figure(figsize=(10,8)) - ax = fig.gca() - orders = list(set([int(x) for x in txt[:,0]])) - - for i, p in enumerate(orders): - dofs = [] - data = [] - for k in range(txt.shape[0]): - o = txt[k,0] - if o == p: - dofs.append(txt[k, 2]) - data.append(txt[k, 2]/txt[k, column]) - #data.append(1e6*txt[k, column]) - pm1 = p-1 - ax.plot(dofs, data, line_style, label=label_prefix + 'Q' + str(p) + 'Q' + str(p-1), - color=colors[i], linewidth=2) - - ax.grid(True, which='major') - ax.grid(True, which='minor') - ax.legend(loc='best', prop={'size':18}) - ax.set_autoscaley_on(False) - ax.set_autoscalex_on(False) - axis([10, 1e7, 1e4, 3e6]) - #axis([10, 1e7, 1e5, 1e9]) - ax.set_xlabel('H1 DOFs', fontsize=18) - ax.set_xscale('log', basex = 10) - ax.set_ylabel('[DOFs x time steps] / [seconds]', fontsize=18) - ax.set_yscale('log', basex = 10) - xticks(fontsize = 18, rotation = 0) - yticks(fontsize = 18, rotation = 0) - if title is not None: - ax.set_title(title, fontsize=18) - return fig - -f1 = make_plot(8, 'PA: ', 'o-', txt_pa, title='Total Rate') -f2 = make_plot(8, 'FA: ', 'o-', txt_fa, title='Total Rate') -f3 = make_plot(8, 'OCCA: ', 'o-', txt_oc, title='Total Rate') -#f1.savefig('laghos_3D_TT_PA.png', dpi=300, bbox_inches='tight') 
-#f2.savefig('laghos_3D_TT_FA.png', dpi=300, bbox_inches='tight') -#f3.savefig('laghos_3D_TT_OC.png', dpi=300, bbox_inches='tight') - -show()