diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e9ceefb7..93bd9e9ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,15 @@
 - Fix memory leak in mesh BVH ([GH-225](https://github.com/NVIDIA/warp/issues/225))
 - Use C++17 with NVCC when building the Warp library and user kernels
 - Increase PTX target architecture up to `sm_75` (from `sm_70`), enabling Turing ISA features
+- Extended NanoVDB support (see `warp.Volume`):
+  - Add support for data-agnostic index grids, allocation at voxel granularity
+  - New `volume_lookup_index`, `volume_sample_index` and generic `volume_sample`/`volume_lookup`/`volume_store` kernel-level functions
+  - Zero-copy aliasing of in-memory grids, support for multi-grid buffers
+  - Grid introspection and blind data access capabilities
+  - warp.fem can now work directly on NanoVDB grids using `warp.fem.Nanogrid`
+  - Fixed `volume_sample_v` and `volume_store_*` adjoints
+  - Prevent `volume_store` from overwriting grid background values
+- Improve validation of user-provided fields and values in warp.fem
 
 ## [1.1.1] - 2024-05-24
 
diff --git a/docs/modules/fem.rst b/docs/modules/fem.rst
index f7a7f2418..c0911db96 100644
--- a/docs/modules/fem.rst
+++ b/docs/modules/fem.rst
@@ -62,7 +62,7 @@ Basic workflow
 
 The typical steps for solving a linear PDE are as follow:
 
- - Define a :class:`.Geometry` (grid, mesh, etc). At the moment, 2D and 3D regular grids, triangle, quadrilateral, tetrahedron and hexahedron meshes are supported.
+ - Define a :class:`.Geometry` (grid, mesh, etc). At the moment, 2D and 3D regular grids, NanoVDB volumes, and triangle, quadrilateral, tetrahedron and hexahedron unstructured meshes are supported.
  - Define one or more :class:`.FunctionSpace`, by equipping the geometry elements with shape functions. See :func:`.make_polynomial_space`. At the moment, continuous/discontinuous Lagrange (:math:`P_{k[d]}, Q_{k[d]}`) and Serendipity (:math:`S_k`) shape functions of order :math:`k \leq 3` are supported.
  - Define an integration :class:`.GeometryDomain`, for instance the geometry's cells (:class:`.Cells`) or boundary sides (:class:`.BoundarySides`).
  - Integrate linear forms to build the system's right-hand-side. Define a test function over the function space using :func:`.make_test`,
@@ -119,8 +119,8 @@ Introductory examples
  - ``example_diffusion.py``: 2D diffusion with homogeneous Neumann and Dirichlet boundary conditions
      * ``example_diffusion_3d.py``: 3D variant of the diffusion problem
  - ``example_convection_diffusion.py``: 2D convection-diffusion using semi-Lagrangian advection
-     * ``example_diffusion_dg0.py``: 2D convection-diffusion using finite-volume and upwind transport
-     * ``example_diffusion_dg.py``: 2D convection-diffusion using Discontinuous Galerkin with upwind transport and Symmetric Interior Penalty
+     * ``example_convection_diffusion_dg.py``: 2D convection-diffusion using Discontinuous Galerkin with upwind transport and Symmetric Interior Penalty
+ - ``example_burgers.py``: 2D inviscid Burgers using Discontinuous Galerkin with upwind transport and slope limiter
  - ``example_stokes.py``: 2D incompressible Stokes flow using mixed :math:`P_k/P_{k-1}` or :math:`Q_k/P_{(k-1)d}` elements
  - ``example_navier_stokes.py``: 2D Navier-Stokes flow using mixed :math:`P_k/P_{k-1}` elements
  - ``example_mixed_elasticity.py``: 2D linear elasticity using mixed continuous/discontinuous :math:`S_k/P_{(k-1)d}` elements
@@ -250,6 +250,9 @@ Geometry
 .. autoclass:: Hexmesh
    :show-inheritance:
 
+.. autoclass:: Nanogrid
+   :show-inheritance:
+
 .. autoclass:: LinearGeometryPartition
 
 .. autoclass:: ExplicitGeometryPartition
diff --git a/warp/examples/fem/example_apic_fluid.py b/warp/examples/fem/example_apic_fluid.py
index 3ec9a0765..f153a6104 100644
--- a/warp/examples/fem/example_apic_fluid.py
+++ b/warp/examples/fem/example_apic_fluid.py
@@ -8,7 +8,8 @@
 ###########################################################################
 # Example APIC Fluid Simulation
 #
-# Shows how to implement a apic fluid simulation.
+# Shows how to implement a minimalist APIC fluid simulation using a NanoVDB
+# grid and the PicQuadrature class.
 ###########################################################################
 
 import numpy as np
@@ -16,9 +17,9 @@
 import warp as wp
 import warp.fem as fem
 import warp.sim.render
-from warp.fem import Domain, Field, Sample, at_node, div, grad, integrand, lookup, normal
+from warp.fem import Domain, Field, Sample, at_node, div, grad, integrand
 from warp.sim import Model, State
-from warp.sparse import BsrMatrix, bsr_copy, bsr_mm, bsr_mv, bsr_transposed, bsr_zeros
+from warp.sparse import BsrMatrix, bsr_mm, bsr_mv, bsr_transposed
 
 try:
     from .bsr_utils import bsr_cg
@@ -26,6 +27,14 @@
     from bsr_utils import bsr_cg
 
 
+@wp.func
+def collision_sdf(x: wp.vec3):
+    # Arbitrary sdf representing collision geometry
+    # Here an inverted half-ball of radius 10
+    x[1] = wp.min(x[1], 0.0)
+    return 10.0 - wp.length(x), -wp.normalize(x)
+
+
 @integrand
 def integrate_fraction(s: Sample, phi: Field):
     return phi(s)
@@ -46,6 +55,13 @@ def integrate_velocity(
     vel_apic = velocities[s.qp_index] + velocity_gradients[s.qp_index] * node_offset
 
     vel_adv = vel_apic + dt * gravity
+
+    # if inside collider, remove normal velocity
+    sdf, sdf_gradient = collision_sdf(domain(s))
+    if sdf <= 0:
+        v_n = wp.dot(vel_adv, sdf_gradient)
+        vel_adv -= wp.max(v_n, 0.0) * sdf_gradient
+
     return wp.dot(u(s), vel_adv)
 
 
@@ -61,31 +77,33 @@ def update_particles(
     vel_grad: wp.array(dtype=wp.mat33),
 ):
     """Read particle velocity from grid and advect positions"""
-    vel[s.qp_index] = grid_vel(s)
+    p_vel = grid_vel(s)
     vel_grad[s.qp_index] = grad(grid_vel, s)
 
-    pos_adv = pos_prev[s.qp_index] + dt * vel[s.qp_index]
+    pos_adv = pos_prev[s.qp_index] + dt * p_vel
 
-    # Project onto domain
-    pos_proj = domain(lookup(domain, pos_adv))
-    pos[s.qp_index] = pos_proj
+    pos[s.qp_index] = pos_adv
+    vel[s.qp_index] = p_vel
 
 
 @integrand
 def velocity_boundary_projector_form(s: Sample, domain: Domain, u: Field, v: Field):
     """Projector for velocity-Dirichlet boundary conditions"""
 
-    n = normal(domain, s)
-    if n[1] > 0.0:
-        # Neuman  on top
+    x = domain(s)
+    sdf, sdf_normal = collision_sdf(x)
+
+    if sdf > 0.0:
+        # Neumann
         return 0.0
 
-    # Free-slip on other sides
-    return wp.dot(u(s), n) * wp.dot(v(s), n)
+    # Free-slip on boundary
+    return wp.dot(u(s), sdf_normal) * wp.dot(v(s), sdf_normal)
 
 
 @integrand
-def divergence_form(s: Sample, u: Field, psi: Field):
+def divergence_form(s: Sample, domain: Domain, u: Field, psi: Field):
+    # Divergence bilinear form
     return div(u, s) * psi(s)
 
 
@@ -93,10 +111,7 @@ def divergence_form(s: Sample, u: Field, psi: Field):
 def invert_volume_kernel(values: wp.array(dtype=float)):
     i = wp.tid()
     m = values[i]
-    if m <= 1.0e-8:
-        values[i] = 0.0
-    else:
-        values[i] = 1.0 / m
+    values[i] = wp.select(m == 0.0, 1.0 / m, 0.0)
 
 
 @wp.kernel
@@ -115,6 +130,8 @@ def scale_transposed_divergence_mat(
     tr_divergence_mat_values: wp.array(dtype=wp.mat(shape=(3, 1), dtype=float)),
     inv_fraction_int: wp.array(dtype=float),
 ):
+    # In-place scaling of gradient operator rows with inverse mass
+
     u_i = wp.tid()
     block_beg = tr_divergence_mat_offsets[u_i]
     block_end = tr_divergence_mat_offsets[u_i + 1]
@@ -123,6 +140,15 @@ def scale_transposed_divergence_mat(
         tr_divergence_mat_values[b] = tr_divergence_mat_values[b] * inv_fraction_int[u_i]
 
 
+@wp.kernel
+def compute_particle_ijk(positions: wp.array(dtype=wp.vec3), voxel_size: float, ijks: wp.array(dtype=wp.vec3i)):
+    # Index-space coordinates of grid cell containing each particle
+
+    p = wp.tid()
+    pos = positions[p] / voxel_size
+    ijks[p] = wp.vec3i(int(wp.floor(pos[0])), int(wp.floor(pos[1])), int(wp.floor(pos[2])))
+
+
 def solve_incompressibility(divergence_mat: BsrMatrix, inv_volume, pressure, velocity, quiet: bool = False):
     """Solve for divergence-free velocity delta:
 
@@ -142,43 +168,45 @@ def solve_incompressibility(divergence_mat: BsrMatrix, inv_volume, pressure, vel
         ],
     )
 
-    # For simplicity, assemble schur complement and solve with CG
+    # For simplicity, assemble Schur complement and solve with CG
     schur = bsr_mm(divergence_mat, transposed_divergence_mat)
 
     rhs = wp.zeros_like(pressure)
     bsr_mv(A=divergence_mat, x=velocity, y=rhs, alpha=-1.0, beta=0.0)
-    bsr_cg(schur, b=rhs, x=pressure, quiet=quiet)
+    bsr_cg(schur, b=rhs, x=pressure, quiet=quiet, tol=1.0e-6)
 
     # Apply pressure to velocity
     bsr_mv(A=transposed_divergence_mat, x=pressure, y=velocity, alpha=1.0, beta=1.0)
 
 
 class Example:
-    def __init__(self, quiet=False, stage_path="example_apic_fluid.usd", res=(32, 64, 16)):
+    def __init__(self, quiet=False, stage_path="example_apic_fluid.usd", voxel_size=1.0):
         fps = 60
         self.frame_dt = 1.0 / fps
         self.current_frame = 0
 
         self.sim_substeps = 1
         self.sim_dt = self.frame_dt / self.sim_substeps
+        self.voxel_size = voxel_size
 
         self._quiet = quiet
 
-        # grid dimensions and particle emission
-        grid_res = np.array(res, dtype=int)
-        particle_fill_frac = np.array([0.5, 0.5, 1.0])
-        grid_lo = wp.vec3(0.0)
-        grid_hi = wp.vec3(50, 100, 25)
+        # particle emission
+        particle_grid_lo = wp.vec3(-5)
+        particle_grid_hi = wp.vec3(5)
 
-        grid_cell_size = np.array(grid_hi - grid_lo) / grid_res
+        grid_cell_size = voxel_size
         grid_cell_volume = np.prod(grid_cell_size)
 
-        PARTICLES_PER_CELL_DIM = 3
+        PARTICLES_PER_CELL_DIM = 2
         self.radius = float(np.max(grid_cell_size) / (2 * PARTICLES_PER_CELL_DIM))
 
-        particle_grid_res = np.array(particle_fill_frac * grid_res * PARTICLES_PER_CELL_DIM, dtype=int)
+        particle_grid_res = (
+            np.array((particle_grid_hi - particle_grid_lo) / voxel_size, dtype=int) * PARTICLES_PER_CELL_DIM
+        )
         particle_grid_offset = wp.vec3(self.radius, self.radius, self.radius)
 
+        # Initialize warp.sim model, spawn particles
         np.random.seed(0)
         builder = wp.sim.ModelBuilder()
         builder.add_particle_grid(
@@ -188,60 +216,17 @@ def __init__(self, quiet=False, stage_path="example_apic_fluid.usd", res=(32, 64
             cell_x=self.radius * 2.0,
             cell_y=self.radius * 2.0,
             cell_z=self.radius * 2.0,
-            pos=wp.vec3(0.0, 0.0, 0.0) + particle_grid_offset,
+            pos=particle_grid_lo + particle_grid_offset,
             rot=wp.quat_identity(),
             vel=wp.vec3(0.0, 0.0, 0.0),
             mass=grid_cell_volume / PARTICLES_PER_CELL_DIM**3,
             jitter=self.radius * 1.0,
-            radius_mean=self.radius,
-        )
-
-        self.grid = fem.Grid3D(wp.vec3i(grid_res), grid_lo, grid_hi)
-
-        # Function spaces
-        self.velocity_space = fem.make_polynomial_space(self.grid, dtype=wp.vec3, degree=1)
-        self.fraction_space = fem.make_polynomial_space(self.grid, dtype=float, degree=1)
-        self.strain_space = fem.make_polynomial_space(
-            self.grid,
-            dtype=float,
-            degree=0,
-        )
-
-        self.pressure_field = self.strain_space.make_field()
-        self.velocity_field = self.velocity_space.make_field()
-
-        # Test and trial functions
-        self.domain = fem.Cells(self.grid)
-        self.velocity_test = fem.make_test(self.velocity_space, domain=self.domain)
-        self.velocity_trial = fem.make_trial(self.velocity_space, domain=self.domain)
-        self.fraction_test = fem.make_test(self.fraction_space, domain=self.domain)
-        self.strain_test = fem.make_test(self.strain_space, domain=self.domain)
-        self.strain_trial = fem.make_trial(self.strain_space, domain=self.domain)
-
-        # Enforcing the Dirichlet boundary condition the hard way;
-        # build projector for velocity left- and right-hand-sides
-        boundary = fem.BoundarySides(self.grid)
-        u_bd_test = fem.make_test(space=self.velocity_space, domain=boundary)
-        u_bd_trial = fem.make_trial(space=self.velocity_space, domain=boundary)
-        u_bd_projector = fem.integrate(
-            velocity_boundary_projector_form, fields={"u": u_bd_trial, "v": u_bd_test}, nodal=True, output_dtype=float
         )
-
-        fem.normalize_dirichlet_projector(u_bd_projector)
-        self.vel_bd_projector = u_bd_projector
+        self.model: Model = builder.finalize()
 
         # Storage for temporary variables
         self.temporary_store = fem.TemporaryStore()
 
-        self._divergence_matrix = bsr_zeros(
-            self.strain_space.node_count(),
-            self.velocity_space.node_count(),
-            block_type=wp.mat(shape=(1, 3), dtype=float),
-        )
-
-        # Warp.sim model
-        self.model: Model = builder.finalize()
-
         if not self._quiet:
             print("Particle count:", self.model.particle_count)
 
@@ -263,79 +248,122 @@ def step(self):
         fem.set_default_temporary_store(self.temporary_store)
 
         self.current_frame = self.current_frame + 1
+
+        particle_ijk = wp.empty(self.state_0.particle_count, dtype=wp.vec3i)
+
         with wp.ScopedTimer(f"simulate frame {self.current_frame}", active=True):
             for _s in range(self.sim_substeps):
-                # Bin particles to grid cells
-                pic = fem.PicQuadrature(
-                    domain=fem.Cells(self.grid), positions=self.state_0.particle_q, measures=self.model.particle_mass
+                # Compute the voxel coordinates for each particle.
+                # `Volume.allocate_by_voxels` accepts world positions, but allocates
+                # the voxels with the closest origin rather than the enclosing ones
+                # (i.e., it "round"s the positions, while here we want to "floor" them)
+                wp.launch(
+                    compute_particle_ijk,
+                    dim=particle_ijk.shape,
+                    inputs=[self.state_0.particle_q, self.voxel_size, particle_ijk],
+                )
+
+                # Allocate the voxels and create the warp.fem geometry
+                volume = wp.Volume.allocate_by_voxels(
+                    voxel_points=particle_ijk,
+                    voxel_size=self.voxel_size,
+                )
+                grid = fem.Nanogrid(volume)
+
+                # Define function spaces: linear (Q1) for velocity and volume fraction,
+                # piecewise-constant for pressure
+                linear_basis_space = fem.make_polynomial_basis_space(grid, degree=1)
+                velocity_space = fem.make_collocated_function_space(linear_basis_space, dtype=wp.vec3)
+                fraction_space = fem.make_collocated_function_space(linear_basis_space, dtype=float)
+                strain_space = fem.make_polynomial_space(
+                    grid,
+                    dtype=float,
+                    degree=0,
+                    discontinuous=True,
                 )
 
-                # Borrow some temporary arrays for storing integration results
-                inv_volume_temporary = fem.borrow_temporary(
-                    self.temporary_store, shape=(self.fraction_space.node_count()), dtype=float
+                pressure_field = strain_space.make_field()
+                velocity_field = velocity_space.make_field()
+
+                # Define test and trial functions for integrating linear and bilinear forms
+                domain = fem.Cells(grid)
+                velocity_test = fem.make_test(velocity_space, domain=domain)
+                velocity_trial = fem.make_trial(velocity_space, domain=domain)
+                fraction_test = fem.make_test(fraction_space, domain=domain)
+                strain_test = fem.make_test(strain_space, domain=domain)
+
+                # Build projector for Dirichlet boundary conditions
+                vel_projector = fem.integrate(
+                    velocity_boundary_projector_form,
+                    fields={"u": velocity_trial, "v": velocity_test},
+                    nodal=True,
+                    output_dtype=float,
                 )
-                velocity_int_temporary = fem.borrow_temporary(
-                    self.temporary_store, shape=(self.velocity_space.node_count()), dtype=wp.vec3
+                fem.normalize_dirichlet_projector(vel_projector)
+
+                # Bin particles to grid cells
+                pic = fem.PicQuadrature(
+                    domain=domain, positions=self.state_0.particle_q, measures=self.model.particle_mass
                 )
-                inv_volume = inv_volume_temporary.array
-                velocity_int = velocity_int_temporary.array
 
-                # Inverse volume fraction
-                fem.integrate(
+                # Compute inverse particle volume for each grid node
+                inv_volume = fem.integrate(
                     integrate_fraction,
                     quadrature=pic,
-                    fields={"phi": self.fraction_test},
-                    accumulate_dtype=float,
-                    output=inv_volume,
+                    fields={"phi": fraction_test},
+                    output_dtype=float,
                 )
                 wp.launch(kernel=invert_volume_kernel, dim=inv_volume.shape, inputs=[inv_volume])
 
                 # Velocity right-hand side
-                fem.integrate(
+                velocity_int = fem.integrate(
                     integrate_velocity,
                     quadrature=pic,
-                    fields={"u": self.velocity_test},
+                    fields={"u": velocity_test},
                     values={
                         "velocities": self.state_0.particle_qd,
                         "velocity_gradients": self.state_0.particle_qd_grad,
                         "dt": self.sim_dt,
                         "gravity": self.model.gravity,
                     },
-                    accumulate_dtype=float,
-                    output=velocity_int,
+                    output_dtype=wp.vec3,
                 )
 
                 # Compute constraint-free velocity
                 wp.launch(
                     kernel=scalar_vector_multiply,
                     dim=inv_volume.shape[0],
-                    inputs=[inv_volume, velocity_int, self.velocity_field.dof_values],
+                    inputs=[inv_volume, velocity_int, velocity_field.dof_values],
                 )
 
                 # Apply velocity boundary conditions:
-                # velocity -= vel_bd_projector * velocity
-                wp.copy(src=self.velocity_field.dof_values, dest=velocity_int)
-                bsr_mv(A=self.vel_bd_projector, x=velocity_int, y=self.velocity_field.dof_values, alpha=-1.0, beta=1.0)
+                # velocity -= vel_projector * velocity
+                bsr_mv(
+                    A=vel_projector,
+                    x=velocity_field.dof_values,
+                    y=velocity_field.dof_values,
+                    alpha=-1.0,
+                    beta=1.0,
+                )
 
-                # Divergence matrix
-                fem.integrate(
+                # Assemble divergence operator matrix
+                divergence_matrix = fem.integrate(
                     divergence_form,
                     quadrature=pic,
-                    fields={"u": self.velocity_trial, "psi": self.strain_test},
-                    accumulate_dtype=float,
-                    output=self._divergence_matrix,
+                    fields={"u": velocity_trial, "psi": strain_test},
+                    output_dtype=float,
                 )
 
                 # Project matrix to enforce boundary conditions
-                divergence_mat_tmp = bsr_copy(self._divergence_matrix)
-                bsr_mm(alpha=-1.0, x=divergence_mat_tmp, y=self.vel_bd_projector, z=self._divergence_matrix, beta=1.0)
+                # divergence_matrix -= divergence_matrix * vel_projector
+                bsr_mm(alpha=-1.0, x=divergence_matrix, y=vel_projector, z=divergence_matrix, beta=1.0)
 
                 # Solve unilateral incompressibility
                 solve_incompressibility(
-                    self._divergence_matrix,
+                    divergence_matrix,
                     inv_volume,
-                    self.pressure_field.dof_values,
-                    self.velocity_field.dof_values,
+                    pressure_field.dof_values,
+                    velocity_field.dof_values,
                     quiet=self._quiet,
                 )
 
@@ -350,7 +378,7 @@ def step(self):
                         "vel_grad": self.state_1.particle_qd_grad,
                         "dt": self.sim_dt,
                     },
-                    fields={"grid_vel": self.velocity_field},
+                    fields={"grid_vel": velocity_field},
                 )
 
                 # swap states
@@ -383,19 +411,18 @@ def render(self, is_live=False):
         default="example_apic_fluid.usd",
         help="Path to the output USD file.",
     )
-    parser.add_argument("--num_frames", type=int, default=1000, help="Total number of frames.")
+    parser.add_argument("--num_frames", type=int, default=250, help="Total number of frames.")
     parser.add_argument("--quiet", action="store_true")
     parser.add_argument(
-        "--res",
-        type=lambda s: [int(item) for item in s.split(",")],
-        default="32,64,16",
-        help="Delimited list specifying resolution in x, y, and z.",
+        "--voxel_size",
+        type=float,
+        default=0.25,
     )
 
     args = parser.parse_known_args()[0]
 
     with wp.ScopedDevice(args.device):
-        example = Example(quiet=args.quiet, stage_path=args.stage_path, res=args.res)
+        example = Example(quiet=args.quiet, stage_path=args.stage_path, voxel_size=args.voxel_size)
 
         for _ in range(args.num_frames):
             example.step()
diff --git a/warp/examples/fem/example_burgers.py b/warp/examples/fem/example_burgers.py
new file mode 100644
index 000000000..3f9199ad3
--- /dev/null
+++ b/warp/examples/fem/example_burgers.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+###########################################################################
+# Example Burgers
+#
+# This example simulates an inviscid non-conservative Burgers PDE using
+# Discontinuous Galerkin with minmod slope limiter
+#
+# d u /dt + (u . grad) u = 0
+#
+###########################################################################
+
+import warp as wp
+import warp.fem as fem
+import warp.sparse as sp
+
+# Import example utilities
+# Make sure that works both when imported as module and run as standalone file
+try:
+    from .bsr_utils import invert_diagonal_bsr_mass_matrix
+    from .plot_utils import Plot
+except ImportError:
+    from bsr_utils import invert_diagonal_bsr_mass_matrix
+    from plot_utils import Plot
+
+
+@fem.integrand
+def vel_mass_form(
+    s: fem.Sample,
+    u: fem.Field,
+    v: fem.Field,
+):
+    return wp.dot(v(s), u(s))
+
+
+@fem.integrand
+def upwind_transport_form(s: fem.Sample, domain: fem.Domain, u: fem.Field, v: fem.Field, w: fem.Field):
+    # Upwinding transport with discontinuous convection velocity,
+    # using jump(w v) = jump(w) avg(v) + avg(w) jump(v)
+
+    nor = fem.normal(domain, s)
+    w_avg_n = wp.dot(fem.average(w, s), nor)
+    w_jump_n = wp.dot(fem.jump(w, s), nor)
+
+    x = domain(s)
+    v_avg = fem.average(v, s)
+    if x[0] <= 0.0 or x[0] >= 1.0:  # out
+        # if x[0] >= 1.0:  # out
+        v_jump = v(s)
+    else:
+        v_jump = fem.jump(v, s)
+
+    u_avg = fem.average(u, s)
+    u_jump = fem.jump(u, s)
+
+    return wp.dot(u_avg, v_jump * w_avg_n + v_avg * w_jump_n) + 0.5 * wp.dot(v_jump, u_jump) * (
+        wp.abs(w_avg_n) + 0.5 * wp.abs(w_jump_n)
+    )
+
+
+@fem.integrand
+def cell_transport_form(s: fem.Sample, domain: fem.Domain, u: fem.Field, v: fem.Field, w: fem.Field):
+    # ((w . grad) u) . v = v^T (grad u) w =  grad(u) : (v w^T)
+    # with integration by parts
+    # u . Div (w v^T) = u^T grad(v) w  + u^T v div (w)
+
+    return -wp.dot(fem.div(w, s) * v(s) + fem.grad(v, s) * w(s), u(s))
+
+
+@fem.integrand
+def initial_condition(s: fem.Sample, domain: fem.Domain):
+    x = domain(s)[0] * 2.0
+    wave = wp.sin(x * wp.pi)
+    return wp.vec2(wp.select(x <= 1.0, 0.0, wave), 0.0)
+
+
+@fem.integrand
+def velocity_norm(s: fem.Sample, u: fem.Field):
+    return wp.length(u(s))
+
+
+@wp.func
+def minmod(a: float, b: float):
+    sa = wp.sign(a)
+    sb = wp.sign(b)
+    return wp.select(sa == sb, 0.0, sa * wp.min(wp.abs(a), wp.abs(b)))
+
+
+@fem.integrand
+def slope_limiter(domain: fem.Domain, s: fem.Sample, u: fem.Field, dx: wp.vec2):
+    # Minmod slope limiter against P0 discretization (evaluation at cell centers)
+    # Assumes regular grid topology
+
+    center_coords = fem.Coords(0.5, 0.5, 0.0)
+    cell_center = fem.types.make_free_sample(s.element_index, center_coords)
+    center_pos = domain(cell_center)
+
+    u_center = u(cell_center)
+
+    delta_coords = s.element_coords - center_coords
+
+    neighbour_xp = fem.lookup(domain, center_pos + wp.vec2(dx[0], 0.0))
+    neighbour_yp = fem.lookup(domain, center_pos + wp.vec2(0.0, dx[1]))
+    neighbour_xm = fem.lookup(domain, center_pos - wp.vec2(dx[0], 0.0))
+    neighbour_ym = fem.lookup(domain, center_pos - wp.vec2(0.0, dx[1]))
+
+    u_nxp = u(neighbour_xp)
+    u_nyp = u(neighbour_yp)
+    u_nxm = u(neighbour_xm)
+    u_nym = u(neighbour_ym)
+
+    gx = minmod(u_nxp[0] - u_center[0], u_center[0] - u_nxm[0]) * delta_coords[0]
+    gy = minmod(u_nyp[1] - u_center[1], u_center[1] - u_nym[1]) * delta_coords[1]
+
+    delta_u = u(s) - u_center
+    return u_center + wp.vec2(minmod(gx, delta_u[0]), minmod(gy, delta_u[1]))
+
+
+class Example:
+    def __init__(self, quiet=False, resolution=50, degree=1):
+        self._quiet = quiet
+
+        res = resolution
+        self.sim_dt = 1.0 / res
+        self.current_frame = 0
+
+        geo = fem.Grid2D(res=wp.vec2i(resolution))
+
+        domain = fem.Cells(geometry=geo)
+        sides = fem.Sides(geo)
+
+        basis_space = fem.make_polynomial_basis_space(geo, degree=degree, discontinuous=True)
+        vector_space = fem.make_collocated_function_space(basis_space, dtype=wp.vec2)
+        scalar_space = fem.make_collocated_function_space(basis_space, dtype=float)
+
+        # Test function for our vector space
+        self._test = fem.make_test(space=vector_space, domain=domain)
+        # Test function for integration on sides
+        self._side_test = fem.make_test(space=vector_space, domain=sides)
+
+        # Inertia matrix
+        # For simplicity, use nodal integration so that inertia matrix is diagonal
+        trial = fem.make_trial(space=vector_space, domain=domain)
+        matrix_inertia = fem.integrate(
+            vel_mass_form, fields={"u": trial, "v": self._test}, output_dtype=wp.float32, nodal=True
+        )
+        self._inv_mass_matrix = sp.bsr_copy(matrix_inertia)
+        invert_diagonal_bsr_mass_matrix(self._inv_mass_matrix)
+
+        # Initial condition
+        self.velocity_field = vector_space.make_field()
+        fem.interpolate(initial_condition, dest=self.velocity_field)
+
+        # Velocity norm field -- for visualization purposes
+        self.velocity_norm_field = scalar_space.make_field()
+        fem.interpolate(velocity_norm, dest=self.velocity_norm_field, fields={"u": self.velocity_field})
+
+        self.renderer = Plot()
+        self.renderer.add_surface("u_norm", self.velocity_norm_field)
+
+    def _velocity_delta(self, trial_velocity):
+        # Integration on sides
+        rhs = fem.integrate(
+            upwind_transport_form,
+            fields={"u": trial_velocity.trace(), "v": self._side_test, "w": trial_velocity.trace()},
+            output_dtype=wp.vec2,
+        )
+
+        if self.velocity_field.space.degree > 0:
+            # Integration on cells (if not piecewise-constant)
+            fem.utils.array_axpy(
+                x=fem.integrate(
+                    cell_transport_form,
+                    fields={"u": trial_velocity, "v": self._test, "w": trial_velocity},
+                    output_dtype=wp.vec2,
+                    quadrature=fem.RegularQuadrature(
+                        order=3, domain=self._test.domain, family=fem.Polynomial.LOBATTO_GAUSS_LEGENDRE
+                    ),
+                ),
+                y=rhs,
+                alpha=1.0,
+                beta=1.0,
+            )
+        return sp.bsr_mv(self._inv_mass_matrix, rhs)
+
+    def step(self):
+        self.current_frame += 1
+
+        # Third-order Strong Stability Preserving Runge-Kutta (SSPRK3)
+
+        k1 = self._velocity_delta(self.velocity_field)
+
+        # tmp = v0 - dt * k1
+        tmp = self.velocity_field.space.make_field()
+        fem.utils.array_axpy(y=tmp.dof_values, x=self.velocity_field.dof_values, alpha=1.0, beta=0.0)
+        fem.utils.array_axpy(y=tmp.dof_values, x=k1, alpha=-self.sim_dt, beta=1.0)
+        k2 = self._velocity_delta(tmp)
+
+        # tmp = v0 - dt * (0.25 * k1 + 0.25 * k2)
+        fem.utils.array_axpy(y=tmp.dof_values, x=k1, alpha=0.75 * self.sim_dt, beta=1.0)
+        fem.utils.array_axpy(y=tmp.dof_values, x=k2, alpha=-0.25 * self.sim_dt, beta=1.0)
+        k3 = self._velocity_delta(tmp)
+
+        # v = v0 - dt * (1/6 * k1 + 1/6 * k2 + 2/3 * k3)
+        fem.utils.array_axpy(y=self.velocity_field.dof_values, x=k1, alpha=-1.0 / 6.0 * self.sim_dt, beta=1.0)
+        fem.utils.array_axpy(y=self.velocity_field.dof_values, x=k2, alpha=-1.0 / 6.0 * self.sim_dt, beta=1.0)
+        fem.utils.array_axpy(y=self.velocity_field.dof_values, x=k3, alpha=-2.0 / 3.0 * self.sim_dt, beta=1.0)
+
+        # Apply slope limiter
+        if self.velocity_field.space.degree > 0:
+            res = self.velocity_field.space.geometry.res
+            dx = wp.vec2(1.0 / res[0], 1.0 / res[1])
+            fem.interpolate(slope_limiter, dest=tmp, fields={"u": self.velocity_field}, values={"dx": dx})
+            wp.copy(src=tmp.dof_values, dest=self.velocity_field.dof_values)
+
+        # Update velocity norm (for visualization)
+        fem.interpolate(velocity_norm, dest=self.velocity_norm_field, fields={"u": self.velocity_field})
+
+    def render(self):
+        self.renderer.begin_frame(time=self.current_frame * self.sim_dt)
+        self.renderer.add_surface("u_norm", self.velocity_norm_field)
+        self.renderer.end_frame()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    wp.set_module_options({"enable_backward": False})
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
+    parser.add_argument("--resolution", type=int, default=50, help="Grid resolution.")
+    parser.add_argument("--num_frames", type=int, default=250, help="Total number of frames.")
+    parser.add_argument("--degree", choices=(0, 1), type=int, default=1, help="Discretization order.")
+    parser.add_argument(
+        "--headless",
+        action="store_true",
+        help="Run in headless mode, suppressing the opening of any graphical windows.",
+    )
+    parser.add_argument("--quiet", action="store_true")
+
+    args = parser.parse_known_args()[0]
+
+    with wp.ScopedDevice(args.device):
+        example = Example(
+            quiet=args.quiet,
+            resolution=args.resolution,
+            degree=args.degree,
+        )
+
+        for k in range(args.num_frames):
+            print(f"Frame {k}:")
+            example.step()
+            example.render()
+
+        if not args.headless:
+            example.renderer.plot()
diff --git a/warp/examples/fem/example_convection_diffusion_dg0.py b/warp/examples/fem/example_convection_diffusion_dg0.py
deleted file mode 100644
index 6531a9edc..000000000
--- a/warp/examples/fem/example_convection_diffusion_dg0.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
-# NVIDIA CORPORATION and its licensors retain all intellectual property
-# and proprietary rights in and to this software, related documentation
-# and any modifications thereto.  Any use, reproduction, disclosure or
-# distribution of this software and related documentation without an express
-# license agreement from NVIDIA CORPORATION is strictly prohibited.
-
-###########################################################################
-# Example Convection Diffusion DG0
-#
-# This example simulates a convection-diffusion PDE using
-# FVM with upwind transport
-#
-# D phi / dt + nu Div f = 0
-# f = grad phi
-###########################################################################
-
-import warp as wp
-import warp.fem as fem
-from warp.sparse import bsr_axpy, bsr_mm, bsr_transposed
-
-# Import example utilities
-# Make sure that works both when imported as module and run as standalone file
-try:
-    from .bsr_utils import bsr_cg, invert_diagonal_bsr_mass_matrix
-    from .example_convection_diffusion import inertia_form, initial_condition, velocity
-    from .mesh_utils import gen_quadmesh, gen_trimesh
-    from .plot_utils import Plot
-except ImportError:
-    from bsr_utils import bsr_cg, invert_diagonal_bsr_mass_matrix
-    from example_convection_diffusion import inertia_form, initial_condition, velocity
-    from mesh_utils import gen_quadmesh, gen_trimesh
-    from plot_utils import Plot
-
-
-@fem.integrand
-def vel_mass_form(
-    s: fem.Sample,
-    u: fem.Field,
-    v: fem.Field,
-):
-    return wp.dot(v(s), u(s))
-
-
-@fem.integrand
-def half_diffusion_form(
-    s: fem.Sample,
-    domain: fem.Domain,
-    psi: fem.Field,
-    u: fem.Field,
-):
-    return fem.jump(psi, s) * wp.dot(fem.average(u, s), fem.normal(domain, s))
-
-
-@fem.integrand
-def upwind_transport_form(s: fem.Sample, domain: fem.Domain, phi: fem.Field, psi: fem.Field, ang_vel: float):
-    pos = domain(s)
-
-    vel = velocity(pos, ang_vel)
-
-    vel_n = wp.dot(vel, fem.normal(domain, s))
-
-    return fem.jump(psi, s) * (fem.average(phi, s) * vel_n + 0.5 * fem.jump(phi, s) * wp.abs(vel_n))
-
-
-class Example:
-    def __init__(self, quiet=False, resolution=50, mesh="grid", viscosity=0.001, ang_vel=1.0):
-        self._quiet = quiet
-
-        res = resolution
-        self.sim_dt = 1.0 / (ang_vel * res)
-        self.current_frame = 0
-
-        if mesh == "tri":
-            positions, tri_vidx = gen_trimesh(res=wp.vec2i(resolution))
-            geo = fem.Trimesh2D(tri_vertex_indices=tri_vidx, positions=positions)
-        elif mesh == "quad":
-            positions, quad_vidx = gen_quadmesh(res=wp.vec2i(resolution))
-            geo = fem.Quadmesh2D(quad_vertex_indices=quad_vidx, positions=positions)
-        else:
-            geo = fem.Grid2D(res=wp.vec2i(resolution))
-
-        domain = fem.Cells(geometry=geo)
-        sides = fem.Sides(geo)
-        scalar_space = fem.make_polynomial_space(geo, degree=0)
-
-        # Inertia matrix
-        self._test = fem.make_test(space=scalar_space, domain=domain)
-        trial = fem.make_trial(space=scalar_space, domain=domain)
-        matrix_inertia = fem.integrate(
-            inertia_form,
-            fields={"phi": trial, "psi": self._test},
-            values={"dt": self.sim_dt},
-        )
-
-        # Upwind transport term
-        side_test = fem.make_test(space=scalar_space, domain=sides)
-        side_trial = fem.make_trial(space=scalar_space, domain=sides)
-        matrix_transport = fem.integrate(
-            upwind_transport_form,
-            fields={"phi": side_trial, "psi": side_test},
-            values={"ang_vel": ang_vel},
-        )
-
-        # Diffusion bilinear form
-        # Since we have piecewise constant element, we cannot use the classical diffusion form
-        # Instead we assemble the matrix B M^-1 B^T, with B associated to the form psi div(u)
-        # and the diagonal matrix M to the velocity mass form u.v
-
-        velocity_space = fem.make_polynomial_space(geo, degree=0, dtype=wp.vec2)
-        side_trial_vel = fem.make_trial(space=velocity_space, domain=sides)
-        matrix_half_diffusion = fem.integrate(
-            half_diffusion_form,
-            fields={"psi": side_test, "u": side_trial_vel},
-        )
-
-        # Diagonal velocity mass matrix
-        test_vel = fem.make_test(space=velocity_space, domain=domain)
-        trial_vel = fem.make_trial(space=velocity_space, domain=domain)
-        inv_vel_mass_matrix = fem.integrate(
-            vel_mass_form, domain=domain, fields={"u": trial_vel, "v": test_vel}, nodal=True
-        )
-        invert_diagonal_bsr_mass_matrix(inv_vel_mass_matrix)
-
-        # Assemble system matrix
-
-        self._matrix = matrix_inertia
-        # matrix += matrix_transport
-        bsr_axpy(x=matrix_transport, y=self._matrix)
-        # matrix += nu * B M^-1 B^T
-        bsr_mm(
-            x=bsr_mm(matrix_half_diffusion, inv_vel_mass_matrix),
-            y=bsr_transposed(matrix_half_diffusion),
-            z=self._matrix,
-            alpha=viscosity,
-            beta=1.0,
-        )
-
-        # Initial condition
-        self._phi_field = scalar_space.make_field()
-        fem.interpolate(initial_condition, dest=self._phi_field)
-
-        self.renderer = Plot()
-        self.renderer.add_surface("phi", self._phi_field)
-
-    def step(self):
-        self.current_frame += 1
-
-        rhs = fem.integrate(
-            inertia_form,
-            fields={"phi": self._phi_field, "psi": self._test},
-            values={"dt": self.sim_dt},
-        )
-
-        phi = wp.zeros_like(rhs)
-        bsr_cg(self._matrix, b=rhs, x=phi, method="bicgstab", quiet=self._quiet)
-
-        wp.utils.array_cast(in_array=phi, out_array=self._phi_field.dof_values)
-
-    def render(self):
-        self.renderer.begin_frame(time=self.current_frame * self.sim_dt)
-        self.renderer.add_surface("phi", self._phi_field)
-        self.renderer.end_frame()
-
-
-if __name__ == "__main__":
-    import argparse
-
-    wp.set_module_options({"enable_backward": False})
-
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
-    parser.add_argument("--resolution", type=int, default=50, help="Grid resolution.")
-    parser.add_argument("--num_frames", type=int, default=250, help="Total number of frames.")
-    parser.add_argument("--viscosity", type=float, default=0.001, help="Fluid viscosity parameter.")
-    parser.add_argument("--ang_vel", type=float, default=1.0, help="Angular velocity.")
-    parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type.")
-    parser.add_argument(
-        "--headless",
-        action="store_true",
-        help="Run in headless mode, suppressing the opening of any graphical windows.",
-    )
-    parser.add_argument("--quiet", action="store_true")
-
-    args = parser.parse_known_args()[0]
-
-    with wp.ScopedDevice(args.device):
-        example = Example(
-            quiet=args.quiet,
-            resolution=args.resolution,
-            mesh=args.mesh,
-            viscosity=args.viscosity,
-            ang_vel=args.ang_vel,
-        )
-
-        for k in range(args.num_frames):
-            print(f"Frame {k}:")
-            example.step()
-            example.render()
-
-        if not args.headless:
-            example.renderer.plot()
diff --git a/warp/examples/fem/example_diffusion_3d.py b/warp/examples/fem/example_diffusion_3d.py
index a4cb79212..3e1dbea3b 100644
--- a/warp/examples/fem/example_diffusion_3d.py
+++ b/warp/examples/fem/example_diffusion_3d.py
@@ -25,7 +25,7 @@
 try:
     from .bsr_utils import bsr_cg
     from .example_diffusion import diffusion_form, linear_form
-    from .mesh_utils import gen_tetmesh
+    from .mesh_utils import gen_hexmesh, gen_tetmesh
     from .plot_utils import Plot
 except ImportError:
     from bsr_utils import bsr_cg
@@ -78,6 +78,9 @@ def __init__(
                 bounds_hi=wp.vec3(1.0, 0.5, 2.0),
             )
             self._geo = fem.Hexmesh(hex_vtx_indices, pos)
+        elif mesh == "nano":
+            volume = wp.Volume.allocate(min=[0, 0, 0], max=[1.0, 0.5, 2.0], voxel_size=1.0 / res[0], bg_value=None)
+            self._geo = fem.Nanogrid(volume)
         else:
             self._geo = fem.Grid3D(
                 res=res,
@@ -147,7 +150,7 @@ def render(self):
     parser.add_argument(
         "--boundary_compliance", type=float, default=0.0, help="Dirichlet boundary condition compliance."
     )
-    parser.add_argument("--mesh", choices=("grid", "tet", "hex"), default="grid", help="Mesh type.")
+    parser.add_argument("--mesh", choices=("grid", "tet", "hex", "nano"), default="grid", help="Mesh type.")
     parser.add_argument(
         "--headless",
         action="store_true",
diff --git a/warp/fem/__init__.py b/warp/fem/__init__.py
index a178c712e..18b904a4d 100644
--- a/warp/fem/__init__.py
+++ b/warp/fem/__init__.py
@@ -10,6 +10,7 @@
     Grid3D,
     Hexmesh,
     LinearGeometryPartition,
+    Nanogrid,
     Quadmesh2D,
     Tetmesh,
     Trimesh2D,
diff --git a/warp/fem/cache.py b/warp/fem/cache.py
index 4705e4d14..142ee529b 100644
--- a/warp/fem/cache.py
+++ b/warp/fem/cache.py
@@ -228,7 +228,7 @@ def array(self) -> wp.array:
 
     def _view_as(self, shape, dtype) -> "Temporary":
         def _view_reshaped_truncated(array):
-            return wp.types.array(
+            view = wp.types.array(
                 ptr=array.ptr,
                 dtype=dtype,
                 shape=shape,
@@ -238,6 +238,8 @@ def _view_reshaped_truncated(array):
                 copy=False,
                 grad=None if array.grad is None else _view_reshaped_truncated(array.grad),
             )
+            view._ref = array
+            return view
 
         self._array_view = _view_reshaped_truncated(self._raw_array)
         return self
diff --git a/warp/fem/geometry/__init__.py b/warp/fem/geometry/__init__.py
index a3cf2b715..f71b3ec3a 100644
--- a/warp/fem/geometry/__init__.py
+++ b/warp/fem/geometry/__init__.py
@@ -4,6 +4,7 @@
 from .grid_2d import Grid2D
 from .grid_3d import Grid3D
 from .hexmesh import Hexmesh
+from .nanogrid import Nanogrid
 from .partition import (
     ExplicitGeometryPartition,
     GeometryPartition,
diff --git a/warp/fem/geometry/element.py b/warp/fem/geometry/element.py
index 140ef7582..2c3075248 100644
--- a/warp/fem/geometry/element.py
+++ b/warp/fem/geometry/element.py
@@ -14,6 +14,10 @@ def instantiate_quadrature(order: int, family: Polynomial) -> Tuple[List[Coords]
         """Returns a quadrature of a given order for a prototypical element"""
         raise NotImplementedError
 
+    def center(self) -> Tuple[float]:
+        coords, _ = self.instantiate_quadrature(order=0, family=None)  # weights are discarded
+        return coords[0]  # first point of the order-0 quadrature is used as the element center
+
 
 def _point_count_from_order(order: int, family: Polynomial):
     if family == Polynomial.GAUSS_LEGENDRE:
diff --git a/warp/fem/geometry/grid_3d.py b/warp/fem/geometry/grid_3d.py
index adfed0318..071be4c6d 100644
--- a/warp/fem/geometry/grid_3d.py
+++ b/warp/fem/geometry/grid_3d.py
@@ -263,10 +263,6 @@ def cell_measure(args: CellArg, s: Sample):
     def cell_normal(args: CellArg, s: Sample):
         return wp.vec3(0.0)
 
-    @wp.func
-    def cell_transform_reference_gradient(args: CellArg, cell_index: ElementIndex, coords: Coords, ref_grad: wp.vec3):
-        return wp.cw_div(ref_grad, args.cell_size)
-
     @cached_arg_value
     def side_arg_value(self, device) -> SideArg:
         args = self.SideArg()
diff --git a/warp/fem/geometry/nanogrid.py b/warp/fem/geometry/nanogrid.py
new file mode 100644
index 000000000..dafa897a3
--- /dev/null
+++ b/warp/fem/geometry/nanogrid.py
@@ -0,0 +1,455 @@
+from typing import Optional
+
+import numpy as np
+
+import warp as wp
+from warp.fem import cache, utils
+from warp.fem.types import NULL_ELEMENT_INDEX, OUTSIDE, Coords, ElementIndex, Sample, make_free_sample
+
+from .element import Cube, Square
+from .geometry import Geometry
+
+# Flag used for building edge/face grids to disambiguate axis within the grid
+GRID_AXIS_FLAG = wp.constant(wp.int32(1 << 20))
+FACE_AXIS_MASK = wp.constant(wp.uint8((1 << 3) - 1))
+FACE_INNER_OFFSET_BIT = wp.constant(wp.uint8(3))
+FACE_OUTER_OFFSET_BIT = wp.constant(wp.uint8(4))
+
+_mat32 = wp.mat(shape=(3, 2), dtype=float)
+
+
+@wp.func
+def _add_axis_flag(ijk: wp.vec3i, axis: int):
+    coord = ijk[axis]  # wp.select(c, a, b) yields a when c is false: bit set if coord >= 0, cleared if coord < 0
+    ijk[axis] = wp.select(coord < 0, coord | GRID_AXIS_FLAG, coord & (~GRID_AXIS_FLAG))
+    return ijk  # same coordinates, with component `axis` carrying the flag
+
+
+@wp.func
+def _extract_axis_flag(ijk: wp.vec3i):
+    for ax in range(3):  # returns (flagged axis, coordinates with the flag removed)
+        coord = ijk[ax]
+        if coord < 0:  # negative coordinates are flagged by a *cleared* bit (see _add_axis_flag)
+            if (ijk[ax] & GRID_AXIS_FLAG) == 0:
+                ijk[ax] = ijk[ax] | GRID_AXIS_FLAG  # set the bit again to undo the flag
+                return ax, ijk
+        else:  # non-negative coordinates are flagged by a *set* bit
+            if (ijk[ax] & GRID_AXIS_FLAG) != 0:
+                ijk[ax] = ijk[ax] & (~GRID_AXIS_FLAG)  # clear the bit to undo the flag
+                return ax, ijk
+
+    return -1, ijk  # no component carries the flag
+
+
+@wp.struct
+class NanogridCellArg:
+    # Arguments consumed by the Nanogrid cell-level device functions
+    cell_grid: wp.uint64  # id of the cell volume (filled from Volume.id)
+    cell_ijk: wp.array(dtype=wp.vec3i)  # voxel coordinates of each cell
+    inverse_transform: wp.mat33  # inverse of the grid's 3x3 transform matrix
+    cell_volume: float  # |det| of the grid's 3x3 transform matrix
+
+
+@wp.struct
+class NanogridSideArg:
+    # Arguments consumed by the Nanogrid side-level device functions
+    cell_arg: NanogridCellArg  # cell arguments, needed for cell lookups from sides
+    face_ijk: wp.array(dtype=wp.vec3i)  # voxel coordinates of each face, axis flag removed
+    face_flags: wp.array(dtype=wp.uint8)  # per face: axis (low 3 bits) + inner/outer offset bits
+    face_areas: wp.vec3  # area of the faces orthogonal to each axis
+
+
+class Nanogrid(Geometry):  # geometry whose cells are the voxels of a wp.Volume
+    dimension = 3
+
+    def __init__(self, grid: wp.Volume, temporary_store: Optional[cache.TemporaryStore] = None):
+        self._cell_grid = grid  # one cell per voxel of this volume
+        self._cell_grid_info = grid.get_grid_info()
+
+        device = grid.device
+
+        cell_count = grid.get_voxel_count()
+        self._cell_ijk = wp.array(shape=(cell_count,), dtype=wp.vec3i, device=device)
+        grid.get_voxels(out=self._cell_ijk)
+
+        self._node_grid = _build_node_grid(self._cell_ijk, grid, temporary_store)  # one voxel per cell corner
+        node_count = self._node_grid.get_voxel_count()
+        self._node_ijk = wp.array(shape=(node_count,), dtype=wp.vec3i, device=device)
+        self._node_grid.get_voxels(out=self._node_ijk)
+
+        self._face_grid = _build_face_grid(self._cell_ijk, grid, temporary_store)  # one voxel per cell face
+        face_count = self._face_grid.get_voxel_count()
+        self._face_ijk = wp.array(shape=(face_count,), dtype=wp.vec3i, device=device)
+        self._face_grid.get_voxels(out=self._face_ijk)  # still axis-flagged at this point
+
+        self._face_flags = wp.array(shape=(face_count,), dtype=wp.uint8, device=device)
+        boundary_face_mask = cache.borrow_temporary(temporary_store, shape=(face_count,), dtype=wp.int32, device=device)
+
+        wp.launch(  # removes the axis flag from _face_ijk in place, fills _face_flags and the boundary mask
+            _build_face_flags,
+            dim=face_count,
+            device=device,
+            inputs=[grid.id, self._face_ijk, self._face_flags, boundary_face_mask.array],
+        )
+        boundary_face_indices, _ = utils.masked_indices(boundary_face_mask.array)
+        self._boundary_face_indices = boundary_face_indices.detach()
+
+        self._edge_grid = None  # built on demand, see edge_count()
+        self._edge_ijk = None
+
+    def _build_edge_grid(self, temporary_store: Optional[cache.TemporaryStore] = None):
+        self._edge_grid = _build_edge_grid(self._cell_ijk, self._cell_grid, temporary_store)  # module-level helper
+        edge_count = self._edge_grid.get_voxel_count()
+        self._edge_ijk = wp.array(shape=(edge_count,), dtype=wp.vec3i, device=self._edge_grid.device)
+        self._edge_grid.get_voxels(out=self._edge_ijk)
+
+    def cell_count(self):
+        return self._cell_ijk.shape[0]
+
+    def vertex_count(self):
+        return self._node_ijk.shape[0]
+
+    def side_count(self):
+        return self._face_ijk.shape[0]
+
+    def edge_count(self):
+        if self._edge_ijk is None:  # edge grid is only built the first time it is needed
+            self._build_edge_grid()
+
+        return self._edge_ijk.shape[0]
+
+    def boundary_side_count(self):
+        return self._boundary_face_indices.shape[0]
+
+    def reference_cell(self) -> Cube:
+        return Cube()
+
+    def reference_side(self) -> Square:
+        return Square()
+
+    CellArg = NanogridCellArg
+
+    @cache.cached_arg_value
+    def cell_arg_value(self, device) -> CellArg:
+        args = self.CellArg()
+        args.cell_grid = self._cell_grid.id
+        args.cell_ijk = self._cell_ijk  # NOTE(review): no .to(device) here, unlike side_arg_value — confirm intended
+
+        transform = np.array(self._cell_grid_info.transform_matrix).reshape(3, 3)
+        args.inverse_transform = wp.mat33f(np.linalg.inv(transform))
+        args.cell_volume = abs(np.linalg.det(transform))  # returned as-is by cell_measure
+
+        return args
+
+    @wp.func
+    def cell_position(args: CellArg, s: Sample):
+        uvw = wp.vec3(args.cell_ijk[s.element_index]) + s.element_coords  # voxel coords + coords within the cell
+        return wp.volume_index_to_world(args.cell_grid, uvw)
+
+    @wp.func
+    def cell_deformation_gradient(args: CellArg, s: Sample):
+        return wp.inverse(args.inverse_transform)  # same for every cell and sample
+
+    @wp.func
+    def cell_inverse_deformation_gradient(args: CellArg, s: Sample):
+        return args.inverse_transform
+
+    @wp.func
+    def cell_lookup(args: CellArg, pos: wp.vec3):
+        uvw = wp.volume_world_to_index(args.cell_grid, pos)
+        ijk = wp.vec3i(int(wp.floor(uvw[0])), int(wp.floor(uvw[1])), int(wp.floor(uvw[2])))
+        element_index = wp.volume_lookup_index(args.cell_grid, ijk[0], ijk[1], ijk[2])
+
+        return wp.select(  # wp.select returns its 2nd argument when the condition is false
+            element_index == -1,  # -1 is treated as "no cell at ijk"
+            make_free_sample(element_index, uvw - wp.vec3(ijk)),
+            make_free_sample(NULL_ELEMENT_INDEX, Coords(OUTSIDE)),
+        )
+
+    @wp.func
+    def cell_lookup(args: CellArg, pos: wp.vec3, guess: Sample):
+        return Nanogrid.cell_lookup(args, pos)  # `guess` is not used by the lookup
+
+    @wp.func
+    def cell_measure(args: CellArg, s: Sample):
+        return args.cell_volume
+
+    @wp.func
+    def cell_normal(args: CellArg, s: Sample):
+        return wp.vec3(0.0)
+
+    SideArg = NanogridSideArg
+
+    @cache.cached_arg_value
+    def side_arg_value(self, device) -> SideArg:
+        args = self.SideArg()
+        args.cell_arg = self.cell_arg_value(device)
+        args.face_ijk = self._face_ijk.to(device)
+        args.face_flags = self._face_flags.to(device)
+        transform = np.array(self._cell_grid_info.transform_matrix).reshape(3, 3)
+        args.face_areas = wp.vec3(  # entry k: norm of the cross product of columns (k+1)%3 and (k+2)%3
+            tuple(np.linalg.norm(np.cross(transform[:, k - 2], transform[:, k - 1])) for k in range(3))
+        )
+
+        return args
+
+    @wp.struct
+    class SideIndexArg:
+        boundary_face_indices: wp.array(dtype=int)  # face indices of the boundary sides
+
+    @cache.cached_arg_value
+    def side_index_arg_value(self, device) -> SideIndexArg:
+        args = self.SideIndexArg()
+        args.boundary_face_indices = self._boundary_face_indices.to(device)
+        return args
+
+    @wp.func
+    def boundary_side_index(args: SideIndexArg, boundary_side_index: int):
+        return args.boundary_face_indices[boundary_side_index]
+
+    @wp.func
+    def _side_to_cell_coords(axis: int, inner: float, side_coords: Coords):
+        uvw = wp.vec3()
+        uvw[axis] = inner  # coordinate along the face axis is given by the caller
+        uvw[(axis + 1) % 3] = side_coords[0]  # side coords map to the next two axes, cyclically
+        uvw[(axis + 2) % 3] = side_coords[1]
+        return uvw
+
+    @wp.func
+    def _get_face_axis(flags: wp.uint8):
+        return wp.int32(flags & FACE_AXIS_MASK)  # axis is stored in the low 3 bits
+
+    @wp.func
+    def _get_face_inner_offset(flags: wp.uint8):
+        return wp.int32(flags >> FACE_INNER_OFFSET_BIT) & 1  # 1 when the cell at ijk - 1 along the axis is missing
+
+    @wp.func
+    def _get_face_outer_offset(flags: wp.uint8):
+        return wp.int32(flags >> FACE_OUTER_OFFSET_BIT) & 1  # 1 when the cell at ijk along the axis is missing
+
+    @wp.func
+    def side_position(args: SideArg, s: Sample):
+        ijk = args.face_ijk[s.element_index]
+        axis = Nanogrid._get_face_axis(args.face_flags[s.element_index])
+
+        uvw = wp.vec3(ijk) + Nanogrid._side_to_cell_coords(axis, 0.0, s.element_coords)  # face lies at offset 0
+
+        cell_grid = args.cell_arg.cell_grid
+        return wp.volume_index_to_world(cell_grid, uvw)
+
+    @wp.func
+    def _face_tangent_vecs(args: SideArg, axis: int, flip: int):
+        u_axis = utils.unit_element(wp.vec3(), (axis + 1 + flip) % 3)  # flip == 1 swaps the two tangent axes
+        v_axis = utils.unit_element(wp.vec3(), (axis + 2 - flip) % 3)
+
+        cell_grid = args.cell_arg.cell_grid
+
+        return wp.volume_index_to_world_dir(cell_grid, u_axis), wp.volume_index_to_world_dir(cell_grid, v_axis)
+
+    @wp.func
+    def side_deformation_gradient(args: SideArg, s: Sample):
+        flags = args.face_flags[s.element_index]
+        axis = Nanogrid._get_face_axis(flags)
+        flip = Nanogrid._get_face_inner_offset(flags)
+        v1, v2 = Nanogrid._face_tangent_vecs(args, axis, flip)
+        return _mat32(v1, v2)
+
+    @wp.func
+    def side_inner_inverse_deformation_gradient(args: SideArg, s: Sample):
+        return Nanogrid.cell_inverse_deformation_gradient(args.cell_arg, s)  # uniform, so `s` does not matter
+
+    @wp.func
+    def side_outer_inverse_deformation_gradient(args: SideArg, s: Sample):
+        return Nanogrid.cell_inverse_deformation_gradient(args.cell_arg, s)  # uniform, so `s` does not matter
+
+    @wp.func
+    def side_measure(args: SideArg, s: Sample):
+        axis = Nanogrid._get_face_axis(args.face_flags[s.element_index])
+        return args.face_areas[axis]
+
+    @wp.func
+    def side_measure_ratio(args: SideArg, s: Sample):
+        axis = Nanogrid._get_face_axis(args.face_flags[s.element_index])
+        return args.face_areas[axis] / args.cell_arg.cell_volume  # face area over cell volume
+
+    @wp.func
+    def side_normal(args: SideArg, s: Sample):
+        flags = args.face_flags[s.element_index]
+        axis = Nanogrid._get_face_axis(flags)
+        flip = Nanogrid._get_face_inner_offset(flags)  # swapped tangents reverse the cross product below
+
+        v1, v2 = Nanogrid._face_tangent_vecs(args, axis, flip)
+        return wp.cross(v1, v2) / args.face_areas[axis]  # presumably points away from the inner cell — TODO confirm
+
+    @wp.func
+    def side_inner_cell_index(args: SideArg, side_index: ElementIndex):
+        ijk = args.face_ijk[side_index]
+        flags = args.face_flags[side_index]
+        axis = Nanogrid._get_face_axis(flags)
+        offset = Nanogrid._get_face_inner_offset(flags)
+
+        ijk[axis] += offset - 1  # cell at ijk - 1 along the axis, or at ijk when that one is missing
+        cell_grid = args.cell_arg.cell_grid
+
+        return wp.volume_lookup_index(cell_grid, ijk[0], ijk[1], ijk[2])
+
+    @wp.func
+    def side_outer_cell_index(args: SideArg, side_index: ElementIndex):
+        ijk = args.face_ijk[side_index]
+        flags = args.face_flags[side_index]
+        axis = Nanogrid._get_face_axis(flags)
+        offset = Nanogrid._get_face_outer_offset(flags)
+
+        ijk[axis] -= offset  # cell at ijk along the axis, or at ijk - 1 when that one is missing
+        cell_grid = args.cell_arg.cell_grid
+
+        return wp.volume_lookup_index(cell_grid, ijk[0], ijk[1], ijk[2])
+
+    @wp.func
+    def side_inner_cell_coords(args: SideArg, side_index: ElementIndex, side_coords: Coords):
+        flags = args.face_flags[side_index]
+        axis = Nanogrid._get_face_axis(flags)
+        offset = float(Nanogrid._get_face_inner_offset(flags))
+        return Nanogrid._side_to_cell_coords(axis, 1.0 - offset, side_coords)  # face at 1 in the ijk - 1 cell, else 0
+
+    @wp.func
+    def side_outer_cell_coords(args: SideArg, side_index: ElementIndex, side_coords: Coords):
+        flags = args.face_flags[side_index]
+        axis = Nanogrid._get_face_axis(flags)
+        offset = float(Nanogrid._get_face_outer_offset(flags))
+        return Nanogrid._side_to_cell_coords(axis, offset, side_coords)  # face at 0 in the ijk cell, else 1
+
+    @wp.func
+    def side_from_cell_coords(
+        args: SideArg,
+        side_index: ElementIndex,
+        element_index: ElementIndex,
+        element_coords: Coords,
+    ):
+        flags = args.face_flags[side_index]
+        axis = Nanogrid._get_face_axis(flags)
+
+        cell_ijk = args.cell_arg.cell_ijk[element_index]
+        side_ijk = args.face_ijk[side_index]
+
+        on_side = float(side_ijk[axis] - cell_ijk[axis]) == element_coords[axis]  # exact float comparison
+
+        return wp.select(  # OUTSIDE when the point is not on this side (wp.select returns 2nd arg on false)
+            on_side, Coords(OUTSIDE), Coords(element_coords[(axis + 1) % 3], element_coords[(axis + 2) % 3], 0.0)
+        )
+
+    @wp.func
+    def side_to_cell_arg(side_arg: SideArg):
+        return side_arg.cell_arg
+
+
+@wp.kernel
+def _cell_node_indices(
+    cell_ijk: wp.array(dtype=wp.vec3i),
+    node_ijk: wp.array2d(dtype=wp.vec3i),  # output, indexed by (cell, local node)
+):
+    cell, n = wp.tid()  # bits 2, 1, 0 of n give the node offset along i, j, k
+    node_ijk[cell, n] = cell_ijk[cell] + wp.vec3i((n & 4) >> 2, (n & 2) >> 1, n & 1)
+
+
+@wp.kernel
+def _cell_face_indices(
+    cell_ijk: wp.array(dtype=wp.vec3i),
+    node_ijk: wp.array2d(dtype=wp.vec3i),  # output; despite the name it receives the 6 face coordinates per cell
+):
+    cell = wp.tid()
+    ijk = cell_ijk[cell]
+    node_ijk[cell, 0] = _add_axis_flag(ijk, 0)  # entries 0-2: faces at the cell's own ijk, one per axis
+    node_ijk[cell, 1] = _add_axis_flag(ijk, 1)
+    node_ijk[cell, 2] = _add_axis_flag(ijk, 2)
+
+    node_ijk[cell, 3] = _add_axis_flag(ijk + wp.vec3i(1, 0, 0), 0)  # entries 3-5: faces one step further along each axis
+    node_ijk[cell, 4] = _add_axis_flag(ijk + wp.vec3i(0, 1, 0), 1)
+    node_ijk[cell, 5] = _add_axis_flag(ijk + wp.vec3i(0, 0, 1), 2)
+
+
+@wp.kernel
+def _cell_edge_indices(
+    cell_ijk: wp.array(dtype=wp.vec3i),
+    edge_ijk: wp.array2d(dtype=wp.vec3i),  # output, 12 edge coordinates per cell
+):
+    cell = wp.tid()  # each axis is flagged 4 times below, always with a zero offset along that axis
+    ijk = cell_ijk[cell]
+    edge_ijk[cell, 0] = _add_axis_flag(ijk, 0)  # entries 0-2: no offset
+    edge_ijk[cell, 1] = _add_axis_flag(ijk, 1)
+    edge_ijk[cell, 2] = _add_axis_flag(ijk, 2)
+
+    edge_ijk[cell, 3] = _add_axis_flag(ijk + wp.vec3i(0, 1, 0), 0)  # entries 3-5: offset along (axis + 1) % 3
+    edge_ijk[cell, 4] = _add_axis_flag(ijk + wp.vec3i(0, 0, 1), 1)
+    edge_ijk[cell, 5] = _add_axis_flag(ijk + wp.vec3i(1, 0, 0), 2)
+
+    edge_ijk[cell, 6] = _add_axis_flag(ijk + wp.vec3i(0, 1, 1), 0)  # entries 6-8: offset along both other axes
+    edge_ijk[cell, 7] = _add_axis_flag(ijk + wp.vec3i(1, 0, 1), 1)
+    edge_ijk[cell, 8] = _add_axis_flag(ijk + wp.vec3i(1, 1, 0), 2)
+
+    edge_ijk[cell, 9] = _add_axis_flag(ijk + wp.vec3i(0, 0, 1), 0)  # entries 9-11: offset along (axis + 2) % 3
+    edge_ijk[cell, 10] = _add_axis_flag(ijk + wp.vec3i(1, 0, 0), 1)
+    edge_ijk[cell, 11] = _add_axis_flag(ijk + wp.vec3i(0, 1, 0), 2)
+
+
+def _build_node_grid(cell_ijk, grid: wp.Volume, temporary_store: cache.TemporaryStore):
+    cell_count = cell_ijk.shape[0]
+
+    cell_nodes = cache.borrow_temporary(temporary_store, shape=(cell_count, 8), dtype=wp.vec3i, device=cell_ijk.device)
+    wp.launch(  # one thread per (cell, local node) pair
+        _cell_node_indices, dim=cell_nodes.array.shape, inputs=[cell_ijk, cell_nodes.array], device=cell_ijk.device
+    )
+    node_grid = wp.Volume.allocate_by_voxels(  # NOTE(review): only voxel-size component [0] is forwarded — confirm
+        cell_nodes.array.flatten(), voxel_size=grid.get_voxel_size()[0], device=cell_ijk.device
+    )
+
+    return node_grid  # presumably holds each shared node once; dedup by allocate_by_voxels to be confirmed
+
+
+def _build_face_grid(cell_ijk, grid: wp.Volume, temporary_store: cache.TemporaryStore):
+    cell_count = cell_ijk.shape[0]
+
+    cell_faces = cache.borrow_temporary(temporary_store, shape=(cell_count, 6), dtype=wp.vec3i, device=cell_ijk.device)
+    wp.launch(_cell_face_indices, dim=cell_count, inputs=[cell_ijk, cell_faces.array], device=cell_ijk.device)
+    face_grid = wp.Volume.allocate_by_voxels(  # NOTE(review): only voxel-size component [0] is forwarded — confirm
+        cell_faces.array.flatten(), voxel_size=grid.get_voxel_size()[0], device=cell_ijk.device
+    )
+
+    return face_grid  # voxel coordinates still carry the axis flag written by _cell_face_indices
+
+
+def _build_edge_grid(cell_ijk, grid: wp.Volume, temporary_store: cache.TemporaryStore):
+    cell_count = cell_ijk.shape[0]
+
+    cell_edges = cache.borrow_temporary(temporary_store, shape=(cell_count, 12), dtype=wp.vec3i, device=cell_ijk.device)
+    wp.launch(_cell_edge_indices, dim=cell_count, inputs=[cell_ijk, cell_edges.array], device=cell_ijk.device)
+    edge_grid = wp.Volume.allocate_by_voxels(  # NOTE(review): only voxel-size component [0] is forwarded — confirm
+        cell_edges.array.flatten(), voxel_size=grid.get_voxel_size()[0], device=cell_ijk.device
+    )
+
+    return edge_grid  # voxel coordinates still carry the axis flag written by _cell_edge_indices
+
+
+@wp.kernel
+def _build_face_flags(
+    cell_grid: wp.uint64,
+    face_ijk: wp.array(dtype=wp.vec3i),  # in: axis-flagged coordinates; out: coordinates with the flag removed
+    face_flags: wp.array(dtype=wp.uint8),  # out: axis | offset bits
+    boundary_face_mask: wp.array(dtype=int),  # out: 1 for boundary faces, 0 otherwise
+):
+    face = wp.tid()
+
+    axis, ijk = _extract_axis_flag(face_ijk[face])
+
+    ijk_minus = ijk
+    ijk_minus[axis] -= 1  # neighbor one step back along the face axis
+
+    plus_cell_index = wp.volume_lookup_index(cell_grid, ijk[0], ijk[1], ijk[2])
+    minus_cell_index = wp.volume_lookup_index(cell_grid, ijk_minus[0], ijk_minus[1], ijk_minus[2])
+
+    plus_boundary = wp.uint8(wp.select(plus_cell_index == -1, 0, 1)) << FACE_OUTER_OFFSET_BIT  # set if no cell at ijk
+    minus_boundary = wp.uint8(wp.select(minus_cell_index == -1, 0, 1)) << FACE_INNER_OFFSET_BIT  # set if none at ijk - 1
+
+    face_ijk[face] = ijk
+    face_flags[face] = wp.uint8(axis) | plus_boundary | minus_boundary
+    boundary_face_mask[face] = wp.select((plus_boundary | minus_boundary) == 0, 1, 0)  # 1 if either neighbor is missing
diff --git a/warp/fem/integrate.py b/warp/fem/integrate.py
index 49cd85296..0b47f5020 100644
--- a/warp/fem/integrate.py
+++ b/warp/fem/integrate.py
@@ -9,6 +9,7 @@
     DiscreteField,
     FieldLike,
     FieldRestriction,
+    SpaceField,
     TestField,
     TrialField,
     make_restriction,
@@ -195,7 +196,7 @@ def _get_integrand_field_arguments(
         arg_type = argspec.annotations[arg]
         if arg_type == Field:
             if arg not in fields:
-                raise ValueError(f"Missing field for argument '{arg}'")
+                raise ValueError(f"Missing field for argument '{arg}' of integrand '{integrand.name}'")
             field_args[arg] = fields[arg]
         elif arg_type == Domain:
             domain_name = arg
@@ -208,6 +209,52 @@ def _get_integrand_field_arguments(
     return field_args, value_args, domain_name, sample_name
 
 
+def _check_field_compat(
+    integrand: Integrand,
+    fields: Dict[str, FieldLike],
+    field_args: Dict[str, FieldLike],
+    domain: GeometryDomain = None,
+):
+    # Check field compatibility: raises ValueError on unknown names or geometry/dimension mismatch with the domain
+    for name, field in fields.items():
+        if name not in field_args:
+            raise ValueError(
+                f"Passed field argument '{name}' does not match any parameter of integrand '{integrand.name}'"
+            )
+
+        if isinstance(field, SpaceField) and domain is not None:  # geometry checks are skipped without a domain
+            space = field.space
+            if space.geometry != domain.geometry:
+                raise ValueError(f"Field '{name}' must be defined on the same geometry as the integration domain")
+            if space.dimension != domain.dimension:
+                raise ValueError(
+                    f"Field '{name}' dimension ({space.dimension}) does not match that of the integration domain ({domain.dimension}). Maybe a forgotten `.trace()`?"
+                )
+
+
+def _populate_value_struct(ValueStruct: wp.codegen.Struct, values: Dict[str, Any], integrand_name: str):
+    value_struct_values = ValueStruct()  # instance that is filled from `values` and returned
+    for k, v in values.items():
+        try:
+            setattr(value_struct_values, k, v)
+        except Exception as err:  # re-raised as ValueError with a message naming the integrand
+            if k not in ValueStruct.vars:  # unknown parameter name
+                raise ValueError(
+                    f"Passed value argument '{k}' does not match any of the integrand '{integrand_name}' parameters"
+                ) from err
+            raise ValueError(  # known parameter, so the assignment failure is reported as a type mismatch
+                f"Passed value argument '{k}' of type '{wp.types.type_repr(v)}' is incompatible with the integrand '{integrand_name}' parameter of type '{wp.types.type_repr(ValueStruct.vars[k].type)}'"
+            ) from err
+
+    missing_values = ValueStruct.vars.keys() - values.keys()  # declared parameters that were not supplied
+    if missing_values:  # only warns; the struct is still returned
+        wp.utils.warn(
+            f"Missing values for parameter(s) '{', '.join(missing_values)}' of the integrand '{integrand_name}', will be zero-initialized"
+        )
+
+    return value_struct_values
+
+
 def _get_test_and_trial_fields(
     fields: Dict[str, FieldLike],
 ):
@@ -217,14 +264,17 @@ def _get_test_and_trial_fields(
     trial_name = None
 
     for name, field in fields.items():
+        if not isinstance(field, FieldLike):
+            raise ValueError(f"Passed field argument '{name}' is not a proper Field")
+
         if isinstance(field, TestField):
             if test is not None:
-                raise ValueError("Duplicate test field argument")
+                raise ValueError(f"More than one test field argument: '{test_name}' and '{name}'")
             test = field
             test_name = name
         elif isinstance(field, TrialField):
             if trial is not None:
-                raise ValueError("Duplicate test field argument")
+                raise ValueError(f"More than one trial field argument: '{trial_name}' and '{name}'")
             trial = field
             trial_name = name
 
@@ -759,6 +809,8 @@ def _generate_integrate_kernel(
 
     # Not found in cache, transform integrand and generate  kernel
 
+    _check_field_compat(integrand, fields, field_args, domain)
+
     integrand_func = _translate_integrand(
         integrand,
         field_args,
@@ -843,6 +895,7 @@ def _generate_integrate_kernel(
 
 
 def _launch_integrate_kernel(
+    integrand: Integrand,
     kernel: wp.Kernel,
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
@@ -870,9 +923,7 @@ def _launch_integrate_kernel(
     for k, v in fields.items():
         setattr(field_arg_values, k, v.eval_arg_value(device=device))
 
-    value_struct_values = ValueStruct()
-    for k, v in values.items():
-        setattr(value_struct_values, k, v)
+    value_struct_values = _populate_value_struct(ValueStruct, values, integrand_name=integrand.name)
 
     # Constant form
     if test is None and trial is None:
@@ -1211,6 +1262,7 @@ def integrate(
     )
 
     return _launch_integrate_kernel(
+        integrand=integrand,
         kernel=kernel,
         FieldStruct=FieldStruct,
         ValueStruct=ValueStruct,
@@ -1428,6 +1480,8 @@ def _generate_interpolate_kernel(
     if kernel is not None:
         return kernel, FieldStruct, ValueStruct
 
+    _check_field_compat(integrand, fields, field_args, domain)
+
     # Generate interpolation kernel
     if isinstance(dest, FieldRestriction):
         # need to split into kernel + function for diffferentiability
@@ -1499,6 +1553,7 @@ def _generate_interpolate_kernel(
 
 
 def _launch_interpolate_kernel(
+    integrand: Integrand,
     kernel: wp.kernel,
     FieldStruct: wp.codegen.Struct,
     ValueStruct: wp.codegen.Struct,
@@ -1517,9 +1572,7 @@ def _launch_interpolate_kernel(
     for k, v in fields.items():
         setattr(field_arg_values, k, v.eval_arg_value(device=device))
 
-    value_struct_values = ValueStruct()
-    for k, v in values.items():
-        setattr(value_struct_values, k, v)
+    value_struct_values = _populate_value_struct(ValueStruct, values, integrand_name=integrand.name)
 
     if isinstance(dest, FieldRestriction):
         dest_node_arg = dest.space_restriction.node_arg(device=device)
@@ -1618,6 +1671,7 @@ def interpolate(
     )
 
     return _launch_interpolate_kernel(
+        integrand=integrand,
         kernel=kernel,
         FieldStruct=FieldStruct,
         ValueStruct=ValueStruct,
diff --git a/warp/fem/space/__init__.py b/warp/fem/space/__init__.py
index 2dbf69859..b91fd6e4c 100644
--- a/warp/fem/space/__init__.py
+++ b/warp/fem/space/__init__.py
@@ -9,53 +9,24 @@
 
 from .function_space import FunctionSpace
 from .topology import SpaceTopology
-from .basis_space import BasisSpace, PointBasisSpace
+from .basis_space import BasisSpace, PointBasisSpace, ShapeBasisSpace, make_discontinuous_basis_space
 from .collocated_function_space import CollocatedFunctionSpace
+from .shape import ElementBasis, get_shape_function
+
+from .grid_2d_function_space import make_grid_2d_space_topology
+
+from .grid_3d_function_space import make_grid_3d_space_topology
+
+from .trimesh_2d_function_space import make_trimesh_2d_space_topology
+
+from .tetmesh_function_space import make_tetmesh_space_topology
+
+from .quadmesh_2d_function_space import make_quadmesh_2d_space_topology
+
+from .hexmesh_function_space import make_hexmesh_space_topology
+
+from .nanogrid_function_space import make_nanogrid_space_topology
 
-from .grid_2d_function_space import (
-    GridPiecewiseConstantBasis,
-    GridBipolynomialBasisSpace,
-    GridDGBipolynomialBasisSpace,
-    GridSerendipityBasisSpace,
-    GridDGSerendipityBasisSpace,
-    GridDGPolynomialBasisSpace,
-)
-from .grid_3d_function_space import (
-    GridTripolynomialBasisSpace,
-    GridDGTripolynomialBasisSpace,
-    Grid3DPiecewiseConstantBasis,
-    Grid3DSerendipityBasisSpace,
-    Grid3DDGSerendipityBasisSpace,
-    Grid3DDGPolynomialBasisSpace,
-)
-from .trimesh_2d_function_space import (
-    Trimesh2DPiecewiseConstantBasis,
-    Trimesh2DPolynomialBasisSpace,
-    Trimesh2DDGPolynomialBasisSpace,
-    Trimesh2DNonConformingPolynomialBasisSpace,
-)
-from .tetmesh_function_space import (
-    TetmeshPiecewiseConstantBasis,
-    TetmeshPolynomialBasisSpace,
-    TetmeshDGPolynomialBasisSpace,
-    TetmeshNonConformingPolynomialBasisSpace,
-)
-from .quadmesh_2d_function_space import (
-    Quadmesh2DPiecewiseConstantBasis,
-    Quadmesh2DBipolynomialBasisSpace,
-    Quadmesh2DDGBipolynomialBasisSpace,
-    Quadmesh2DSerendipityBasisSpace,
-    Quadmesh2DDGSerendipityBasisSpace,
-    Quadmesh2DPolynomialBasisSpace,
-)
-from .hexmesh_function_space import (
-    HexmeshPiecewiseConstantBasis,
-    HexmeshTripolynomialBasisSpace,
-    HexmeshDGTripolynomialBasisSpace,
-    HexmeshSerendipityBasisSpace,
-    HexmeshDGSerendipityBasisSpace,
-    HexmeshPolynomialBasisSpace,
-)
 
 from .partition import SpacePartition, make_space_partition
 from .restriction import SpaceRestriction
@@ -105,17 +76,6 @@ def make_space_restriction(
     )
 
 
-class ElementBasis(Enum):
-    """Choice of basis function to equip individual elements"""
-
-    LAGRANGE = 0
-    """Lagrange basis functions :math:`P_k` for simplices, tensor products :math:`Q_k` for squares and cubes"""
-    SERENDIPITY = 1
-    """Serendipity elements :math:`S_k`, corresponding to Lagrange nodes with interior points removed (for degree <= 3)"""
-    NONCONFORMING_POLYNOMIAL = 2
-    """Simplex Lagrange basis functions :math:`P_{kd}` embedded into non conforming reference elements (e.g. squares or cubes). Discontinuous only."""
-
-
 def make_polynomial_basis_space(
     geo: _geometry.Geometry,
     degree: int = 1,
@@ -141,110 +101,35 @@ def make_polynomial_basis_space(
 
     if element_basis is None:
         element_basis = ElementBasis.LAGRANGE
+    elif element_basis == ElementBasis.SERENDIPITY and degree == 1:
+        # Degree-1 serendipity is always equivalent to Lagrange
+        element_basis = ElementBasis.LAGRANGE
+
+    shape = get_shape_function(geo.reference_cell(), geo.dimension, degree, element_basis, family)
+
+    if discontinuous or degree == 0 or element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
+        return make_discontinuous_basis_space(geo, shape)
 
+    topology = None
     if isinstance(base_geo, _geometry.Grid2D):
-        if degree == 0:
-            return GridPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
-            if discontinuous:
-                return GridDGSerendipityBasisSpace(geo, degree=degree, family=family)
-            else:
-                return GridSerendipityBasisSpace(geo, degree=degree, family=family)
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return GridDGPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return GridDGBipolynomialBasisSpace(geo, degree=degree, family=family)
-        else:
-            return GridBipolynomialBasisSpace(geo, degree=degree, family=family)
-
-    if isinstance(base_geo, _geometry.Grid3D):
-        if degree == 0:
-            return Grid3DPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
-            if discontinuous:
-                return Grid3DDGSerendipityBasisSpace(geo, degree=degree, family=family)
-            else:
-                return Grid3DSerendipityBasisSpace(geo, degree=degree, family=family)
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return Grid3DDGPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return GridDGTripolynomialBasisSpace(geo, degree=degree, family=family)
-        else:
-            return GridTripolynomialBasisSpace(geo, degree=degree, family=family)
-
-    if isinstance(base_geo, _geometry.Trimesh2D):
-        if degree == 0:
-            return Trimesh2DPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 2:
-            raise NotImplementedError("Serendipity variant not implemented yet")
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return Trimesh2DNonConformingPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return Trimesh2DDGPolynomialBasisSpace(geo, degree=degree)
-        else:
-            return Trimesh2DPolynomialBasisSpace(geo, degree=degree)
-
-    if isinstance(base_geo, _geometry.Tetmesh):
-        if degree == 0:
-            return TetmeshPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 2:
-            raise NotImplementedError("Serendipity variant not implemented yet")
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return TetmeshNonConformingPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return TetmeshDGPolynomialBasisSpace(geo, degree=degree)
-        else:
-            return TetmeshPolynomialBasisSpace(geo, degree=degree)
-
-    if isinstance(base_geo, _geometry.Quadmesh2D):
-        if degree == 0:
-            return Quadmesh2DPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
-            if discontinuous:
-                return Quadmesh2DDGSerendipityBasisSpace(geo, degree=degree, family=family)
-            else:
-                return Quadmesh2DSerendipityBasisSpace(geo, degree=degree, family=family)
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return Quadmesh2DPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return Quadmesh2DDGBipolynomialBasisSpace(geo, degree=degree, family=family)
-        else:
-            return Quadmesh2DBipolynomialBasisSpace(geo, degree=degree, family=family)
-
-    if isinstance(base_geo, _geometry.Hexmesh):
-        if degree == 0:
-            return HexmeshPiecewiseConstantBasis(geo)
-
-        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
-            if discontinuous:
-                return HexmeshDGSerendipityBasisSpace(geo, degree=degree, family=family)
-            else:
-                return HexmeshSerendipityBasisSpace(geo, degree=degree, family=family)
-
-        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
-            return HexmeshPolynomialBasisSpace(geo, degree=degree)
-
-        if discontinuous:
-            return HexmeshDGTripolynomialBasisSpace(geo, degree=degree, family=family)
-        else:
-            return HexmeshTripolynomialBasisSpace(geo, degree=degree, family=family)
-
-    raise NotImplementedError()
+        topology = make_grid_2d_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Grid3D):
+        topology = make_grid_3d_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Trimesh2D):
+        topology = make_trimesh_2d_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Tetmesh):
+        topology = make_tetmesh_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Quadmesh2D):
+        topology = make_quadmesh_2d_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Hexmesh):
+        topology = make_hexmesh_space_topology(geo, shape)
+    elif isinstance(base_geo, _geometry.Nanogrid):
+        topology = make_nanogrid_space_topology(geo, shape)
+
+    if topology is None:
+        raise NotImplementedError(f"Unsupported geometry type {geo.name}")
+
+    return ShapeBasisSpace(topology, shape)
 
 
 def make_collocated_function_space(
diff --git a/warp/fem/space/basis_space.py b/warp/fem/space/basis_space.py
index b05d77f34..dd7cdfd47 100644
--- a/warp/fem/space/basis_space.py
+++ b/warp/fem/space/basis_space.py
@@ -394,6 +394,40 @@ def __eq__(self, other: "TraceBasisSpace") -> bool:
         return self._topo == other._topo
 
 
+class PiecewiseConstantBasisSpace(ShapeBasisSpace):
+    """Piecewise-constant basis space (single node per element) whose trace puts that node at the side center."""
+
+    class Trace(TraceBasisSpace):
+        def make_node_coords_in_element(self):
+            # Makes the single node visible to all sides; useful for interpolating on boundaries. For higher-order
+            # non-conforming elements this is not possible: one must integrate, then solve with the mass matrix
+            CENTER_COORDS = Coords(self.geometry.reference_side().center())
+
+            @cache.dynamic_func(suffix=self._basis.name)
+            def trace_node_coords_in_element(
+                geo_side_arg: self.geometry.SideArg,
+                basis_arg: self.BasisArg,
+                element_index: ElementIndex,
+                node_index_in_elt: int,
+            ):
+                return CENTER_COORDS
+
+            return trace_node_coords_in_element
+
+    def trace(self):
+        return PiecewiseConstantBasisSpace.Trace(self)
+
+
+def make_discontinuous_basis_space(geometry: Geometry, shape: ShapeFunction):
+    """Builds the basis space pairing `shape` with a `DiscontinuousSpaceTopology` over `geometry`."""
+    node_count = shape.NODES_PER_ELEMENT
+    topology = DiscontinuousSpaceTopology(geometry, node_count)
+
+    # A single node per element means a piecewise-constant space, which has its own basis class
+    basis_cls = PiecewiseConstantBasisSpace if node_count == 1 else ShapeBasisSpace
+    return basis_cls(topology=topology, shape=shape)
+
+
 class PointBasisSpace(BasisSpace):
     """An unstructured :class:`BasisSpace` that is non-zero at a finite set of points only.
 
diff --git a/warp/fem/space/collocated_function_space.py b/warp/fem/space/collocated_function_space.py
index 250a5d675..cecb0ba87 100644
--- a/warp/fem/space/collocated_function_space.py
+++ b/warp/fem/space/collocated_function_space.py
@@ -36,7 +36,7 @@ def __init__(self, basis: BasisSpace, dtype: type = float, dof_mapper: DofMapper
         self.element_outer_weight_gradient = self._basis.make_element_outer_weight_gradient()
 
         # For backward compatibility
-        if hasattr(basis, "node_grid"):
+        if hasattr(basis.topology, "node_grid"):
             self.node_grid = basis.node_grid
         if hasattr(basis, "node_triangulation"):
             self.node_triangulation = basis.node_triangulation
diff --git a/warp/fem/space/grid_2d_function_space.py b/warp/fem/space/grid_2d_function_space.py
index d11edd4b8..d053b7f2b 100644
--- a/warp/fem/space/grid_2d_function_space.py
+++ b/warp/fem/space/grid_2d_function_space.py
@@ -3,22 +3,22 @@
 import warp as wp
 from warp.fem import cache
 from warp.fem.geometry import Grid2D
-from warp.fem.polynomial import Polynomial, is_closed
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.polynomial import is_closed
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
 from .shape import (
-    ConstantShapeFunction,
     ShapeFunction,
     SquareBipolynomialShapeFunctions,
-    SquareNonConformingPolynomialShapeFunctions,
     SquareSerendipityShapeFunctions,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 
 class Grid2DSpaceTopology(SpaceTopology):
     def __init__(self, grid: Grid2D, shape: ShapeFunction):
+        if not is_closed(shape.family):
+            raise ValueError("A closed polynomial family is required to define a continuous function space")
+
         super().__init__(grid, shape.NODES_PER_ELEMENT)
         self._shape = shape
 
@@ -37,53 +37,6 @@ def _vertex_index(cell_arg: Grid2D.CellArg, cell_index: ElementIndex, vidx_in_ce
         return Grid2D._from_2d_index(x_stride, corner)
 
 
-class Grid2DDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    Grid2DSpaceTopology,
-):
-    pass
-
-
-class Grid2DBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: Grid2DSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._grid: Grid2D = topology.geometry
-
-
-class GridPiecewiseConstantBasis(Grid2DBasisSpace):
-    def __init__(self, grid: Grid2D):
-        shape = ConstantShapeFunction(grid.reference_cell(), space_dimension=2)
-        topology = Grid2DDiscontinuousSpaceTopology(grid, shape)
-        super().__init__(shape=shape, topology=topology)
-
-        if isinstance(grid, Grid2D):
-            self.node_grid = self._node_grid
-
-    def _node_grid(self):
-        res = self._grid.res
-
-        X = (np.arange(0, res[0], dtype=float) + 0.5) * self._grid.cell_size[0] + self._grid.origin[0]
-        Y = (np.arange(0, res[1], dtype=float) + 0.5) * self._grid.cell_size[1] + self._grid.origin[1]
-        return np.meshgrid(X, Y, indexing="ij")
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Grid2D.SideArg,
-            basis_arg: Grid2DBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(0.5, 0.0, 0.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return GridPiecewiseConstantBasis.Trace(self)
-
-
 class GridBipolynomialSpaceTopology(Grid2DSpaceTopology):
     def __init__(self, grid: Grid2D, shape: SquareBipolynomialShapeFunctions):
         super().__init__(grid, shape)
@@ -119,30 +72,8 @@ def element_node_index(
 
         return element_node_index
 
-
-class GridBipolynomialBasisSpace(Grid2DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        if not is_closed(family):
-            raise ValueError("A closed polynomial family is required to define a continuous function space")
-
-        shape = SquareBipolynomialShapeFunctions(degree, family=family)
-        topology = forward_base_topology(GridBipolynomialSpaceTopology, grid, shape)
-
-        super().__init__(topology, shape)
-
-        if isinstance(grid, Grid2D):
-            self.node_grid = self._node_grid
-
     def _node_grid(self):
-        res = self._grid.res
+        res = self.geometry.res
 
         cell_coords = np.array(self._shape.LOBATTO_COORDS)[:-1]
 
@@ -161,22 +92,6 @@ def _node_grid(self):
         return np.meshgrid(X, Y, indexing="ij")
 
 
-class GridDGBipolynomialBasisSpace(Grid2DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = SquareBipolynomialShapeFunctions(degree, family=family)
-        topology = Grid2DDiscontinuousSpaceTopology(grid, shape)
-
-        super().__init__(shape=shape, topology=topology)
-
-
 class GridSerendipitySpaceTopology(Grid2DSpaceTopology):
     def __init__(self, grid: Grid2D, shape: SquareSerendipityShapeFunctions):
         super().__init__(grid, shape)
@@ -223,45 +138,11 @@ def element_node_index(
         return element_node_index
 
 
-class GridSerendipityBasisSpace(Grid2DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = SquareSerendipityShapeFunctions(degree, family=family)
-        topology = forward_base_topology(GridSerendipitySpaceTopology, grid, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
-
-class GridDGSerendipityBasisSpace(Grid2DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = SquareSerendipityShapeFunctions(degree, family=family)
-        topology = Grid2DDiscontinuousSpaceTopology(grid, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
+def make_grid_2d_space_topology(grid: Grid2D, shape: ShapeFunction):  # dispatches on the type of `shape`
+    if isinstance(shape, SquareSerendipityShapeFunctions):
+        return forward_base_topology(GridSerendipitySpaceTopology, grid, shape)
 
-class GridDGPolynomialBasisSpace(Grid2DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid2D,
-        degree: int,
-    ):
-        shape = SquareNonConformingPolynomialShapeFunctions(degree)
-        topology = Grid2DDiscontinuousSpaceTopology(grid, shape=shape)
+    if isinstance(shape, SquareBipolynomialShapeFunctions):
+        return forward_base_topology(GridBipolynomialSpaceTopology, grid, shape)
 
-        super().__init__(topology=topology, shape=shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")
diff --git a/warp/fem/space/grid_3d_function_space.py b/warp/fem/space/grid_3d_function_space.py
index ce0ef0373..9f6f53f2b 100644
--- a/warp/fem/space/grid_3d_function_space.py
+++ b/warp/fem/space/grid_3d_function_space.py
@@ -3,23 +3,25 @@
 import warp as wp
 from warp.fem import cache
 from warp.fem.geometry import Grid3D
-from warp.fem.polynomial import Polynomial, is_closed
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.polynomial import is_closed
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
-from .shape import ConstantShapeFunction, ShapeFunction
-from .shape.cube_shape_function import (
-    CubeNonConformingPolynomialShapeFunctions,
+from .shape import (
     CubeSerendipityShapeFunctions,
     CubeTripolynomialShapeFunctions,
+    ShapeFunction,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 
 class Grid3DSpaceTopology(SpaceTopology):
     def __init__(self, grid: Grid3D, shape: ShapeFunction):
+        if not is_closed(shape.family):
+            raise ValueError("A closed polynomial family is required to define a continuous function space")
+
         super().__init__(grid, shape.NODES_PER_ELEMENT)
         self._shape = shape
+        self._grid = grid
 
     @wp.func
     def _vertex_coords(vidx_in_cell: int):
@@ -37,52 +39,6 @@ def _vertex_index(cell_arg: Grid3D.CellArg, cell_index: ElementIndex, vidx_in_ce
         return Grid3D._from_3d_index(strides, corner)
 
 
-class Grid3DDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    Grid3DSpaceTopology,
-):
-    pass
-
-
-class Grid3DBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: Grid3DSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._grid: Grid3D = topology.geometry
-
-
-class Grid3DPiecewiseConstantBasis(Grid3DBasisSpace):
-    def __init__(self, grid: Grid3D):
-        shape = ConstantShapeFunction(grid.reference_cell(), space_dimension=3)
-        topology = Grid3DDiscontinuousSpaceTopology(grid, shape)
-        super().__init__(shape=shape, topology=topology)
-
-        if isinstance(grid, Grid3D):
-            self.node_grid = self._node_grid
-
-    def _node_grid(self):
-        X = (np.arange(0, self.geometry.res[0], dtype=float) + 0.5) * self._grid.cell_size[0] + self._grid.bounds_lo[0]
-        Y = (np.arange(0, self.geometry.res[1], dtype=float) + 0.5) * self._grid.cell_size[1] + self._grid.bounds_lo[1]
-        Z = (np.arange(0, self.geometry.res[2], dtype=float) + 0.5) * self._grid.cell_size[2] + self._grid.bounds_lo[2]
-        return np.meshgrid(X, Y, Z, indexing="ij")
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Grid3D.SideArg,
-            basis_arg: Grid3DBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(0.5, 0.5, 0.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return Grid3DPiecewiseConstantBasis.Trace(self)
-
-
 class GridTripolynomialSpaceTopology(Grid3DSpaceTopology):
     def __init__(self, grid: Grid3D, shape: CubeTripolynomialShapeFunctions):
         super().__init__(grid, shape)
@@ -123,30 +79,8 @@ def element_node_index(
 
         return element_node_index
 
-
-class GridTripolynomialBasisSpace(Grid3DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid3D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        if not is_closed(family):
-            raise ValueError("A closed polynomial family is required to define a continuous function space")
-
-        shape = CubeTripolynomialShapeFunctions(degree, family=family)
-        topology = forward_base_topology(GridTripolynomialSpaceTopology, grid, shape)
-
-        super().__init__(topology, shape)
-
-        if isinstance(grid, Grid3D):
-            self.node_grid = self._node_grid
-
     def _node_grid(self):
-        res = self._grid.res
+        res = self.geometry.res
 
         cell_coords = np.array(self._shape.LOBATTO_COORDS)[:-1]
 
@@ -171,44 +105,6 @@ def _node_grid(self):
         return np.meshgrid(X, Y, Z, indexing="ij")
 
 
-class GridDGTripolynomialBasisSpace(Grid3DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid3D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeTripolynomialShapeFunctions(degree, family=family)
-        topology = Grid3DDiscontinuousSpaceTopology(grid, shape)
-
-        super().__init__(shape=shape, topology=topology)
-
-    def node_grid(self):
-        res = self._grid.res
-
-        cell_coords = np.array(self._shape.LOBATTO_COORDS)
-
-        grid_coords_x = np.repeat(np.arange(0, res[0], dtype=float), len(cell_coords)) + np.tile(
-            cell_coords, reps=res[0]
-        )
-        X = grid_coords_x * self._grid.cell_size[0] + self._grid.origin[0]
-
-        grid_coords_y = np.repeat(np.arange(0, res[1], dtype=float), len(cell_coords)) + np.tile(
-            cell_coords, reps=res[1]
-        )
-        Y = grid_coords_y * self._grid.cell_size[1] + self._grid.origin[1]
-
-        grid_coords_z = np.repeat(np.arange(0, res[2], dtype=float), len(cell_coords)) + np.tile(
-            cell_coords, reps=res[2]
-        )
-        Z = grid_coords_z * self._grid.cell_size[2] + self._grid.origin[2]
-
-        return np.meshgrid(X, Y, Z, indexing="ij")
-
-
 class Grid3DSerendipitySpaceTopology(Grid3DSpaceTopology):
     def __init__(self, grid: Grid3D, shape: CubeSerendipityShapeFunctions):
         super().__init__(grid, shape)
@@ -261,45 +157,11 @@ def element_node_index(
         return element_node_index
 
 
-class Grid3DSerendipityBasisSpace(Grid3DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid3D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeSerendipityShapeFunctions(degree, family=family)
-        topology = forward_base_topology(Grid3DSerendipitySpaceTopology, grid, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
-
-class Grid3DDGSerendipityBasisSpace(Grid3DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid3D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeSerendipityShapeFunctions(degree, family=family)
-        topology = Grid3DDiscontinuousSpaceTopology(grid, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
+def make_grid_3d_space_topology(grid: Grid3D, shape: ShapeFunction):  # dispatches on the type of `shape`
+    if isinstance(shape, CubeSerendipityShapeFunctions):
+        return forward_base_topology(Grid3DSerendipitySpaceTopology, grid, shape)
 
-class Grid3DDGPolynomialBasisSpace(Grid3DBasisSpace):
-    def __init__(
-        self,
-        grid: Grid3D,
-        degree: int,
-    ):
-        shape = CubeNonConformingPolynomialShapeFunctions(degree)
-        topology = Grid3DDiscontinuousSpaceTopology(grid, shape=shape)
+    if isinstance(shape, CubeTripolynomialShapeFunctions):
+        return forward_base_topology(GridTripolynomialSpaceTopology, grid, shape)
 
-        super().__init__(topology=topology, shape=shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")
diff --git a/warp/fem/space/hexmesh_function_space.py b/warp/fem/space/hexmesh_function_space.py
index 099134866..afd6b9da1 100644
--- a/warp/fem/space/hexmesh_function_space.py
+++ b/warp/fem/space/hexmesh_function_space.py
@@ -6,23 +6,24 @@
     FACE_ORIENTATION,
     FACE_TRANSLATION,
 )
-from warp.fem.polynomial import Polynomial, is_closed
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.polynomial import is_closed
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
 from .shape import (
-    ConstantShapeFunction,
-    CubeNonConformingPolynomialShapeFunctions,
     CubeSerendipityShapeFunctions,
     CubeTripolynomialShapeFunctions,
     ShapeFunction,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 _FACE_ORIENTATION_I = wp.constant(wp.mat(shape=(16, 2), dtype=int)(FACE_ORIENTATION))
 _FACE_TRANSLATION_I = wp.constant(wp.mat(shape=(4, 2), dtype=int)(FACE_TRANSLATION))
 
-_CUBE_VERTEX_INDICES = wp.constant(wp.vec(length=8, dtype=int)([0, 4, 3, 7, 1, 5, 2, 6]))
+# map from shape function vertex indexing to hexmesh vertex indexing
+_CUBE_TO_HEX_VERTEX = wp.constant(wp.vec(length=8, dtype=int)([0, 4, 3, 7, 1, 5, 2, 6]))
+
+# map from shape function edge indexing to hexmesh edge indexing
+_CUBE_TO_HEX_EDGE = wp.constant(wp.vec(length=12, dtype=int)([0, 4, 2, 6, 3, 1, 7, 5, 8, 11, 9, 10]))
 
 
 @wp.struct
@@ -45,9 +46,12 @@ def __init__(
         need_hex_edge_indices: bool = True,
         need_hex_face_indices: bool = True,
     ):
+        if not is_closed(shape.family):
+            raise ValueError("A closed polynomial family is required to define a continuous function space")
+
         super().__init__(mesh, shape.NODES_PER_ELEMENT)
         self._mesh = mesh
-        self._shape = shape
+        self.shape = shape
 
         if need_hex_edge_indices:
             self._hex_edge_indices = self._mesh.hex_edge_indices
@@ -111,44 +115,6 @@ def _compute_hex_face_indices_kernel(
         hex_face_indices[hx1, local_face_1] = wp.vec2i(f, ori_1)
 
 
-class HexmeshDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    SpaceTopology,
-):
-    def __init__(self, mesh: Hexmesh, shape: ShapeFunction):
-        super().__init__(mesh, shape.NODES_PER_ELEMENT)
-
-
-class HexmeshBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: HexmeshSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._mesh: Hexmesh = topology.geometry
-
-
-class HexmeshPiecewiseConstantBasis(HexmeshBasisSpace):
-    def __init__(self, mesh: Hexmesh):
-        shape = ConstantShapeFunction(mesh.reference_cell(), space_dimension=3)
-        topology = HexmeshDiscontinuousSpaceTopology(mesh, shape)
-        super().__init__(shape=shape, topology=topology)
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Hexmesh.SideArg,
-            basis_arg: HexmeshBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(0.5, 0.5, 0.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return HexmeshPiecewiseConstantBasis.Trace(self)
-
-
 class HexmeshTripolynomialSpaceTopology(HexmeshSpaceTopology):
     def __init__(self, mesh: Hexmesh, shape: CubeTripolynomialShapeFunctions):
         super().__init__(mesh, shape, need_hex_edge_indices=shape.ORDER >= 2, need_hex_face_indices=shape.ORDER >= 2)
@@ -156,7 +122,7 @@ def __init__(self, mesh: Hexmesh, shape: CubeTripolynomialShapeFunctions):
         self.element_node_index = self._make_element_node_index()
 
     def node_count(self) -> int:
-        ORDER = self._shape.ORDER
+        ORDER = self.shape.ORDER
         INTERIOR_NODES_PER_EDGE = max(0, ORDER - 1)
         INTERIOR_NODES_PER_FACE = INTERIOR_NODES_PER_EDGE**2
         INTERIOR_NODES_PER_CELL = INTERIOR_NODES_PER_EDGE**3
@@ -182,7 +148,7 @@ def _rotate_face_index(type_index: int, ori: int, size: int):
         return rot_i * size + rot_j
 
     def _make_element_node_index(self):
-        ORDER = self._shape.ORDER
+        ORDER = self.shape.ORDER
         INTERIOR_NODES_PER_EDGE = wp.constant(max(0, ORDER - 1))
         INTERIOR_NODES_PER_FACE = wp.constant(INTERIOR_NODES_PER_EDGE**2)
         INTERIOR_NODES_PER_CELL = wp.constant(INTERIOR_NODES_PER_EDGE**3)
@@ -194,18 +160,19 @@ def element_node_index(
             element_index: ElementIndex,
             node_index_in_elt: int,
         ):
-            node_type, type_instance, type_index = self._shape.node_type_and_type_index(node_index_in_elt)
+            node_type, type_instance, type_index = self.shape.node_type_and_type_index(node_index_in_elt)
 
             if node_type == CubeTripolynomialShapeFunctions.VERTEX:
-                return geo_arg.hex_vertex_indices[element_index, _CUBE_VERTEX_INDICES[type_instance]]
+                return geo_arg.hex_vertex_indices[element_index, _CUBE_TO_HEX_VERTEX[type_instance]]
 
             offset = topo_arg.vertex_count
 
             if node_type == CubeTripolynomialShapeFunctions.EDGE:
-                edge_index = topo_arg.hex_edge_indices[element_index, type_instance]
+                hex_edge = _CUBE_TO_HEX_EDGE[type_instance]
+                edge_index = topo_arg.hex_edge_indices[element_index, hex_edge]
 
-                v0 = geo_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[type_instance, 0]]
-                v1 = geo_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[type_instance, 1]]
+                v0 = geo_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[hex_edge, 0]]
+                v1 = geo_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[hex_edge, 1]]
 
                 if v0 > v1:
                     type_index = ORDER - 1 - type_index
@@ -232,52 +199,21 @@ def element_node_index(
         return element_node_index
 
 
-class HexmeshTripolynomialBasisSpace(HexmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Hexmesh,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        if not is_closed(family):
-            raise ValueError("A closed polynomial family is required to define a continuous function space")
-
-        shape = CubeTripolynomialShapeFunctions(degree, family=family)
-        topology = forward_base_topology(HexmeshTripolynomialSpaceTopology, mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class HexmeshDGTripolynomialBasisSpace(HexmeshBasisSpace):
+class HexmeshSerendipitySpaceTopology(HexmeshSpaceTopology):
     def __init__(
         self,
-        mesh: Hexmesh,
-        degree: int,
-        family: Polynomial,
+        grid: Hexmesh,
+        shape: CubeSerendipityShapeFunctions,
     ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeTripolynomialShapeFunctions(degree, family=family)
-        topology = HexmeshDiscontinuousSpaceTopology(mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class HexmeshSerendipitySpaceTopology(HexmeshSpaceTopology):
-    def __init__(self, grid: Hexmesh, shape: CubeSerendipityShapeFunctions):
         super().__init__(grid, shape, need_hex_edge_indices=True, need_hex_face_indices=False)
 
         self.element_node_index = self._make_element_node_index()
 
     def node_count(self) -> int:
-        return self.geometry.vertex_count() + (self._shape.ORDER - 1) * self.geometry.edge_count()
+        return self.geometry.vertex_count() + (self.shape.ORDER - 1) * self.geometry.edge_count()
 
     def _make_element_node_index(self):
-        ORDER = self._shape.ORDER
+        ORDER = self.shape.ORDER
 
         @cache.dynamic_func(suffix=self.name)
         def element_node_index(
@@ -286,17 +222,18 @@ def element_node_index(
             element_index: ElementIndex,
             node_index_in_elt: int,
         ):
-            node_type, type_index = self._shape.node_type_and_type_index(node_index_in_elt)
+            node_type, type_index = self.shape.node_type_and_type_index(node_index_in_elt)
 
             if node_type == CubeSerendipityShapeFunctions.VERTEX:
-                return cell_arg.hex_vertex_indices[element_index, _CUBE_VERTEX_INDICES[type_index]]
+                return cell_arg.hex_vertex_indices[element_index, _CUBE_TO_HEX_VERTEX[type_index]]
 
             type_instance, index_in_edge = CubeSerendipityShapeFunctions._cube_edge_index(node_type, type_index)
+            hex_edge = _CUBE_TO_HEX_EDGE[type_instance]
 
-            edge_index = topo_arg.hex_edge_indices[element_index, type_instance]
+            edge_index = topo_arg.hex_edge_indices[element_index, hex_edge]
 
-            v0 = cell_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[type_instance, 0]]
-            v1 = cell_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[type_instance, 1]]
+            v0 = cell_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[hex_edge, 0]]
+            v1 = cell_arg.hex_vertex_indices[element_index, EDGE_VERTEX_INDICES[hex_edge, 1]]
 
             if v0 > v1:
                 index_in_edge = ORDER - 1 - index_in_edge
@@ -306,45 +243,11 @@ def element_node_index(
         return element_node_index
 
 
-class HexmeshSerendipityBasisSpace(HexmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Hexmesh,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeSerendipityShapeFunctions(degree, family=family)
-        topology = forward_base_topology(HexmeshSerendipitySpaceTopology, mesh, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
+def make_hexmesh_space_topology(mesh: Hexmesh, shape: ShapeFunction):
+    if isinstance(shape, CubeSerendipityShapeFunctions):
+        return forward_base_topology(HexmeshSerendipitySpaceTopology, mesh, shape)
 
+    if isinstance(shape, CubeTripolynomialShapeFunctions):
+        return forward_base_topology(HexmeshTripolynomialSpaceTopology, mesh, shape)
 
-class HexmeshDGSerendipityBasisSpace(HexmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Hexmesh,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = CubeSerendipityShapeFunctions(degree, family=family)
-        topology = HexmeshDiscontinuousSpaceTopology(mesh, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
-
-class HexmeshPolynomialBasisSpace(HexmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Hexmesh,
-        degree: int,
-    ):
-        shape = CubeNonConformingPolynomialShapeFunctions(degree)
-        topology = HexmeshDiscontinuousSpaceTopology(mesh, shape)
-
-        super().__init__(topology, shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")
diff --git a/warp/fem/space/nanogrid_function_space.py b/warp/fem/space/nanogrid_function_space.py
new file mode 100644
index 000000000..091ae88b4
--- /dev/null
+++ b/warp/fem/space/nanogrid_function_space.py
@@ -0,0 +1,202 @@
+import warp as wp
+from warp.fem import cache
+from warp.fem.geometry import Nanogrid
+from warp.fem.geometry.nanogrid import _add_axis_flag
+from warp.fem.polynomial import is_closed
+from warp.fem.types import ElementIndex
+
+from .shape import (
+    CubeSerendipityShapeFunctions,
+    CubeTripolynomialShapeFunctions,
+    ShapeFunction,
+)
+from .topology import SpaceTopology, forward_base_topology
+
+
+@wp.struct
+class NanogridTopologyArg:
+    vertex_grid: wp.uint64  # id of the index grid used to look up vertex indices
+    face_grid: wp.uint64  # id of the index grid used to look up face indices
+    edge_grid: wp.uint64  # id of the index grid used to look up edge indices
+
+    vertex_count: int  # vertex total; offset applied before edge nodes
+    edge_count: int  # edge total; scales the offset applied before face nodes
+    face_count: int  # face total; scales the offset applied before cell-interior nodes
+
+
+class NanogridSpaceTopology(SpaceTopology):  # shared base of the Nanogrid topologies defined below
+    TopologyArg = NanogridTopologyArg
+
+    def __init__(
+        self,
+        grid: Nanogrid,
+        shape: ShapeFunction,
+        need_edge_indices: bool = True,
+        need_face_indices: bool = True,  # NOTE(review): never read in this class -- confirm this is intended
+    ):
+        if not is_closed(shape.family):
+            raise ValueError("A closed polynomial family is required to define a continuous function space")
+
+        super().__init__(grid, shape.NODES_PER_ELEMENT)
+        self._grid = grid
+        self._shape = shape
+
+        if need_edge_indices:
+            self._edge_count = self._grid.edge_count()  # only queried when the subclass asks for edge nodes
+        else:
+            self._edge_count = 0
+
+        self._vertex_grid = grid._node_grid
+        self._face_grid = grid._face_grid
+        self._edge_grid = grid._edge_grid  # may be None, see the guard in topo_arg_value
+
+    @cache.cached_arg_value
+    def topo_arg_value(self, device):  # fills the struct passed to element_node_index as topo_arg
+        arg = NanogridTopologyArg()
+
+        arg.vertex_grid = self._vertex_grid.id
+        arg.face_grid = self._face_grid.id
+        arg.edge_grid = -1 if self._edge_grid is None else self._edge_grid.id  # NOTE(review): -1 into a wp.uint64 field -- confirm
+
+        arg.vertex_count = self._grid.vertex_count()
+        arg.face_count = self._grid.side_count()  # faces are the geometry's sides
+        arg.edge_count = self._edge_count  # 0 when edge indices were not requested
+        return arg
+
+
+@wp.func
+def _cell_vertex_coord(cell_ijk: wp.vec3i, n: int):  # coordinates of corner n of the cell at cell_ijk
+    return cell_ijk + wp.vec3i((n & 4) >> 2, (n & 2) >> 1, n & 1)  # bits 2/1/0 of n: 0-or-1 step on axes 0/1/2
+
+
+@wp.func
+def _cell_edge_coord(cell_ijk: wp.vec3i, axis: int, offset: int):  # lookup coordinates of a cell edge along `axis`
+    e_ijk = cell_ijk
+    e_ijk[(axis + 1) % 3] += offset >> 1  # bit 1 of offset (callers pass 0..3) steps along the next axis, cyclically
+    e_ijk[(axis + 2) % 3] += offset & 1  # bit 0 of offset steps along the remaining axis
+    return _add_axis_flag(e_ijk, axis)  # presumably tags the coordinate with its axis; helper not visible here -- TODO confirm
+
+
+@wp.func
+def _cell_face_coord(cell_ijk: wp.vec3i, axis: int, offset: int):  # lookup coordinates of a cell face indexed by `axis`
+    f_ijk = cell_ijk
+    f_ijk[axis] += offset  # callers pass 0 or 1: keep the cell coordinate or step once along `axis`
+    return _add_axis_flag(f_ijk, axis)  # presumably tags the coordinate with its axis; helper not visible here -- TODO confirm
+
+
+class NanogridTripolynomialSpaceTopology(NanogridSpaceTopology):  # node numbering for tripolynomial cube shapes
+    def __init__(self, grid: Nanogrid, shape: CubeTripolynomialShapeFunctions):  # ORDER 1 has vertex nodes only
+        super().__init__(grid, shape, need_edge_indices=shape.ORDER >= 2, need_face_indices=shape.ORDER >= 2)
+
+        self.element_node_index = self._make_element_node_index()
+
+    def node_count(self) -> int:
+        ORDER = self._shape.ORDER
+        INTERIOR_NODES_PER_EDGE = max(0, ORDER - 1)
+        INTERIOR_NODES_PER_FACE = INTERIOR_NODES_PER_EDGE**2
+        INTERIOR_NODES_PER_CELL = INTERIOR_NODES_PER_EDGE**3
+
+        return (  # vertices, then the interior nodes of every edge, face and cell
+            self._grid.vertex_count()
+            + self._grid.edge_count() * INTERIOR_NODES_PER_EDGE
+            + self._grid.side_count() * INTERIOR_NODES_PER_FACE
+            + self._grid.cell_count() * INTERIOR_NODES_PER_CELL
+        )
+
+    def _make_element_node_index(self):  # builds the device function mapping (cell, local node) to a global node index
+        ORDER = self._shape.ORDER
+        INTERIOR_NODES_PER_EDGE = wp.constant(max(0, ORDER - 1))
+        INTERIOR_NODES_PER_FACE = wp.constant(INTERIOR_NODES_PER_EDGE**2)
+        INTERIOR_NODES_PER_CELL = wp.constant(INTERIOR_NODES_PER_EDGE**3)
+
+        @cache.dynamic_func(suffix=self.name)
+        def element_node_index(
+            geo_arg: Nanogrid.CellArg,
+            topo_arg: NanogridTopologyArg,
+            element_index: ElementIndex,
+            node_index_in_elt: int,
+        ):
+            node_type, type_instance, type_index = self._shape.node_type_and_type_index(node_index_in_elt)
+
+            ijk = geo_arg.cell_ijk[element_index]  # integer coordinates of this cell
+
+            if node_type == CubeTripolynomialShapeFunctions.VERTEX:
+                n_ijk = _cell_vertex_coord(ijk, type_instance)  # type_instance selects the corner
+                return wp.volume_lookup_index(topo_arg.vertex_grid, n_ijk[0], n_ijk[1], n_ijk[2])
+
+            offset = topo_arg.vertex_count  # edge nodes are numbered after all vertex nodes
+
+            if node_type == CubeTripolynomialShapeFunctions.EDGE:
+                axis = type_instance >> 2  # edge instances come in groups of 4 per axis
+                node_offset = type_instance & 3  # which of the 4 edges along that axis
+
+                n_ijk = _cell_edge_coord(ijk, axis, node_offset)
+
+                edge_index = wp.volume_lookup_index(topo_arg.edge_grid, n_ijk[0], n_ijk[1], n_ijk[2])
+                return offset + INTERIOR_NODES_PER_EDGE * edge_index + type_index
+
+            offset += INTERIOR_NODES_PER_EDGE * topo_arg.edge_count  # face nodes follow all edge nodes
+
+            if node_type == CubeTripolynomialShapeFunctions.FACE:
+                axis = type_instance >> 1  # face instances come in pairs per axis
+                node_offset = type_instance & 1  # which of the 2 faces for that axis
+
+                n_ijk = _cell_face_coord(ijk, axis, node_offset)
+
+                face_index = wp.volume_lookup_index(topo_arg.face_grid, n_ijk[0], n_ijk[1], n_ijk[2])
+                return offset + INTERIOR_NODES_PER_FACE * face_index + type_index
+
+            offset += INTERIOR_NODES_PER_FACE * topo_arg.face_count  # cell-interior nodes come last
+
+            return offset + INTERIOR_NODES_PER_CELL * element_index + type_index  # interior nodes are indexed per cell
+
+        return element_node_index
+
+
+class NanogridSerendipitySpaceTopology(NanogridSpaceTopology):  # node numbering for serendipity cube shapes
+    def __init__(self, grid: Nanogrid, shape: CubeSerendipityShapeFunctions):  # nodes sit on vertices and edges only
+        super().__init__(grid, shape, need_edge_indices=True, need_face_indices=False)
+
+        self.element_node_index = self._make_element_node_index()
+
+    def node_count(self) -> int:  # one node per vertex plus ORDER - 1 nodes per edge
+        return self.geometry.vertex_count() + (self._shape.ORDER - 1) * self.geometry.edge_count()
+
+    def _make_element_node_index(self):  # builds the device function mapping (cell, local node) to a global node index
+        ORDER = self._shape.ORDER
+
+        @cache.dynamic_func(suffix=self.name)
+        def element_node_index(
+            cell_arg: Nanogrid.CellArg,
+            topo_arg: NanogridSpaceTopology.TopologyArg,
+            element_index: ElementIndex,
+            node_index_in_elt: int,
+        ):
+            node_type, type_index = self._shape.node_type_and_type_index(node_index_in_elt)
+
+            ijk = cell_arg.cell_ijk[element_index]  # integer coordinates of this cell
+
+            if node_type == CubeSerendipityShapeFunctions.VERTEX:
+                n_ijk = _cell_vertex_coord(ijk, type_index)  # type_index selects the corner
+                return wp.volume_lookup_index(topo_arg.vertex_grid, n_ijk[0], n_ijk[1], n_ijk[2])
+
+            type_instance, index_in_edge = CubeSerendipityShapeFunctions._cube_edge_index(node_type, type_index)
+            axis = type_instance >> 2  # edge instances come in groups of 4 per axis
+            node_offset = type_instance & 3  # which of the 4 edges along that axis
+
+            n_ijk = _cell_edge_coord(ijk, axis, node_offset)
+
+            edge_index = wp.volume_lookup_index(topo_arg.edge_grid, n_ijk[0], n_ijk[1], n_ijk[2])
+            return topo_arg.vertex_count + (ORDER - 1) * edge_index + index_in_edge  # edge nodes follow all vertex nodes
+
+        return element_node_index
+
+
+def make_nanogrid_space_topology(grid: Nanogrid, shape: ShapeFunction):  # picks the topology class from the shape type
+    if isinstance(shape, CubeSerendipityShapeFunctions):
+        return forward_base_topology(NanogridSerendipitySpaceTopology, grid, shape)
+
+    if isinstance(shape, CubeTripolynomialShapeFunctions):
+        return forward_base_topology(NanogridTripolynomialSpaceTopology, grid, shape)
+
+    raise ValueError(f"Unsupported shape function {shape.name}")  # no Nanogrid topology for any other shape type
diff --git a/warp/fem/space/quadmesh_2d_function_space.py b/warp/fem/space/quadmesh_2d_function_space.py
index 27803b556..22f1e9a56 100644
--- a/warp/fem/space/quadmesh_2d_function_space.py
+++ b/warp/fem/space/quadmesh_2d_function_space.py
@@ -1,18 +1,15 @@
 import warp as wp
 from warp.fem import cache
 from warp.fem.geometry import Quadmesh2D
-from warp.fem.polynomial import Polynomial, is_closed
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.polynomial import is_closed
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
 from .shape import (
-    ConstantShapeFunction,
     ShapeFunction,
     SquareBipolynomialShapeFunctions,
-    SquareNonConformingPolynomialShapeFunctions,
     SquareSerendipityShapeFunctions,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 
 @wp.struct
@@ -28,6 +25,9 @@ class Quadmesh2DSpaceTopology(SpaceTopology):
     TopologyArg = Quadmesh2DTopologyArg
 
     def __init__(self, mesh: Quadmesh2D, shape: ShapeFunction):
+        if not is_closed(shape.family):
+            raise ValueError("A closed polynomial family is required to define a continuous function space")
+
         super().__init__(mesh, shape.NODES_PER_ELEMENT)
         self._mesh = mesh
         self._shape = shape
@@ -107,44 +107,6 @@ def _compute_quad_edge_indices_kernel(
             quad_edge_indices[q1, t1_edge] = e
 
 
-class Quadmesh2DDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    SpaceTopology,
-):
-    def __init__(self, mesh: Quadmesh2D, shape: ShapeFunction):
-        super().__init__(mesh, shape.NODES_PER_ELEMENT)
-
-
-class Quadmesh2DBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: Quadmesh2DSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._mesh: Quadmesh2D = topology.geometry
-
-
-class Quadmesh2DPiecewiseConstantBasis(Quadmesh2DBasisSpace):
-    def __init__(self, mesh: Quadmesh2D):
-        shape = ConstantShapeFunction(mesh.reference_cell(), space_dimension=2)
-        topology = Quadmesh2DDiscontinuousSpaceTopology(mesh, shape)
-        super().__init__(shape=shape, topology=topology)
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Quadmesh2D.SideArg,
-            basis_arg: Quadmesh2DBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(0.5, 0.0, 0.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return Quadmesh2DPiecewiseConstantBasis.Trace(self)
-
-
 class Quadmesh2DBipolynomialSpaceTopology(Quadmesh2DSpaceTopology):
     def __init__(self, mesh: Quadmesh2D, shape: SquareBipolynomialShapeFunctions):
         super().__init__(mesh, shape)
@@ -236,41 +198,6 @@ def element_node_index(
         return element_node_index
 
 
-class Quadmesh2DBipolynomialBasisSpace(Quadmesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Quadmesh2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        if not is_closed(family):
-            raise ValueError("A closed polynomial family is required to define a continuous function space")
-
-        shape = SquareBipolynomialShapeFunctions(degree, family=family)
-        topology = forward_base_topology(Quadmesh2DBipolynomialSpaceTopology, mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class Quadmesh2DDGBipolynomialBasisSpace(Quadmesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Quadmesh2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = SquareBipolynomialShapeFunctions(degree, family=family)
-        topology = Quadmesh2DDiscontinuousSpaceTopology(mesh, shape)
-
-        super().__init__(topology, shape)
-
-
 class Quadmesh2DSerendipitySpaceTopology(Quadmesh2DSpaceTopology):
     def __init__(self, grid: Quadmesh2D, shape: SquareSerendipityShapeFunctions):
         super().__init__(grid, shape)
@@ -324,45 +251,11 @@ def element_node_index(
         return element_node_index
 
 
-class Quadmesh2DSerendipityBasisSpace(Quadmesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Quadmesh2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
+def make_quadmesh_2d_space_topology(mesh: Quadmesh2D, shape: ShapeFunction):
+    if isinstance(shape, SquareSerendipityShapeFunctions):
+        return forward_base_topology(Quadmesh2DSerendipitySpaceTopology, mesh, shape)
 
-        shape = SquareSerendipityShapeFunctions(degree, family=family)
-        topology = forward_base_topology(Quadmesh2DSerendipitySpaceTopology, mesh, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
-
-class Quadmesh2DDGSerendipityBasisSpace(Quadmesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Quadmesh2D,
-        degree: int,
-        family: Polynomial,
-    ):
-        if family is None:
-            family = Polynomial.LOBATTO_GAUSS_LEGENDRE
-
-        shape = SquareSerendipityShapeFunctions(degree, family=family)
-        topology = Quadmesh2DDiscontinuousSpaceTopology(mesh, shape=shape)
-
-        super().__init__(topology=topology, shape=shape)
-
-
-class Quadmesh2DPolynomialBasisSpace(Quadmesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Quadmesh2D,
-        degree: int,
-    ):
-        shape = SquareNonConformingPolynomialShapeFunctions(degree)
-        topology = Quadmesh2DDiscontinuousSpaceTopology(mesh, shape)
+    if isinstance(shape, SquareBipolynomialShapeFunctions):
+        return forward_base_topology(Quadmesh2DBipolynomialSpaceTopology, mesh, shape)
 
-        super().__init__(topology, shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")
diff --git a/warp/fem/space/restriction.py b/warp/fem/space/restriction.py
index 5a2098e55..561f786eb 100644
--- a/warp/fem/space/restriction.py
+++ b/warp/fem/space/restriction.py
@@ -38,7 +38,10 @@ def _compute_node_element_indices(self, device, temporary_store: TemporaryStore)
 
         NODES_PER_ELEMENT = self.space_topology.NODES_PER_ELEMENT
 
-        @cache.dynamic_kernel(suffix=f"{self.domain.name}_{self.space_topology.name}_{self.space_partition.name}")
+        @cache.dynamic_kernel(
+            suffix=f"{self.domain.name}_{self.space_topology.name}_{self.space_partition.name}",
+            kernel_options={"max_unroll": 8},
+        )
         def fill_element_node_indices(
             element_arg: self.domain.ElementArg,
             domain_index_arg: self.domain.ElementIndexArg,
diff --git a/warp/fem/space/shape/__init__.py b/warp/fem/space/shape/__init__.py
index 5ff73d0ed..0d9f64ed7 100644
--- a/warp/fem/space/shape/__init__.py
+++ b/warp/fem/space/shape/__init__.py
@@ -1,3 +1,9 @@
+from enum import Enum
+from typing import Optional
+
+from warp.fem.geometry import element as _element
+from warp.fem.polynomial import Polynomial
+
 from .cube_shape_function import (
     CubeNonConformingPolynomialShapeFunctions,
     CubeSerendipityShapeFunctions,
@@ -11,3 +17,74 @@
 )
 from .tet_shape_function import TetrahedronNonConformingPolynomialShapeFunctions, TetrahedronPolynomialShapeFunctions
 from .triangle_shape_function import Triangle2DNonConformingPolynomialShapeFunctions, Triangle2DPolynomialShapeFunctions
+
+
+class ElementBasis(Enum):
+    """Choice of basis function to equip individual elements"""
+
+    LAGRANGE = 0
+    """Lagrange basis functions :math:`P_k` for simplices, tensor products :math:`Q_k` for squares and cubes"""
+    SERENDIPITY = 1
+    """Serendipity elements :math:`S_k`, corresponding to Lagrange nodes with interior points removed (for degree <= 3)"""
+    NONCONFORMING_POLYNOMIAL = 2
+    """Simplex Lagrange basis functions :math:`P_{kd}` embedded into non conforming reference elements (e.g. squares or cubes). Discontinuous only."""
+
+
+def get_shape_function(
+    element: _element.Element,
+    space_dimension: int,
+    degree: int,
+    element_basis: ElementBasis,
+    family: Optional[Polynomial] = None,
+):
+    """
+    Equips a reference element with a shape function basis.
+
+    Args:
+        element: the reference element on which to build the shape function
+        space_dimension: the dimension of the embedding space
+        degree: polynomial degree of the per-element shape functions
+        element_basis: type of basis function for the individual elements
+        family: Polynomial family used to generate the shape function basis. Defaults to Lobatto-Gauss-Legendre when not provided; not forwarded to simplex and non-conforming bases.
+
+    Returns:
+        the corresponding shape function. Raises ``NotImplementedError`` if the element type or the requested basis variant is not supported.
+    """
+
+    if degree == 0:
+        return ConstantShapeFunction(element, space_dimension)
+
+    if family is None:
+        family = Polynomial.LOBATTO_GAUSS_LEGENDRE
+
+    if isinstance(element, _element.Square):
+        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
+            return SquareNonConformingPolynomialShapeFunctions(degree=degree)
+        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
+            return SquareSerendipityShapeFunctions(degree=degree, family=family)
+
+        return SquareBipolynomialShapeFunctions(degree=degree, family=family)
+    if isinstance(element, _element.Triangle):
+        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
+            return Triangle2DNonConformingPolynomialShapeFunctions(degree=degree)
+        if element_basis == ElementBasis.SERENDIPITY and degree > 2:
+            raise NotImplementedError("Serendipity variant not implemented yet for Triangle elements")
+
+        return Triangle2DPolynomialShapeFunctions(degree=degree)
+
+    if isinstance(element, _element.Cube):
+        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
+            return CubeNonConformingPolynomialShapeFunctions(degree=degree)
+        if element_basis == ElementBasis.SERENDIPITY and degree > 1:
+            return CubeSerendipityShapeFunctions(degree=degree, family=family)
+
+        return CubeTripolynomialShapeFunctions(degree=degree, family=family)
+    if isinstance(element, _element.Tetrahedron):
+        if element_basis == ElementBasis.NONCONFORMING_POLYNOMIAL:
+            return TetrahedronNonConformingPolynomialShapeFunctions(degree=degree)
+        if element_basis == ElementBasis.SERENDIPITY and degree > 2:
+            raise NotImplementedError("Serendipity variant not implemented yet for Tet elements")
+
+        return TetrahedronPolynomialShapeFunctions(degree=degree)
+
+    raise NotImplementedError("Unrecognized element type")  # raise, not return: callers must never receive an exception object
diff --git a/warp/fem/space/shape/cube_shape_function.py b/warp/fem/space/shape/cube_shape_function.py
index bc6700781..a2ce5983a 100644
--- a/warp/fem/space/shape/cube_shape_function.py
+++ b/warp/fem/space/shape/cube_shape_function.py
@@ -10,16 +10,6 @@
 
 from .tet_shape_function import TetrahedronPolynomialShapeFunctions
 
-_CUBE_EDGE_INDICES = wp.constant(
-    wp.mat(shape=(3, 4), dtype=int)(
-        [
-            [0, 4, 2, 6],
-            [3, 1, 7, 5],
-            [8, 11, 9, 10],
-        ]
-    )
-)
-
 
 class CubeTripolynomialShapeFunctions:
     VERTEX = 0
@@ -96,13 +86,13 @@ def node_type_and_type_index(
                         return CubeTripolynomialShapeFunctions.VERTEX, type_instance, 0
 
                     # z edge
-                    type_instance = _CUBE_EDGE_INDICES[2, mi * 2 + mj]
+                    type_instance = 8 + mi * 2 + mj
                     type_index = k - 1
                     return CubeTripolynomialShapeFunctions.EDGE, type_instance, type_index
 
                 if zk + mk == 1:
                     # y edge
-                    type_instance = _CUBE_EDGE_INDICES[1, mk * 2 + mi]
+                    type_instance = 4 + mk * 2 + mi
                     type_index = j - 1
                     return CubeTripolynomialShapeFunctions.EDGE, type_instance, type_index
 
@@ -114,7 +104,7 @@ def node_type_and_type_index(
             if zj + mj == 1:
                 if zk + mk == 1:
                     # x edge
-                    type_instance = _CUBE_EDGE_INDICES[0, mj * 2 + mk]
+                    type_instance = mj * 2 + mk
                     type_index = i - 1
                     return CubeTripolynomialShapeFunctions.EDGE, type_instance, type_index
 
@@ -399,7 +389,7 @@ def _vertex_coords(vidx_in_cell: int):
     def _edge_coords(type_index: int):
         index_in_side = type_index // 4
         side_offset = type_index - 4 * index_in_side
-        return (wp.vec3i(index_in_side + 1, side_offset // 2, side_offset % 2),)
+        return wp.vec3i(index_in_side + 1, side_offset // 2, side_offset & 1)
 
     @wp.func
     def _edge_axis(node_type: int):
@@ -410,7 +400,7 @@ def _cube_edge_index(node_type: int, type_index: int):
         index_in_side = type_index // 4
         side_offset = type_index - 4 * index_in_side
 
-        return _CUBE_EDGE_INDICES[node_type - CubeSerendipityShapeFunctions.EDGE_X, side_offset], index_in_side
+        return 4 * (node_type - CubeSerendipityShapeFunctions.EDGE_X) + side_offset, index_in_side
 
     def _get_node_lobatto_indices(self):
         ORDER = self.ORDER
diff --git a/warp/fem/space/tetmesh_function_space.py b/warp/fem/space/tetmesh_function_space.py
index b998a46f2..1ff789017 100644
--- a/warp/fem/space/tetmesh_function_space.py
+++ b/warp/fem/space/tetmesh_function_space.py
@@ -1,16 +1,13 @@
 import warp as wp
 from warp.fem import cache
 from warp.fem.geometry import Tetmesh
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
 from .shape import (
-    ConstantShapeFunction,
     ShapeFunction,
-    TetrahedronNonConformingPolynomialShapeFunctions,
     TetrahedronPolynomialShapeFunctions,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 
 @wp.struct
@@ -130,44 +127,6 @@ def _compute_tet_face_indices_kernel(
             tet_face_indices[t1, t1_face] = e
 
 
-class TetmeshDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    SpaceTopology,
-):
-    def __init__(self, mesh: Tetmesh, shape: ShapeFunction):
-        super().__init__(mesh, shape.NODES_PER_ELEMENT)
-
-
-class TetmeshBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: TetmeshSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._mesh: Tetmesh = topology.geometry
-
-
-class TetmeshPiecewiseConstantBasis(TetmeshBasisSpace):
-    def __init__(self, mesh: Tetmesh):
-        shape = ConstantShapeFunction(mesh.reference_cell(), space_dimension=3)
-        topology = TetmeshDiscontinuousSpaceTopology(mesh, shape)
-        super().__init__(shape=shape, topology=topology)
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Tetmesh.SideArg,
-            basis_arg: TetmeshBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return TetmeshPiecewiseConstantBasis.Trace(self)
-
-
 class TetmeshPolynomialSpaceTopology(TetmeshSpaceTopology):
     def __init__(self, mesh: Tetmesh, shape: TetrahedronPolynomialShapeFunctions):
         super().__init__(mesh, shape, need_tet_edge_indices=shape.ORDER >= 2, need_tet_face_indices=shape.ORDER >= 3)
@@ -258,37 +217,8 @@ def element_node_index(
         return element_node_index
 
 
-class TetmeshPolynomialBasisSpace(TetmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Tetmesh,
-        degree: int,
-    ):
-        shape = TetrahedronPolynomialShapeFunctions(degree)
-        topology = forward_base_topology(TetmeshPolynomialSpaceTopology, mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class TetmeshDGPolynomialBasisSpace(TetmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Tetmesh,
-        degree: int,
-    ):
-        shape = TetrahedronPolynomialShapeFunctions(degree)
-        topology = TetmeshDiscontinuousSpaceTopology(mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class TetmeshNonConformingPolynomialBasisSpace(TetmeshBasisSpace):
-    def __init__(
-        self,
-        mesh: Tetmesh,
-        degree: int,
-    ):
-        shape = TetrahedronNonConformingPolynomialShapeFunctions(degree)
-        topology = TetmeshDiscontinuousSpaceTopology(mesh, shape)
+def make_tetmesh_space_topology(mesh: Tetmesh, shape: ShapeFunction):
+    if isinstance(shape, TetrahedronPolynomialShapeFunctions):
+        return forward_base_topology(TetmeshPolynomialSpaceTopology, mesh, shape)
 
-        super().__init__(topology, shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")
diff --git a/warp/fem/space/trimesh_2d_function_space.py b/warp/fem/space/trimesh_2d_function_space.py
index 9149f63f2..dc4cc2864 100644
--- a/warp/fem/space/trimesh_2d_function_space.py
+++ b/warp/fem/space/trimesh_2d_function_space.py
@@ -1,16 +1,13 @@
 import warp as wp
 from warp.fem import cache
 from warp.fem.geometry import Trimesh2D
-from warp.fem.types import Coords, ElementIndex
+from warp.fem.types import ElementIndex
 
-from .basis_space import ShapeBasisSpace, TraceBasisSpace
 from .shape import (
-    ConstantShapeFunction,
     ShapeFunction,
-    Triangle2DNonConformingPolynomialShapeFunctions,
     Triangle2DPolynomialShapeFunctions,
 )
-from .topology import DiscontinuousSpaceTopologyMixin, SpaceTopology, forward_base_topology
+from .topology import SpaceTopology, forward_base_topology
 
 
 @wp.struct
@@ -95,44 +92,6 @@ def _compute_tri_edge_indices_kernel(
             tri_edge_indices[t1, t1_edge] = e
 
 
-class Trimesh2DDiscontinuousSpaceTopology(
-    DiscontinuousSpaceTopologyMixin,
-    SpaceTopology,
-):
-    def __init__(self, mesh: Trimesh2D, shape: ShapeFunction):
-        super().__init__(mesh, shape.NODES_PER_ELEMENT)
-
-
-class Trimesh2DBasisSpace(ShapeBasisSpace):
-    def __init__(self, topology: Trimesh2DSpaceTopology, shape: ShapeFunction):
-        super().__init__(topology, shape)
-
-        self._mesh: Trimesh2D = topology.geometry
-
-
-class Trimesh2DPiecewiseConstantBasis(Trimesh2DBasisSpace):
-    def __init__(self, mesh: Trimesh2D):
-        shape = ConstantShapeFunction(mesh.reference_cell(), space_dimension=2)
-        topology = Trimesh2DDiscontinuousSpaceTopology(mesh, shape)
-        super().__init__(shape=shape, topology=topology)
-
-    class Trace(TraceBasisSpace):
-        @wp.func
-        def _node_coords_in_element(
-            side_arg: Trimesh2D.SideArg,
-            basis_arg: Trimesh2DBasisSpace.BasisArg,
-            element_index: ElementIndex,
-            node_index_in_element: int,
-        ):
-            return Coords(0.5, 0.0, 0.0)
-
-        def make_node_coords_in_element(self):
-            return self._node_coords_in_element
-
-    def trace(self):
-        return Trimesh2DPiecewiseConstantBasis.Trace(self)
-
-
 class Trimesh2DPolynomialSpaceTopology(Trimesh2DSpaceTopology):
     def __init__(self, mesh: Trimesh2D, shape: Triangle2DPolynomialShapeFunctions):
         super().__init__(mesh, shape)
@@ -187,37 +146,8 @@ def element_node_index(
         return element_node_index
 
 
-class Trimesh2DPolynomialBasisSpace(Trimesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Trimesh2D,
-        degree: int,
-    ):
-        shape = Triangle2DPolynomialShapeFunctions(degree)
-        topology = forward_base_topology(Trimesh2DPolynomialSpaceTopology, mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class Trimesh2DDGPolynomialBasisSpace(Trimesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Trimesh2D,
-        degree: int,
-    ):
-        shape = Triangle2DPolynomialShapeFunctions(degree)
-        topology = Trimesh2DDiscontinuousSpaceTopology(mesh, shape)
-
-        super().__init__(topology, shape)
-
-
-class Trimesh2DNonConformingPolynomialBasisSpace(Trimesh2DBasisSpace):
-    def __init__(
-        self,
-        mesh: Trimesh2D,
-        degree: int,
-    ):
-        shape = Triangle2DNonConformingPolynomialShapeFunctions(degree)
-        topology = Trimesh2DDiscontinuousSpaceTopology(mesh, shape)
+def make_trimesh_2d_space_topology(mesh: Trimesh2D, shape: ShapeFunction):  # select the topology matching `shape`
+    if isinstance(shape, Triangle2DPolynomialShapeFunctions):  # the only shape family handled here
+        return forward_base_topology(Trimesh2DPolynomialSpaceTopology, mesh, shape)
 
-        super().__init__(topology, shape)
+    raise ValueError(f"Unsupported shape function {shape.name}")  # any other shape type is rejected
diff --git a/warp/tests/test_examples.py b/warp/tests/test_examples.py
index 921306559..153847ad6 100644
--- a/warp/tests/test_examples.py
+++ b/warp/tests/test_examples.py
@@ -333,13 +333,14 @@ class TestFemDiffusionExamples(unittest.TestCase):
     test_options={"headless": True},
 )
 
-# The following examples do not need CUDA
 add_example_test(
     TestFemExamples,
     name="fem.example_apic_fluid",
-    devices=test_devices,
-    test_options={"num_frames": 1, "res": "16,16,16"},
+    devices=get_selected_cuda_test_devices(),
+    test_options={"num_frames": 5, "voxel_size": 2.0},
 )
+
+# The following examples do not need CUDA
 add_example_test(
     TestFemDiffusionExamples,
     name="fem.example_diffusion",
@@ -363,9 +364,9 @@ class TestFemDiffusionExamples(unittest.TestCase):
 )
 add_example_test(
     TestFemExamples,
-    name="fem.example_convection_diffusion_dg0",
+    name="fem.example_burgers",
     devices=test_devices,
-    test_options={"resolution": 20, "num_frames": 25, "mesh": "quad", "headless": True},
+    test_options={"resolution": 20, "num_frames": 25, "degree": 1, "headless": True},
 )
 add_example_test(
     TestFemExamples,
diff --git a/warp/tests/test_fem.py b/warp/tests/test_fem.py
index f93d13413..aa48a8626 100644
--- a/warp/tests/test_fem.py
+++ b/warp/tests/test_fem.py
@@ -433,13 +433,13 @@ def test_geo_sides_kernel(
         wp.expect_near(coords, inner_side_coords, 0.0001)
         wp.expect_near(coords, outer_side_coords, 0.0001)
 
-        vol = geo.side_measure(side_arg, s)
-        wp.atomic_add(side_measures, side_index, vol * qp_weights[q])
+        area = geo.side_measure(side_arg, s)
+        wp.atomic_add(side_measures, side_index, area * qp_weights[q])
 
         # test consistency of side normal, measure, and deformation gradient
         F = geo.side_deformation_gradient(side_arg, s)
         F_det = DeformedGeometry._side_measure(F)
-        wp.expect_near(F_det * REF_MEASURE, vol)
+        wp.expect_near(F_det * REF_MEASURE, area)
 
         nor = geo.side_normal(side_arg, s)
         F_cross = DeformedGeometry._side_normal(F)
@@ -587,6 +587,28 @@ def test_hex_mesh(test, device):
     assert_np_equal(cell_measures.numpy(), np.full(cell_measures.shape, 1.0 / (N**3)), tol=1.0e-4)
 
 
+def test_nanogrid(test, device):  # checks fem.Nanogrid element counts and measures on a tile-allocated volume
+    N = 8  # cells per axis that the assertions below expect; also sets voxel_size = 1/N
+
+    points = wp.array([[0.5, 0.5, 0.5]], dtype=float, device=device)  # a single tile seed point
+    volume = wp.Volume.allocate_by_tiles(
+        tile_points=points, voxel_size=1.0 / N, translation=(0.0, 0.0, 0.0), bg_value=None, device=device
+    )  # NOTE(review): bg_value=None presumably yields an index grid; confirm in warp.Volume docs
+
+    geo = fem.Nanogrid(volume)
+
+    test.assertEqual(geo.cell_count(), (N) ** 3)  # expected counts are those of an N x N x N block of cells
+    test.assertEqual(geo.vertex_count(), (N + 1) ** 3)  # N + 1 vertices along each axis
+    test.assertEqual(geo.side_count(), 3 * (N + 1) * N**2)  # N^2 faces on each of N + 1 planes, per axis
+    test.assertEqual(geo.boundary_side_count(), 6 * N * N)  # N^2 faces on each of the 6 outer faces
+    test.assertEqual(geo.edge_count(), 3 * N * (N + 1) ** 2)  # N edges on each of (N + 1)^2 lines, per axis
+
+    side_measures, cell_measures = _launch_test_geometry_kernel(geo, device)
+    # Every side is expected to measure (1/N)^2 and every cell (1/N)^3, consistent with voxel_size = 1/N
+    assert_np_equal(side_measures.numpy(), np.full(side_measures.shape, 1.0 / (N**2)), tol=1.0e-4)
+    assert_np_equal(cell_measures.numpy(), np.full(cell_measures.shape, 1.0 / (N**3)), tol=1.0e-4)
+
+
 @integrand
 def _rigid_deformation_field(s: Sample, domain: Domain, translation: wp.vec3, rotation: wp.vec3, scale: float):
     q = wp.quat_from_axis_angle(wp.normalize(rotation), wp.length(rotation))
@@ -1234,6 +1256,7 @@ def test_particle_quadratures(test, device):
 
 
 devices = get_test_devices()
+cuda_devices = get_selected_cuda_test_devices()
 
 
 class TestFem(unittest.TestCase):
@@ -1253,6 +1276,7 @@ class TestFem(unittest.TestCase):
 add_function_test(TestFem, "test_grid_3d", test_grid_3d, devices=devices)
 add_function_test(TestFem, "test_tet_mesh", test_tet_mesh, devices=devices)
 add_function_test(TestFem, "test_hex_mesh", test_hex_mesh, devices=devices)
+add_function_test(TestFem, "test_nanogrid", test_nanogrid, devices=cuda_devices)  # registered for CUDA devices only
 add_function_test(TestFem, "test_deformed_geometry", test_deformed_geometry, devices=devices)
 add_function_test(TestFem, "test_dof_mapper", test_dof_mapper)
 add_function_test(TestFem, "test_point_basis", test_point_basis)