diff --git a/reproject/array_utils.py b/reproject/array_utils.py
index ec3a39a2e..1d5aae562 100644
--- a/reproject/array_utils.py
+++ b/reproject/array_utils.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-__all__ = ["map_coordinates"]
+__all__ = ["map_coordinates", "sample_array_edges"]
 
 
 def map_coordinates(image, coords, **kwargs):
@@ -35,3 +35,22 @@ def map_coordinates(image, coords, **kwargs):
     values[reset] = kwargs.get("cval", 0.0)
 
     return values
+
+
+def sample_array_edges(shape, *, n_samples):
+    # Given an N-dimensional array shape, sample each edge of the array using
+    # n_samples points per edge (which will include vertices). For each dimension
+    # we sample along it while the remaining coordinates sit on every corner
+    # combination (bits of ``vertex`` pick -0.5 or size - 0.5); duplicates are dropped.
+    # Returns an array of shape (ndim, n_points), rows ordered like ``shape``.
+    all_positions = []
+    ndim = len(shape)
+    shape = np.array(shape)
+    for idim in range(ndim):
+        for vertex in range(2**ndim):
+            positions = -0.5 + shape * ((vertex & (2 ** np.arange(ndim))) > 0).astype(int)
+            positions = np.broadcast_to(positions, (n_samples, ndim)).copy()
+            positions[:, idim] = np.linspace(-0.5, shape[idim] - 0.5, n_samples)
+            all_positions.append(positions)
+    positions = np.unique(np.vstack(all_positions), axis=0).T
+    return positions
diff --git a/reproject/mosaicking/coadd.py b/reproject/mosaicking/coadd.py
index 8de178e35..bbeb652d9 100644
--- a/reproject/mosaicking/coadd.py
+++ b/reproject/mosaicking/coadd.py
@@ -4,6 +4,7 @@
 from astropy.wcs import WCS
 from astropy.wcs.wcsapi import SlicedLowLevelWCS
 
+from ..array_utils import sample_array_edges
 from ..utils import parse_input_data, parse_input_weights, parse_output_projection
 from .background import determine_offset_matrix, solve_corrections_sgd
 from .subset_array import ReprojectedArraySubset
@@ -11,6 +12,10 @@
 __all__ = ["reproject_and_coadd"]
 
 
+def _noop(iterable):
+    return iterable  # pass-through used in place of a progress bar when none is supplied
+
+
 def reproject_and_coadd(
     input_data,
     output_projection,
@@ -24,14 +29,15 @@ def reproject_and_coadd(
     background_reference=None,
     output_array=None,
     output_footprint=None,
+    block_sizes=None,
+    progress_bar=None,
+    blank_pixel_value=0,
     **kwargs,
 ):
     """
-    Given a set of input images, reproject and co-add these to a single
+    Given a set of input data, reproject and co-add these to a single
     final image.
 
-    This currently only works with 2-d images with celestial WCS.
-
     Parameters
     ----------
     input_data : iterable
@@ -77,7 +83,7 @@ def reproject_and_coadd(
         `~astropy.io.fits.HDUList` instance, specifies the HDU to use.
     reproject_function : callable
         The function to use for the reprojection.
-    combine_function : { 'mean', 'sum', 'median', 'first', 'last', 'min', 'max' }
+    combine_function : { 'mean', 'sum', 'first', 'last', 'min', 'max' }
         The type of function to use for combining the values into the final
         image. For 'first' and 'last', respectively, the reprojected images are
         simply overlaid on top of each other. With respect to the order of the
@@ -92,11 +98,22 @@ def reproject_and_coadd(
     output_array : array or None
         The final output array.  Specify this if you already have an
         appropriately-shaped array to store the data in.  Must match shape
-        specified with ``shape_out`` or derived from the output projection.
+        specified with ``shape_out`` or derived from the output
+        projection.
     output_footprint : array or None
         The final output footprint array.  Specify this if you already have an
         appropriately-shaped array to store the data in.  Must match shape
-        specified with ``shape_out`` or derived from the output projection.
+        specified with ``shape_out`` or derived from the output projection.
+    block_sizes : list of tuples or None
+        The block size to use for each dataset.  Could also be a single tuple
+        if you want the same block size for all data sets.
+    progress_bar : callable, optional
+        If specified, use this as a progress_bar to track loop iterations over
+        data sets.
+    blank_pixel_value : float, optional
+        Value to use for areas of the resulting mosaic that do not have input
+        data.
+
     **kwargs
         Keyword arguments to be passed to the reprojection function.
 
@@ -116,34 +133,49 @@ def reproject_and_coadd(
 
     # Validate inputs
 
-    if combine_function not in ("mean", "sum", "median", "first", "last", "min", "max"):
-        raise ValueError("combine_function should be one of mean/sum/median/first/last/min/max")
+    if combine_function not in ("mean", "sum", "first", "last", "min", "max"):
+        raise ValueError("combine_function should be one of mean/sum/first/last/min/max")
 
     if reproject_function is None:
         raise ValueError(
             "reprojection function should be specified with the reproject_function argument"
         )
 
+    if progress_bar is None:
+        progress_bar = _noop
+
     # Parse the output projection to avoid having to do it for each
 
     wcs_out, shape_out = parse_output_projection(output_projection, shape_out=shape_out)
 
-    if output_array is not None and output_array.shape != shape_out:
+    if output_array is None:
+        output_array = np.zeros(shape_out)
+    elif output_array.shape != shape_out:
         raise ValueError(
             "If you specify an output array, it must have a shape matching "
             f"the output shape {shape_out}"
         )
-    if output_footprint is not None and output_footprint.shape != shape_out:
+
+    if output_footprint is None:
+        output_footprint = np.zeros(shape_out)
+    elif output_footprint.shape != shape_out:
         raise ValueError(
             "If you specify an output footprint array, it must have a shape matching "
             f"the output shape {shape_out}"
         )
 
+    # Define 'on-the-fly' mode: in the case where we don't need to match
+    # the backgrounds and we are combining with 'mean' or 'sum', we don't
+    # have to keep track of the intermediate arrays and can just modify
+    # the output array on-the-fly
+    on_the_fly = not match_background and combine_function in ("mean", "sum")
+
     # Start off by reprojecting individual images to the final projection
 
-    arrays = []
+    if not on_the_fly:
+        arrays = []
 
-    for idata in range(len(input_data)):
+    for idata in progress_bar(range(len(input_data))):
         # We need to pre-parse the data here since we need to figure out how to
         # optimize/minimize the size of each output tile (see below).
         array_in, wcs_in = parse_input_data(input_data[idata], hdu_in=hdu_in)
@@ -166,13 +198,9 @@ def reproject_and_coadd(
         # significant distortion (when the edges of the input image become
         # convex in the output projection), and transforming every edge pixel,
         # which provides a lot of redundant information.
-        ny, nx = array_in.shape
-        n_per_edge = 11
-        xs = np.linspace(-0.5, nx - 0.5, n_per_edge)
-        ys = np.linspace(-0.5, ny - 0.5, n_per_edge)
-        xs = np.concatenate((xs, np.full(n_per_edge, xs[-1]), xs, np.full(n_per_edge, xs[0])))
-        ys = np.concatenate((np.full(n_per_edge, ys[0]), ys, np.full(n_per_edge, ys[-1]), ys))
-        xc_out, yc_out = wcs_out.world_to_pixel(wcs_in.pixel_to_world(xs, ys))
+
+        edges = sample_array_edges(array_in.shape, n_samples=11)[::-1]
+        edges_out = wcs_out.world_to_pixel(wcs_in.pixel_to_world(*edges))[::-1]
 
         # Determine the cutout parameters
 
@@ -180,28 +208,38 @@ def reproject_and_coadd(
         # such as all-sky images or full solar disk views. In this case we skip
         # this step and just use the full output WCS for reprojection.
 
-        if np.any(np.isnan(xc_out)) or np.any(np.isnan(yc_out)):
-            imin = 0
-            imax = shape_out[1]
-            jmin = 0
-            jmax = shape_out[0]
-        else:
-            imin = max(0, int(np.floor(xc_out.min() + 0.5)))
-            imax = min(shape_out[1], int(np.ceil(xc_out.max() + 0.5)))
-            jmin = max(0, int(np.floor(yc_out.min() + 0.5)))
-            jmax = min(shape_out[0], int(np.ceil(yc_out.max() + 0.5)))
+        ndim_out = len(shape_out)
 
-        if imax < imin or jmax < jmin:
+        skip_data = False
+        if np.any(np.isnan(edges_out)):
+            bounds = list(zip([0] * ndim_out, shape_out))
+        else:
+            bounds = []
+            for idim in range(ndim_out):
+                imin = max(0, int(np.floor(edges_out[idim].min() + 0.5)))
+                imax = min(shape_out[idim], int(np.ceil(edges_out[idim].max() + 0.5)))
+                bounds.append((imin, imax))
+                if imax < imin:
+                    skip_data = True
+                    break
+
+        if skip_data:
             continue
 
+        slice_out = tuple([slice(imin, imax) for (imin, imax) in bounds])
+
         if isinstance(wcs_out, WCS):
-            wcs_out_indiv = wcs_out[jmin:jmax, imin:imax]
+            wcs_out_indiv = wcs_out[slice_out]
         else:
-            wcs_out_indiv = SlicedLowLevelWCS(
-                wcs_out.low_level_wcs, (slice(jmin, jmax), slice(imin, imax))
-            )
+            wcs_out_indiv = SlicedLowLevelWCS(wcs_out.low_level_wcs, slice_out)
 
-        shape_out_indiv = (jmax - jmin, imax - imin)
+        shape_out_indiv = [imax - imin for (imin, imax) in bounds]
+
+        if block_sizes is not None:
+            # A single shared block size has scalar items, so compare shapes instead of len()
+            per_dataset = len(block_sizes) == len(input_data)
+            per_dataset = per_dataset and np.shape(block_sizes[idata]) == (len(shape_out),)
+            kwargs["block_size"] = block_sizes[idata] if per_dataset else block_sizes
 
         # TODO: optimize handling of weights by making reprojection functions
         # able to handle weights, and make the footprint become the combined
@@ -235,12 +273,20 @@ def reproject_and_coadd(
             weights[reset] = 0.0
             footprint *= weights
 
-        array = ReprojectedArraySubset(array, footprint, imin, imax, jmin, jmax)
+        array = ReprojectedArraySubset(array, footprint, bounds)
 
         # TODO: make sure we gracefully handle the case where the
         # output image is empty (due e.g. to no overlap).
 
-        arrays.append(array)
+        if on_the_fly:
+            # By default, values outside of the footprint are set to NaN
+            # but we set these to 0 here to avoid getting NaNs in the
+            # means/sums.
+            array.array[array.footprint == 0] = 0
+            output_array[array.view_in_original_array] += array.array * array.footprint
+            output_footprint[array.view_in_original_array] += array.footprint
+        else:
+            arrays.append(array)
 
     # If requested, try and match the backgrounds.
     if match_background and len(arrays) > 1:
@@ -251,37 +297,32 @@ def reproject_and_coadd(
         for array, correction in zip(arrays, corrections, strict=True):
             array.array -= correction
 
-    # At this point, the images are now ready to be co-added.
-
-    if output_array is None:
-        output_array = np.zeros(shape_out)
-    if output_footprint is None:
-        output_footprint = np.zeros(shape_out)
-
-    if combine_function == "min":
-        output_array[...] = np.inf
-    elif combine_function == "max":
-        output_array[...] = -np.inf
-
     if combine_function in ("mean", "sum"):
-        for array in arrays:
-            # By default, values outside of the footprint are set to NaN
-            # but we set these to 0 here to avoid getting NaNs in the
-            # means/sums.
-            array.array[array.footprint == 0] = 0
+        if match_background:
+            # if we're not matching the background, this part has already been done
+            for array in arrays:
+                # By default, values outside of the footprint are set to NaN
+                # but we set these to 0 here to avoid getting NaNs in the
+                # means/sums.
+                array.array[array.footprint == 0] = 0
 
-            output_array[array.view_in_original_array] += array.array * array.footprint
-            output_footprint[array.view_in_original_array] += array.footprint
+                output_array[array.view_in_original_array] += array.array * array.footprint
+                output_footprint[array.view_in_original_array] += array.footprint
 
         if combine_function == "mean":
             with np.errstate(invalid="ignore"):
                 output_array /= output_footprint
-                output_array[output_footprint == 0] = 0
+                output_array[output_footprint == 0] = blank_pixel_value
 
     elif combine_function in ("first", "last", "min", "max"):
+        if combine_function == "min":
+            output_array[...] = np.inf
+        elif combine_function == "max":
+            output_array[...] = -np.inf
+
         for array in arrays:
             if combine_function == "first":
-                mask = (output_footprint[array.view_in_original_array] == 0) & (array.footprint > 0)
+                mask = output_footprint[array.view_in_original_array] == 0
             elif combine_function == "last":
                 mask = array.footprint > 0
             elif combine_function == "min":
@@ -300,13 +341,6 @@ def reproject_and_coadd(
                 mask, array.array, output_array[array.view_in_original_array]
             )
 
-    elif combine_function == "median":
-        # Here we need to operate in chunks since we could otherwise run
-        # into memory issues
-
-        raise NotImplementedError("combine_function='median' is not yet implemented")
-
-    if combine_function in ("min", "max"):
-        output_array[output_footprint == 0] = 0.0
+    output_array[output_footprint == 0] = blank_pixel_value
 
     return output_array, output_footprint
diff --git a/reproject/mosaicking/subset_array.py b/reproject/mosaicking/subset_array.py
index 37bde7ebb..010114e0a 100644
--- a/reproject/mosaicking/subset_array.py
+++ b/reproject/mosaicking/subset_array.py
@@ -15,35 +15,36 @@ class ReprojectedArraySubset:
     # rather than the center, which is not well defined for even-sized
     # cutouts.
 
-    def __init__(self, array, footprint, imin, imax, jmin, jmax):
+    def __init__(self, array, footprint, bounds):
         self.array = array
         self.footprint = footprint
-        self.imin = imin
-        self.imax = imax
-        self.jmin = jmin
-        self.jmax = jmax
+        self.bounds = bounds
 
     def __repr__(self):
-        return f"<ReprojectedArraySubset at [{self.jmin}:{self.jmax},{self.imin}:{self.imax}]>"
+        bounds_str = "[" + ",".join(f"{imin}:{imax}" for (imin, imax) in self.bounds) + "]"
+        return f"<ReprojectedArraySubset at {bounds_str}>"
 
     @property
     def view_in_original_array(self):
-        return (slice(self.jmin, self.jmax), slice(self.imin, self.imax))
+        return tuple([slice(imin, imax) for (imin, imax) in self.bounds])
 
     @property
     def shape(self):
-        return (self.jmax - self.jmin, self.imax - self.imin)
+        return tuple((imax - imin) for (imin, imax) in self.bounds)
 
     def overlaps(self, other):
         # Note that the use of <= or >= instead of < and > is due to
         # the fact that the max values are exclusive (so +1 above the
         # last value).
-        return not (
-            self.imax <= other.imin
-            or other.imax <= self.imin
-            or self.jmax <= other.jmin
-            or other.jmax <= self.jmin
-        )
+        if len(self.bounds) != len(other.bounds):
+            raise ValueError(
+                f"Mismatch in number of dimensions, expected "
+                f"{len(self.bounds)} dimensions and got {len(other.bounds)}"
+            )
+        for (imin, imax), (imin_other, imax_other) in zip(self.bounds, other.bounds, strict=False):
+            if imax <= imin_other or imax_other <= imin:
+                return False
+        return True
 
     def __add__(self, other):
         return self._operation(other, operator.add)
@@ -58,42 +59,39 @@ def __truediv__(self, other):
         return self._operation(other, operator.truediv)
 
     def _operation(self, other, op):
+        if len(self.bounds) != len(other.bounds):
+            raise ValueError(
+                f"Mismatch in number of dimensions, expected "
+                f"{len(self.bounds)} dimensions and got {len(other.bounds)}"
+            )
+
         # Determine cutout parameters for overlap region
 
-        imin = max(self.imin, other.imin)
-        imax = min(self.imax, other.imax)
-        jmin = max(self.jmin, other.jmin)
-        jmax = min(self.jmax, other.jmax)
-
-        if imax < imin:
-            imax = imin
-
-        if jmax < jmin:
-            jmax = jmin
-
-        # Extract cutout from each
-
-        self_array = self.array[
-            jmin - self.jmin : jmax - self.jmin,
-            imin - self.imin : imax - self.imin,
-        ]
-        self_footprint = self.footprint[
-            jmin - self.jmin : jmax - self.jmin,
-            imin - self.imin : imax - self.imin,
-        ]
-
-        other_array = other.array[
-            jmin - other.jmin : jmax - other.jmin,
-            imin - other.imin : imax - other.imin,
-        ]
-        other_footprint = other.footprint[
-            jmin - other.jmin : jmax - other.jmin,
-            imin - other.imin : imax - other.imin,
-        ]
+        overlap_bounds = []
+        self_slices = []
+        other_slices = []
+        for (imin, imax), (imin_other, imax_other) in zip(self.bounds, other.bounds, strict=False):
+            imin_overlap = max(imin, imin_other)
+            imax_overlap = min(imax, imax_other)
+            if imax_overlap < imin_overlap:
+                imax_overlap = imin_overlap
+            overlap_bounds.append((imin_overlap, imax_overlap))
+            self_slices.append(slice(imin_overlap - imin, imax_overlap - imin))
+            other_slices.append(slice(imin_overlap - imin_other, imax_overlap - imin_other))
+
+        self_slices = tuple(self_slices)
+
+        self_array = self.array[self_slices]
+        self_footprint = self.footprint[self_slices]
+
+        other_slices = tuple(other_slices)
+
+        other_array = other.array[other_slices]
+        other_footprint = other.footprint[other_slices]
 
         # Carry out operator and store result in ReprojectedArraySubset
 
         array = op(self_array, other_array)
         footprint = (self_footprint > 0) & (other_footprint > 0)
 
-        return ReprojectedArraySubset(array, footprint, imin, imax, jmin, jmax)
+        return ReprojectedArraySubset(array, footprint, overlap_bounds)
diff --git a/reproject/mosaicking/tests/test_coadd.py b/reproject/mosaicking/tests/test_coadd.py
index d9e3de683..04b7c271c 100644
--- a/reproject/mosaicking/tests/test_coadd.py
+++ b/reproject/mosaicking/tests/test_coadd.py
@@ -80,7 +80,6 @@ def test_coadd_no_overlap(self, combine_function, reproject_function):
 
         input_data = self._get_tiles(self._nonoverlapping_views)
 
-        input_data = [(self.array, self.wcs)]
         array, footprint = reproject_and_coadd(
             input_data,
             self.wcs,
diff --git a/reproject/mosaicking/tests/test_subset_array.py b/reproject/mosaicking/tests/test_subset_array.py
index 3ecde69f9..dc05ef76c 100644
--- a/reproject/mosaicking/tests/test_subset_array.py
+++ b/reproject/mosaicking/tests/test_subset_array.py
@@ -14,21 +14,35 @@ def setup_method(self, method):
         self.array1 = np.random.random((123, 87))
         self.array2 = np.random.random((123, 87))
         self.array3 = np.random.random((123, 87))
+        self.array4 = np.random.random((123, 87, 16))
 
         self.footprint1 = (self.array1 > 0.5).astype(int)
         self.footprint2 = (self.array2 > 0.5).astype(int)
         self.footprint3 = (self.array3 > 0.5).astype(int)
+        self.footprint4 = (self.array4 > 0.5).astype(int)
 
         self.subset1 = ReprojectedArraySubset(
-            self.array1[20:88, 34:40], self.footprint1[20:88, 34:40], 34, 40, 20, 88
+            self.array1[20:88, 34:40],
+            self.footprint1[20:88, 34:40],
+            [(20, 88), (34, 40)],
         )
 
         self.subset2 = ReprojectedArraySubset(
-            self.array2[50:123, 37:42], self.footprint2[50:123, 37:42], 37, 42, 50, 123
+            self.array2[50:123, 37:42],
+            self.footprint2[50:123, 37:42],
+            [(50, 123), (37, 42)],
         )
 
         self.subset3 = ReprojectedArraySubset(
-            self.array3[40:50, 11:19], self.footprint3[40:50, 11:19], 11, 19, 40, 50
+            self.array3[40:50, 11:19],
+            self.footprint3[40:50, 11:19],
+            [(40, 50), (11, 19)],
+        )
+
+        self.subset4 = ReprojectedArraySubset(
+            self.array4[30:35, 40:45, 1:4],
+            self.footprint4[30:35, 40:45, 1:4],
+            [(30, 35), (40, 45), (1, 4)],
         )
 
     def test_repr(self):
@@ -55,17 +69,23 @@ def test_overlaps(self):
     @pytest.mark.parametrize("op", [operator.add, operator.sub, operator.mul, operator.truediv])
     def test_arithmetic(self, op):
         subset = op(self.subset1, self.subset2)
-        assert subset.imin == 37
-        assert subset.imax == 40
-        assert subset.jmin == 50
-        assert subset.jmax == 88
+        assert subset.bounds == [(50, 88), (37, 40)]
         expected = op(self.array1[50:88, 37:40], self.array2[50:88, 37:40])
         assert_equal(subset.array, expected)
 
     def test_arithmetic_nooverlap(self):
         subset = self.subset1 - self.subset3
-        assert subset.imin == 34
-        assert subset.imax == 34
-        assert subset.jmin == 40
-        assert subset.jmax == 50
+        assert subset.bounds == [(40, 50), (34, 34)]
         assert subset.shape == (10, 0)
+
+    def test_overlaps_dimension_mismatch(self):
+        with pytest.raises(
+            ValueError, match=("Mismatch in number of dimensions, expected 2 dimensions and got 3")
+        ):
+            self.subset1.overlaps(self.subset4)
+
+    def test_arithmetic_dimension_mismatch(self):
+        with pytest.raises(
+            ValueError, match=("Mismatch in number of dimensions, expected 2 dimensions and got 3")
+        ):
+            self.subset1 - self.subset4
diff --git a/reproject/utils.py b/reproject/utils.py
index 3ff46555c..9f72eabbe 100644
--- a/reproject/utils.py
+++ b/reproject/utils.py
@@ -38,7 +38,10 @@ def _dask_to_numpy_memmap(dask_array, tmp_dir):
     with tempfile.TemporaryDirectory() as zarr_tmp:
         # First compute and store the dask array to zarr using whatever
         # the default scheduler is at this point
-        dask_array.to_zarr(zarr_tmp)
+        try:
+            dask_array.to_zarr(zarr_tmp)
+        except ValueError:
+            dask_array.rechunk().to_zarr(zarr_tmp)
 
         # Load the array back to dask
         zarr_array = da.from_zarr(zarr_tmp)