Support Zlib compression and shuffle filter for local storage #119

Merged · 20 commits · Jul 20, 2023
27 changes: 19 additions & 8 deletions activestorage/dummy_data.py
@@ -66,13 +66,21 @@ def make_validrange_ncdata(filename='test_validrange.nc', chunksize=(3, 3, 1), n
return make_ncdata(filename, chunksize, n, compression=None, valid_range=[-1.0, 1.2 * n ** 3])


def make_compressed_ncdata(filename='test_vanilla.nc', chunksize=(3, 3, 1), n=10, compression=None, shuffle=False):
"""
Make a compressed and optionally shuffled vanilla test dataset which is
three dimensional with indices and values that aid in testing data
extraction.
"""
return make_ncdata(filename, chunksize, n, compression=compression, shuffle=shuffle)


def make_vanilla_ncdata(filename='test_vanilla.nc', chunksize=(3, 3, 1), n=10):
"""
Make a vanilla test dataset which is three dimensional with indices and values that
aid in testing data extraction.
"""
r = make_ncdata(filename, chunksize, n, None, False)
return
return make_ncdata(filename, chunksize, n)


def make_ncdata(filename, chunksize, n, compression=None,
@@ -81,7 +89,8 @@ def make_ncdata(filename, chunksize, n, compression=None,
valid_range=None,
valid_min=None,
valid_max=None,
partially_missing_data=False):
partially_missing_data=False,
shuffle=False):
"""
If compression is required, it can be passed in via keyword
and is applied to all variables.
@@ -96,6 +105,8 @@ def make_ncdata(filename, chunksize, n, compression=None,
partially_missing_data = True makes half the data missing so we can
ensure we find some chunks which are all missing ... Can
only be used in combination with a missing value.

shuffle: if True, apply the HDF5 shuffle filter before compression.
"""
if partially_missing_data and not missing:
raise ValueError(f'Missing data value keyword provided and set to {missing} '
@@ -119,15 +130,15 @@ def make_holes(var, indices, attribute, value, dummy):
ydim = ds.createDimension("ydim", n)
zdim = ds.createDimension("zdim", n)

x = ds.createVariable("x","i4",("xdim",), fill_value=fillvalue, compression=compression)
y = ds.createVariable("y","i4",("ydim",), fill_value=fillvalue, compression=compression)
z = ds.createVariable("z","i4",("zdim",), fill_value=fillvalue, compression=compression)
x = ds.createVariable("x","i4",("xdim",), fill_value=fillvalue, compression=compression, shuffle=shuffle)
y = ds.createVariable("y","i4",("ydim",), fill_value=fillvalue, compression=compression, shuffle=shuffle)
z = ds.createVariable("z","i4",("zdim",), fill_value=fillvalue, compression=compression, shuffle=shuffle)

for a,s in zip([x, y, z],[1, n, n * n]):
a[:] = dd * s

dvar = ds.createVariable("data","f8", ("xdim","ydim","zdim"),
chunksizes=chunksize, compression=compression)
chunksizes=chunksize, compression=compression, shuffle=shuffle)
dvar[:] = data

nm1, nm2 = n - 1, n - 2
@@ -179,7 +190,7 @@ def make_holes(var, indices, attribute, value, dummy):
var = ds.variables['data']
print(f'\nCreated file "{filename}" with a variable called "data" with shape {var.shape} and chunking, compression {var.chunking()},{compression}\n')

return mindices, findices, vrindices, vm1indices, vm2indices
return ds, mindices, findices, vrindices, vm1indices, vm2indices


if __name__=="__main__":
29 changes: 24 additions & 5 deletions activestorage/storage.py
@@ -8,6 +8,8 @@ def reduce_chunk(rfile, offset, size, compression, filters, missing, dtype, shap

rfile - the actual file with the data
offset, size - where and what we want ...
compression - optional `numcodecs.abc.Codec` compression codec
Collaborator:

Do we really want to pass an actual codec instance? I'd rather have a complete separation of concerns and pass an identifier for a particular codec, just as is done in the NetCDF API ... after all, this is our API layer?

Author:

It's a fair point. I was quite surprised to see an actual codec instance being passed through, although it did make the implementation trivial.

Would it make sense to align the S3 active storage API parameters and the storage driver parameters used here?

Currently I'm using a lower-case string to describe the (de)compression algorithm. I need to check that no parameters are required for decompression with other algorithms. Filters may need parameters. Shuffle needs to know the data type size, although we do already have that info elsewhere.
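For illustration only, a minimal sketch of that identifier-based approach (the helper name `codec_from_name` and its signature are hypothetical, not part of this PR):

```python
from numcodecs import Shuffle, Zlib


def codec_from_name(name, dtype_size=8, level=1):
    """Hypothetical helper: map a lower-case identifier onto a numcodecs codec.

    dtype_size is only needed by the shuffle filter, which works on element
    boundaries; level only applies to zlib.
    """
    if name == "zlib":
        return Zlib(level=level)
    if name == "shuffle":
        return Shuffle(elementsize=dtype_size)
    raise ValueError(f"unsupported codec identifier: {name}")
```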

Collaborator:

It'd be good to know how other communities are handling this. How does Zarr do it? We should find out.

Author:

In terms of the Zarr Python API, it looks like numcodecs is the interface: https://zarr.readthedocs.io/en/stable/api/codecs.html

I believe it was originally part of zarr, then got extracted into a separate library.

Collaborator:

OK, that's not so helpful, I guess, as they have the entire block in memory client-side, so they can put it all in the stack. Our problem, of course, is that the client and server are not on the same stack, so we have to go through an API interface. Where we put that is of course a moot point.

Author:

I'd say that the main API to get right is that of the active storage server. It's JSON-based, so it can't accept a Python object, but in my current implementation I'm just grabbing the codec_id from the numcodecs Codec object, which happens to match my API.

If you want this reduce_chunk interface to form a driver API for this library, then it would also be important to keep it simple and stable, although currently all "drivers" are in-tree in this repo.

Is this something we should iterate on from here, with an issue in the backlog?
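For example (a sketch, not the PR's code), pulling the identifier off a numcodecs codec looks like:

```python
from numcodecs import Zlib

codec = Zlib(level=1)
# Every numcodecs codec carries a lower-case string identifier, which is
# what a JSON-based API can carry instead of the Python object itself.
payload = {"compression": codec.codec_id}  # {"compression": "zlib"}
```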

Author:

To follow up with a bit more info on Zarr, the compression algorithm, filters, missing data, etc. end up in the JSON file:

"b/.zarray": "{\"chunks\":[32],\"compressor\":{\"id\":\"zlib\",\"level\":1},\"dtype\":\"<f8\",\"fill_value\":9.969209968386869e+36,\"filters\":null,\"order\":\"C\",\"shape\":[32],\"zarr_format\":2}",

Each compression algorithm has an ID and a dict of parameters that are used to instantiate one of the Codec objects, with a registry interface that looks like it would accept the compressor dict: https://github.com/zarr-developers/numcodecs/blob/350cbd1eb6751cb32e66aa5a8bc4002077083a83/numcodecs/registry.py
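A small example of that registry interface (assuming numcodecs is installed); the dict mirrors the compressor entry in the .zarray document above:

```python
from numcodecs import get_codec

# The compressor dict from .zarray can be handed straight to the registry,
# which returns an instantiated codec (here numcodecs.zlib.Zlib(level=1)).
compressor = {"id": "zlib", "level": 1}
codec = get_codec(compressor)
print(codec.codec_id)  # "zlib"
```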

filters - optional list of `numcodecs.abc.Codec` filter codecs
dtype - likely float32 in most cases.
shape - will be a tuple, something like (3,3,1), this is the dimensionality of the
chunk itself
@@ -22,15 +24,12 @@ def reduce_chunk(rfile, offset, size, compression, filters, missing, dtype, shap

"""

if compression is not None:
raise NotImplementedError("Compression is not yet supported!")
if filters is not None:
raise NotImplementedError("Filters are not yet supported!")

#FIXME: for the moment, open the file every time ... we might want to do that, or not
with open(rfile,'rb') as open_file:
# get the data
chunk = read_block(open_file, offset, size)
# reverse any compression and filters
chunk = filter_pipeline(chunk, compression, filters)
# make it a numpy array of bytes
chunk = ensure_ndarray(chunk)
# convert to the appropriate data type
@@ -51,6 +50,26 @@ def reduce_chunk(rfile, offset, size, compression, filters, missing, dtype, shap
else:
return tmp, None


def filter_pipeline(chunk, compression, filters):
"""
Reverse any compression and filters applied to the chunk.

When a chunk is written, the filters are applied in order, then compression
is applied. For reading, we must reverse this pipeline.

:param chunk: possibly filtered and compressed bytes
:param compression: optional `numcodecs.abc.Codec` compression codec
:param filters: optional list of `numcodecs.abc.Codec` filter codecs
:returns: decompressed and defiltered chunk bytes
"""
if compression is not None:
chunk = compression.decode(chunk)
for filter in reversed(filters or []):
chunk = filter.decode(chunk)
return chunk


def remove_missing(data, missing):
"""
As we are using numpy, we can use a masked array, storage implementations
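To illustrate how the new filter_pipeline helper reverses the write-time pipeline, here is a round-trip sketch (assuming numcodecs is available and that filter_pipeline is imported from activestorage.storage as added above):

```python
import numpy as np
from numcodecs import Shuffle, Zlib

from activestorage.storage import filter_pipeline

data = np.arange(27, dtype="f8")
shuffle = Shuffle(elementsize=data.dtype.itemsize)
zlib = Zlib(level=1)

# Write path: the shuffle filter is applied first, then compression.
encoded = zlib.encode(shuffle.encode(data.tobytes()))

# Read path: filter_pipeline undoes both steps in the opposite order.
decoded = filter_pipeline(encoded, compression=zlib, filters=[shuffle])
restored = np.frombuffer(decoded, dtype="f8")
assert np.array_equal(restored, data)
```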
42 changes: 42 additions & 0 deletions tests/test_compression.py
@@ -0,0 +1,42 @@
import os
import pytest

from activestorage.active import Active
from activestorage.config import *
from activestorage.dummy_data import make_compressed_ncdata

import utils


def create_compressed_dataset(tmp_path: str, compression: str, shuffle: bool):
"""
Make a vanilla test dataset which is compressed and optionally shuffled.
"""
temp_file = str(tmp_path / "test_compression.nc")
test_data = make_compressed_ncdata(filename=temp_file, compression=compression, shuffle=shuffle)

# Sanity check that test data is compressed and filtered as expected.
test_data_filters = test_data[0].variables['data'].filters()
assert test_data_filters[compression]
assert test_data_filters['shuffle'] == shuffle

test_file = utils.write_to_storage(temp_file)
if USE_S3:
os.remove(temp_file)
return test_file


@pytest.mark.skipif(USE_S3, reason="Compression and filtering not supported in S3 yet")
@pytest.mark.parametrize('compression', ['zlib'])
@pytest.mark.parametrize('shuffle', [False, True])
def test_compression_and_filters(tmp_path: str, compression: str, shuffle: bool):
"""
Test use of datasets with compression and filters applied.
"""
test_file = create_compressed_dataset(tmp_path, compression, shuffle)

active = Active(test_file, 'data', utils.get_storage_type())
active._version = 1
active._method = "min"
result = active[0:2,4:6,7:9]
assert result == 740.0
20 changes: 0 additions & 20 deletions tests/unit/test_storage.py
@@ -21,26 +21,6 @@ def test_reduce_chunk():
assert rc[0] == -1
assert rc[1] == 15

# with compression
with pytest.raises(NotImplementedError) as exc:
rc = st.reduce_chunk(rfile, offset, size,
compression="Blosc", filters=None,
missing=[None, 2050, None, None],
dtype="i2", shape=(8, 8),
order="C", chunk_selection=slice(0, 2, 1),
method=np.max)
assert str(exc.value) == "Compression is not yet supported!"

# with filters
with pytest.raises(NotImplementedError) as exc:
rc = st.reduce_chunk(rfile, offset, size,
compression=None, filters="filters",
missing=[None, 2050, None, None],
dtype="i2", shape=(8, 8),
order="C", chunk_selection=slice(0, 2, 1),
method=np.max)
assert str(exc.value) == "Filters are not yet supported!"


def test_reduced_chunk_masked_data():
"""Test method with masked data."""
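The two NotImplementedError tests removed above are superseded by tests/test_compression.py; for unit-level coverage, a hypothetical replacement that feeds an actual numcodecs codec into reduce_chunk might look roughly like this (a sketch only — the missing-value and return conventions are assumed from the surrounding tests):

```python
import numpy as np
from numcodecs import Zlib

from activestorage import storage as st


def test_reduce_chunk_zlib(tmp_path):
    # Write a single zlib-compressed chunk of known values to a scratch file.
    data = np.arange(64, dtype="i2").reshape(8, 8)
    codec = Zlib(level=1)
    compressed = codec.encode(data.tobytes())
    rfile = tmp_path / "chunk.bin"
    rfile.write_bytes(compressed)

    # Reduce rows 0-1 of the chunk with np.max; their maximum value is 15.
    rc = st.reduce_chunk(str(rfile), 0, len(compressed),
                         compression=codec, filters=None,
                         missing=[None, None, None, None],
                         dtype="i2", shape=(8, 8),
                         order="C", chunk_selection=slice(0, 2, 1),
                         method=np.max)
    assert rc[0] == 15
```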